author	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-25 12:14:07 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-25 12:14:07 -0400
commit	798ce8f1cca29dcc3f4b55947f611f4ffb32ac2b (patch)
tree	15fba84ba4b930397c29fe562504f66211365699
parent	22e12bbc9bc38c6d0bd541d061a0f547596fc19d (diff)
parent	1547010e6e15a3f44f49381246421a1e19de526e (diff)
Merge branch 'for-2.6.40/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.40/core' of git://git.kernel.dk/linux-2.6-block: (40 commits)
  cfq-iosched: free cic_index if cfqd allocation fails
  cfq-iosched: remove unused 'group_changed' in cfq_service_tree_add()
  cfq-iosched: reduce bit operations in cfq_choose_req()
  cfq-iosched: algebraic simplification in cfq_prio_to_maxrq()
  blk-cgroup: Initialize ioc->cgroup_changed at ioc creation time
  block: move bd_set_size() above rescan_partitions() in __blkdev_get()
  block: call elv_bio_merged() when merged
  cfq-iosched: Make IO merge related stats per cpu
  cfq-iosched: Fix a memory leak of per cpu stats for root group
  backing-dev: Kill set but not used var in bdi_debug_stats_show()
  block: get rid of on-stack plugging debug checks
  blk-throttle: Make no throttling rule group processing lockless
  blk-cgroup: Make cgroup stat reset path blkg->lock free for dispatch stats
  blk-cgroup: Make 64bit per cpu stats safe on 32bit arch
  blk-throttle: Make dispatch stats per cpu
  blk-throttle: Free up a group only after one rcu grace period
  blk-throttle: Use helper function to add root throtl group to lists
  blk-throttle: Introduce a helper function to fill in device details
  blk-throttle: Dynamically allocate root group
  blk-cgroup: Allow sleeping while dynamically allocating a group
  ...
-rw-r--r--	Documentation/ABI/testing/sysfs-block	64
-rw-r--r--	block/blk-cgroup.c	200
-rw-r--r--	block/blk-cgroup.h	40
-rw-r--r--	block/blk-core.c	32
-rw-r--r--	block/blk-exec.c	2
-rw-r--r--	block/blk-flush.c	16
-rw-r--r--	block/blk-ioc.c	3
-rw-r--r--	block/blk-lib.c	82
-rw-r--r--	block/blk-settings.c	9
-rw-r--r--	block/blk-sysfs.c	3
-rw-r--r--	block/blk-throttle.c	313
-rw-r--r--	block/blk.h	23
-rw-r--r--	block/cfq-iosched.c	232
-rw-r--r--	block/elevator.c	11
-rw-r--r--	drivers/ata/libata-scsi.c	13
-rw-r--r--	drivers/block/paride/pcd.c	2
-rw-r--r--	drivers/cdrom/viocd.c	4
-rw-r--r--	drivers/ide/ide-cd.c	3
-rw-r--r--	drivers/scsi/sr.c	2
-rw-r--r--	fs/block_dev.c	17
-rw-r--r--	fs/partitions/check.c	8
-rw-r--r--	include/linux/blk_types.h	2
-rw-r--r--	include/linux/blkdev.h	15
-rw-r--r--	include/linux/genhd.h	2
-rw-r--r--	mm/backing-dev.c	4
25 files changed, 785 insertions, 317 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 4873c759d535..c1eb41cb9876 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -142,3 +142,67 @@ Description:
 		with the previous I/O request are enabled. When set to 2,
 		all merge tries are disabled. The default value is 0 -
 		which enables all types of merge tries.
+
+What:		/sys/block/<disk>/discard_alignment
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may
+		internally allocate space in units that are bigger than
+		the exported logical block size. The discard_alignment
+		parameter indicates how many bytes the beginning of the
+		device is offset from the internal allocation unit's
+		natural alignment.
+
+What:		/sys/block/<disk>/<partition>/discard_alignment
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may
+		internally allocate space in units that are bigger than
+		the exported logical block size. The discard_alignment
+		parameter indicates how many bytes the beginning of the
+		partition is offset from the internal allocation unit's
+		natural alignment.
+
+What:		/sys/block/<disk>/queue/discard_granularity
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may
+		internally allocate space using units that are bigger
+		than the logical block size. The discard_granularity
+		parameter indicates the size of the internal allocation
+		unit in bytes if reported by the device. Otherwise the
+		discard_granularity will be set to match the device's
+		physical block size. A discard_granularity of 0 means
+		that the device does not support discard functionality.
+
+What:		/sys/block/<disk>/queue/discard_max_bytes
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may have
+		internal limits on the number of bytes that can be
+		trimmed or unmapped in a single operation. Some storage
+		protocols also have inherent limits on the number of
+		blocks that can be described in a single command. The
+		discard_max_bytes parameter is set by the device driver
+		to the maximum number of bytes that can be discarded in
+		a single operation. Discard requests issued to the
+		device must not exceed this limit. A discard_max_bytes
+		value of 0 means that the device does not support
+		discard functionality.
+
+What:		/sys/block/<disk>/queue/discard_zeroes_data
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may return
+		stale or random data when a previously discarded block
+		is read back. This can cause problems if the filesystem
+		expects discarded blocks to be explicitly cleared. If a
+		device reports that it deterministically returns zeroes
+		when a discarded area is read the discard_zeroes_data
+		parameter will be set to one. Otherwise it will be 0 and
+		the result of reading a discarded area is undefined.
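
The four sysfs attributes documented above form a small user-visible ABI for discard support. As a minimal sketch (not part of the commit), the program below reads the queue-level attributes for an assumed disk name "sda"; read_u64() is a local helper, not a kernel or libc API.

/*
 * Sketch: probe the discard attributes documented above and report whether
 * discard is usable and whether discarded regions read back as zeroes.
 */
#include <stdio.h>

static unsigned long long read_u64(const char *path)
{
	unsigned long long v = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%llu", &v) != 1)
			v = 0;
		fclose(f);
	}
	return v;
}

int main(void)
{
	unsigned long long granularity =
		read_u64("/sys/block/sda/queue/discard_granularity");
	unsigned long long max_bytes =
		read_u64("/sys/block/sda/queue/discard_max_bytes");
	unsigned long long zeroes =
		read_u64("/sys/block/sda/queue/discard_zeroes_data");

	if (!granularity || !max_bytes)
		printf("sda: discard not supported\n");
	else
		printf("sda: discard unit %llu bytes, max %llu bytes per request, "
		       "reads after discard %s zeroed\n",
		       granularity, max_bytes, zeroes ? "are" : "are not");
	return 0;
}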
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 471fdcc5df85..07371cfdfae6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -385,25 +385,40 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
385 385
386 spin_lock_irqsave(&blkg->stats_lock, flags); 386 spin_lock_irqsave(&blkg->stats_lock, flags);
387 blkg->stats.time += time; 387 blkg->stats.time += time;
388#ifdef CONFIG_DEBUG_BLK_CGROUP
388 blkg->stats.unaccounted_time += unaccounted_time; 389 blkg->stats.unaccounted_time += unaccounted_time;
390#endif
389 spin_unlock_irqrestore(&blkg->stats_lock, flags); 391 spin_unlock_irqrestore(&blkg->stats_lock, flags);
390} 392}
391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); 393EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
392 394
395/*
396 * should be called under rcu read lock or queue lock to make sure blkg pointer
397 * is valid.
398 */
393void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 399void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
394 uint64_t bytes, bool direction, bool sync) 400 uint64_t bytes, bool direction, bool sync)
395{ 401{
396 struct blkio_group_stats *stats; 402 struct blkio_group_stats_cpu *stats_cpu;
397 unsigned long flags; 403 unsigned long flags;
398 404
399 spin_lock_irqsave(&blkg->stats_lock, flags); 405 /*
400 stats = &blkg->stats; 406 * Disabling interrupts to provide mutual exclusion between two
401 stats->sectors += bytes >> 9; 407 * writes on same cpu. It probably is not needed for 64bit. Not
402 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, 408 * optimizing that case yet.
403 sync); 409 */
404 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, 410 local_irq_save(flags);
405 direction, sync); 411
406 spin_unlock_irqrestore(&blkg->stats_lock, flags); 412 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
413
414 u64_stats_update_begin(&stats_cpu->syncp);
415 stats_cpu->sectors += bytes >> 9;
416 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
417 1, direction, sync);
418 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
419 bytes, direction, sync);
420 u64_stats_update_end(&stats_cpu->syncp);
421 local_irq_restore(flags);
407} 422}
408EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); 423EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
409 424
@@ -426,18 +441,44 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
426} 441}
427EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); 442EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
428 443
444/* Merged stats are per cpu. */
429void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, 445void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
430 bool sync) 446 bool sync)
431{ 447{
448 struct blkio_group_stats_cpu *stats_cpu;
432 unsigned long flags; 449 unsigned long flags;
433 450
434 spin_lock_irqsave(&blkg->stats_lock, flags); 451 /*
435 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, 452 * Disabling interrupts to provide mutual exclusion between two
436 sync); 453 * writes on same cpu. It probably is not needed for 64bit. Not
437 spin_unlock_irqrestore(&blkg->stats_lock, flags); 454 * optimizing that case yet.
455 */
456 local_irq_save(flags);
457
458 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
459
460 u64_stats_update_begin(&stats_cpu->syncp);
461 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
462 direction, sync);
463 u64_stats_update_end(&stats_cpu->syncp);
464 local_irq_restore(flags);
438} 465}
439EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 466EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
440 467
468/*
469 * This function allocates the per cpu stats for blkio_group. Should be called
470 * from sleepable context as alloc_per_cpu() requires that.
471 */
472int blkio_alloc_blkg_stats(struct blkio_group *blkg)
473{
474 /* Allocate memory for per cpu stats */
475 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
476 if (!blkg->stats_cpu)
477 return -ENOMEM;
478 return 0;
479}
480EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
481
441void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 482void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
442 struct blkio_group *blkg, void *key, dev_t dev, 483 struct blkio_group *blkg, void *key, dev_t dev,
443 enum blkio_policy_id plid) 484 enum blkio_policy_id plid)
@@ -508,6 +549,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
508} 549}
509EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 550EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
510 551
552static void blkio_reset_stats_cpu(struct blkio_group *blkg)
553{
554 struct blkio_group_stats_cpu *stats_cpu;
555 int i, j, k;
556 /*
557 * Note: On 64 bit arch this should not be an issue. This has the
558 * possibility of returning some inconsistent value on 32bit arch
559 * as 64bit update on 32bit is non atomic. Taking care of this
560 * corner case makes code very complicated, like sending IPIs to
561 * cpus, taking care of stats of offline cpus etc.
562 *
563 * reset stats is anyway more of a debug feature and this sounds a
564 * corner case. So I am not complicating the code yet until and
565 * unless this becomes a real issue.
566 */
567 for_each_possible_cpu(i) {
568 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
569 stats_cpu->sectors = 0;
570 for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
571 for (k = 0; k < BLKIO_STAT_TOTAL; k++)
572 stats_cpu->stat_arr_cpu[j][k] = 0;
573 }
574}
575
511static int 576static int
512blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 577blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
513{ 578{
@@ -552,7 +617,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
552 } 617 }
553#endif 618#endif
554 spin_unlock(&blkg->stats_lock); 619 spin_unlock(&blkg->stats_lock);
620
621 /* Reset Per cpu stats which don't take blkg->stats_lock */
622 blkio_reset_stats_cpu(blkg);
555 } 623 }
624
556 spin_unlock_irq(&blkcg->lock); 625 spin_unlock_irq(&blkcg->lock);
557 return 0; 626 return 0;
558} 627}
@@ -598,6 +667,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
598 return val; 667 return val;
599} 668}
600 669
670
671static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
672 enum stat_type_cpu type, enum stat_sub_type sub_type)
673{
674 int cpu;
675 struct blkio_group_stats_cpu *stats_cpu;
676 u64 val = 0, tval;
677
678 for_each_possible_cpu(cpu) {
679 unsigned int start;
680 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
681
682 do {
683 start = u64_stats_fetch_begin(&stats_cpu->syncp);
684 if (type == BLKIO_STAT_CPU_SECTORS)
685 tval = stats_cpu->sectors;
686 else
687 tval = stats_cpu->stat_arr_cpu[type][sub_type];
688 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
689
690 val += tval;
691 }
692
693 return val;
694}
695
696static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
697 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
698{
699 uint64_t disk_total, val;
700 char key_str[MAX_KEY_LEN];
701 enum stat_sub_type sub_type;
702
703 if (type == BLKIO_STAT_CPU_SECTORS) {
704 val = blkio_read_stat_cpu(blkg, type, 0);
705 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
706 }
707
708 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
709 sub_type++) {
710 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
711 val = blkio_read_stat_cpu(blkg, type, sub_type);
712 cb->fill(cb, key_str, val);
713 }
714
715 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
716 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
717
718 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
719 cb->fill(cb, key_str, disk_total);
720 return disk_total;
721}
722
601/* This should be called with blkg->stats_lock held */ 723/* This should be called with blkg->stats_lock held */
602static uint64_t blkio_get_stat(struct blkio_group *blkg, 724static uint64_t blkio_get_stat(struct blkio_group *blkg,
603 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) 725 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
@@ -609,9 +731,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
609 if (type == BLKIO_STAT_TIME) 731 if (type == BLKIO_STAT_TIME)
610 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 732 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
611 blkg->stats.time, cb, dev); 733 blkg->stats.time, cb, dev);
612 if (type == BLKIO_STAT_SECTORS)
613 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
614 blkg->stats.sectors, cb, dev);
615#ifdef CONFIG_DEBUG_BLK_CGROUP 734#ifdef CONFIG_DEBUG_BLK_CGROUP
616 if (type == BLKIO_STAT_UNACCOUNTED_TIME) 735 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
617 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 736 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
@@ -1075,8 +1194,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1075} 1194}
1076 1195
1077static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, 1196static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1078 struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, 1197 struct cftype *cft, struct cgroup_map_cb *cb,
1079 bool show_total) 1198 enum stat_type type, bool show_total, bool pcpu)
1080{ 1199{
1081 struct blkio_group *blkg; 1200 struct blkio_group *blkg;
1082 struct hlist_node *n; 1201 struct hlist_node *n;
@@ -1087,10 +1206,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1087 if (blkg->dev) { 1206 if (blkg->dev) {
1088 if (!cftype_blkg_same_policy(cft, blkg)) 1207 if (!cftype_blkg_same_policy(cft, blkg))
1089 continue; 1208 continue;
1090 spin_lock_irq(&blkg->stats_lock); 1209 if (pcpu)
1091 cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, 1210 cgroup_total += blkio_get_stat_cpu(blkg, cb,
1092 type); 1211 blkg->dev, type);
1093 spin_unlock_irq(&blkg->stats_lock); 1212 else {
1213 spin_lock_irq(&blkg->stats_lock);
1214 cgroup_total += blkio_get_stat(blkg, cb,
1215 blkg->dev, type);
1216 spin_unlock_irq(&blkg->stats_lock);
1217 }
1094 } 1218 }
1095 } 1219 }
1096 if (show_total) 1220 if (show_total)
@@ -1114,47 +1238,47 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1114 switch(name) { 1238 switch(name) {
1115 case BLKIO_PROP_time: 1239 case BLKIO_PROP_time:
1116 return blkio_read_blkg_stats(blkcg, cft, cb, 1240 return blkio_read_blkg_stats(blkcg, cft, cb,
1117 BLKIO_STAT_TIME, 0); 1241 BLKIO_STAT_TIME, 0, 0);
1118 case BLKIO_PROP_sectors: 1242 case BLKIO_PROP_sectors:
1119 return blkio_read_blkg_stats(blkcg, cft, cb, 1243 return blkio_read_blkg_stats(blkcg, cft, cb,
1120 BLKIO_STAT_SECTORS, 0); 1244 BLKIO_STAT_CPU_SECTORS, 0, 1);
1121 case BLKIO_PROP_io_service_bytes: 1245 case BLKIO_PROP_io_service_bytes:
1122 return blkio_read_blkg_stats(blkcg, cft, cb, 1246 return blkio_read_blkg_stats(blkcg, cft, cb,
1123 BLKIO_STAT_SERVICE_BYTES, 1); 1247 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1124 case BLKIO_PROP_io_serviced: 1248 case BLKIO_PROP_io_serviced:
1125 return blkio_read_blkg_stats(blkcg, cft, cb, 1249 return blkio_read_blkg_stats(blkcg, cft, cb,
1126 BLKIO_STAT_SERVICED, 1); 1250 BLKIO_STAT_CPU_SERVICED, 1, 1);
1127 case BLKIO_PROP_io_service_time: 1251 case BLKIO_PROP_io_service_time:
1128 return blkio_read_blkg_stats(blkcg, cft, cb, 1252 return blkio_read_blkg_stats(blkcg, cft, cb,
1129 BLKIO_STAT_SERVICE_TIME, 1); 1253 BLKIO_STAT_SERVICE_TIME, 1, 0);
1130 case BLKIO_PROP_io_wait_time: 1254 case BLKIO_PROP_io_wait_time:
1131 return blkio_read_blkg_stats(blkcg, cft, cb, 1255 return blkio_read_blkg_stats(blkcg, cft, cb,
1132 BLKIO_STAT_WAIT_TIME, 1); 1256 BLKIO_STAT_WAIT_TIME, 1, 0);
1133 case BLKIO_PROP_io_merged: 1257 case BLKIO_PROP_io_merged:
1134 return blkio_read_blkg_stats(blkcg, cft, cb, 1258 return blkio_read_blkg_stats(blkcg, cft, cb,
1135 BLKIO_STAT_MERGED, 1); 1259 BLKIO_STAT_CPU_MERGED, 1, 1);
1136 case BLKIO_PROP_io_queued: 1260 case BLKIO_PROP_io_queued:
1137 return blkio_read_blkg_stats(blkcg, cft, cb, 1261 return blkio_read_blkg_stats(blkcg, cft, cb,
1138 BLKIO_STAT_QUEUED, 1); 1262 BLKIO_STAT_QUEUED, 1, 0);
1139#ifdef CONFIG_DEBUG_BLK_CGROUP 1263#ifdef CONFIG_DEBUG_BLK_CGROUP
1140 case BLKIO_PROP_unaccounted_time: 1264 case BLKIO_PROP_unaccounted_time:
1141 return blkio_read_blkg_stats(blkcg, cft, cb, 1265 return blkio_read_blkg_stats(blkcg, cft, cb,
1142 BLKIO_STAT_UNACCOUNTED_TIME, 0); 1266 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1143 case BLKIO_PROP_dequeue: 1267 case BLKIO_PROP_dequeue:
1144 return blkio_read_blkg_stats(blkcg, cft, cb, 1268 return blkio_read_blkg_stats(blkcg, cft, cb,
1145 BLKIO_STAT_DEQUEUE, 0); 1269 BLKIO_STAT_DEQUEUE, 0, 0);
1146 case BLKIO_PROP_avg_queue_size: 1270 case BLKIO_PROP_avg_queue_size:
1147 return blkio_read_blkg_stats(blkcg, cft, cb, 1271 return blkio_read_blkg_stats(blkcg, cft, cb,
1148 BLKIO_STAT_AVG_QUEUE_SIZE, 0); 1272 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1149 case BLKIO_PROP_group_wait_time: 1273 case BLKIO_PROP_group_wait_time:
1150 return blkio_read_blkg_stats(blkcg, cft, cb, 1274 return blkio_read_blkg_stats(blkcg, cft, cb,
1151 BLKIO_STAT_GROUP_WAIT_TIME, 0); 1275 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1152 case BLKIO_PROP_idle_time: 1276 case BLKIO_PROP_idle_time:
1153 return blkio_read_blkg_stats(blkcg, cft, cb, 1277 return blkio_read_blkg_stats(blkcg, cft, cb,
1154 BLKIO_STAT_IDLE_TIME, 0); 1278 BLKIO_STAT_IDLE_TIME, 0, 0);
1155 case BLKIO_PROP_empty_time: 1279 case BLKIO_PROP_empty_time:
1156 return blkio_read_blkg_stats(blkcg, cft, cb, 1280 return blkio_read_blkg_stats(blkcg, cft, cb,
1157 BLKIO_STAT_EMPTY_TIME, 0); 1281 BLKIO_STAT_EMPTY_TIME, 0, 0);
1158#endif 1282#endif
1159 default: 1283 default:
1160 BUG(); 1284 BUG();
@@ -1164,10 +1288,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1164 switch(name){ 1288 switch(name){
1165 case BLKIO_THROTL_io_service_bytes: 1289 case BLKIO_THROTL_io_service_bytes:
1166 return blkio_read_blkg_stats(blkcg, cft, cb, 1290 return blkio_read_blkg_stats(blkcg, cft, cb,
1167 BLKIO_STAT_SERVICE_BYTES, 1); 1291 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1168 case BLKIO_THROTL_io_serviced: 1292 case BLKIO_THROTL_io_serviced:
1169 return blkio_read_blkg_stats(blkcg, cft, cb, 1293 return blkio_read_blkg_stats(blkcg, cft, cb,
1170 BLKIO_STAT_SERVICED, 1); 1294 BLKIO_STAT_CPU_SERVICED, 1, 1);
1171 default: 1295 default:
1172 BUG(); 1296 BUG();
1173 } 1297 }
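
The dispatch and merge statistics above move from blkg->stats_lock to per-cpu counters protected by u64_stats_sync: writers disable interrupts and bracket their 64-bit updates with u64_stats_update_begin()/u64_stats_update_end(), while readers in blkio_read_stat_cpu() sum every CPU's value inside a fetch_begin/retry loop so 32-bit machines never observe a torn 64-bit counter. A rough userspace analogue of that seqcount idea, with assumed names and not kernel code, looks like this:

/*
 * Sketch: writer bumps a sequence counter before and after updating a 64-bit
 * value; the reader retries until it sees an even, unchanged sequence.
 * On 64-bit kernels the real u64_stats_sync helpers compile away.
 */
#include <stdatomic.h>
#include <stdio.h>

struct stats {
	atomic_uint seq;		/* even = stable, odd = write in progress */
	unsigned long long sectors;
};

static void stats_add(struct stats *s, unsigned long long delta)
{
	atomic_fetch_add(&s->seq, 1);	/* "update_begin" */
	s->sectors += delta;
	atomic_fetch_add(&s->seq, 1);	/* "update_end" */
}

static unsigned long long stats_read(struct stats *s)
{
	unsigned int start;
	unsigned long long val;

	do {
		start = atomic_load(&s->seq);	/* "fetch_begin" */
		val = s->sectors;
	} while ((start & 1) || start != atomic_load(&s->seq));	/* "fetch_retry" */
	return val;
}

int main(void)
{
	struct stats s = { 0 };

	stats_add(&s, 8);	/* e.g. one 4KB bio = 8 sectors */
	stats_add(&s, 16);
	printf("sectors: %llu\n", stats_read(&s));
	return 0;
}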
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c774930cc206..a71d2904ffb9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h>
17 18
18enum blkio_policy_id { 19enum blkio_policy_id {
19 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ 20 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
@@ -36,22 +37,15 @@ enum stat_type {
36 * request completion for IOs doen by this cgroup. This may not be 37 * request completion for IOs doen by this cgroup. This may not be
37 * accurate when NCQ is turned on. */ 38 * accurate when NCQ is turned on. */
38 BLKIO_STAT_SERVICE_TIME = 0, 39 BLKIO_STAT_SERVICE_TIME = 0,
39 /* Total bytes transferred */
40 BLKIO_STAT_SERVICE_BYTES,
41 /* Total IOs serviced, post merge */
42 BLKIO_STAT_SERVICED,
43 /* Total time spent waiting in scheduler queue in ns */ 40 /* Total time spent waiting in scheduler queue in ns */
44 BLKIO_STAT_WAIT_TIME, 41 BLKIO_STAT_WAIT_TIME,
45 /* Number of IOs merged */
46 BLKIO_STAT_MERGED,
47 /* Number of IOs queued up */ 42 /* Number of IOs queued up */
48 BLKIO_STAT_QUEUED, 43 BLKIO_STAT_QUEUED,
49 /* All the single valued stats go below this */ 44 /* All the single valued stats go below this */
50 BLKIO_STAT_TIME, 45 BLKIO_STAT_TIME,
51 BLKIO_STAT_SECTORS, 46#ifdef CONFIG_DEBUG_BLK_CGROUP
52 /* Time not charged to this cgroup */ 47 /* Time not charged to this cgroup */
53 BLKIO_STAT_UNACCOUNTED_TIME, 48 BLKIO_STAT_UNACCOUNTED_TIME,
54#ifdef CONFIG_DEBUG_BLK_CGROUP
55 BLKIO_STAT_AVG_QUEUE_SIZE, 49 BLKIO_STAT_AVG_QUEUE_SIZE,
56 BLKIO_STAT_IDLE_TIME, 50 BLKIO_STAT_IDLE_TIME,
57 BLKIO_STAT_EMPTY_TIME, 51 BLKIO_STAT_EMPTY_TIME,
@@ -60,6 +54,18 @@ enum stat_type {
60#endif 54#endif
61}; 55};
62 56
57/* Per cpu stats */
58enum stat_type_cpu {
59 BLKIO_STAT_CPU_SECTORS,
60 /* Total bytes transferred */
61 BLKIO_STAT_CPU_SERVICE_BYTES,
62 /* Total IOs serviced, post merge */
63 BLKIO_STAT_CPU_SERVICED,
64 /* Number of IOs merged */
65 BLKIO_STAT_CPU_MERGED,
66 BLKIO_STAT_CPU_NR
67};
68
63enum stat_sub_type { 69enum stat_sub_type {
64 BLKIO_STAT_READ = 0, 70 BLKIO_STAT_READ = 0,
65 BLKIO_STAT_WRITE, 71 BLKIO_STAT_WRITE,
@@ -116,11 +122,11 @@ struct blkio_cgroup {
116struct blkio_group_stats { 122struct blkio_group_stats {
117 /* total disk time and nr sectors dispatched by this group */ 123 /* total disk time and nr sectors dispatched by this group */
118 uint64_t time; 124 uint64_t time;
119 uint64_t sectors;
120 /* Time not charged to this cgroup */
121 uint64_t unaccounted_time;
122 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; 125 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
123#ifdef CONFIG_DEBUG_BLK_CGROUP 126#ifdef CONFIG_DEBUG_BLK_CGROUP
127 /* Time not charged to this cgroup */
128 uint64_t unaccounted_time;
129
124 /* Sum of number of IOs queued across all samples */ 130 /* Sum of number of IOs queued across all samples */
125 uint64_t avg_queue_size_sum; 131 uint64_t avg_queue_size_sum;
126 /* Count of samples taken for average */ 132 /* Count of samples taken for average */
@@ -145,6 +151,13 @@ struct blkio_group_stats {
145#endif 151#endif
146}; 152};
147 153
154/* Per cpu blkio group stats */
155struct blkio_group_stats_cpu {
156 uint64_t sectors;
157 uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
158 struct u64_stats_sync syncp;
159};
160
148struct blkio_group { 161struct blkio_group {
149 /* An rcu protected unique identifier for the group */ 162 /* An rcu protected unique identifier for the group */
150 void *key; 163 void *key;
@@ -160,6 +173,8 @@ struct blkio_group {
160 /* Need to serialize the stats in the case of reset/update */ 173 /* Need to serialize the stats in the case of reset/update */
161 spinlock_t stats_lock; 174 spinlock_t stats_lock;
162 struct blkio_group_stats stats; 175 struct blkio_group_stats stats;
176 /* Per cpu stats pointer */
177 struct blkio_group_stats_cpu __percpu *stats_cpu;
163}; 178};
164 179
165struct blkio_policy_node { 180struct blkio_policy_node {
@@ -295,6 +310,7 @@ extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
295extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 310extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
296 struct blkio_group *blkg, void *key, dev_t dev, 311 struct blkio_group *blkg, void *key, dev_t dev,
297 enum blkio_policy_id plid); 312 enum blkio_policy_id plid);
313extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
298extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 314extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
299extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 315extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
300 void *key); 316 void *key);
@@ -322,6 +338,8 @@ static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
322 struct blkio_group *blkg, void *key, dev_t dev, 338 struct blkio_group *blkg, void *key, dev_t dev,
323 enum blkio_policy_id plid) {} 339 enum blkio_policy_id plid) {}
324 340
341static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
342
325static inline int 343static inline int
326blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 344blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
327 345
diff --git a/block/blk-core.c b/block/blk-core.c
index 3fe00a14822a..c8303e9d919d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -569,8 +569,6 @@ int blk_get_queue(struct request_queue *q)
569 569
570static inline void blk_free_request(struct request_queue *q, struct request *rq) 570static inline void blk_free_request(struct request_queue *q, struct request *rq)
571{ 571{
572 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
573
574 if (rq->cmd_flags & REQ_ELVPRIV) 572 if (rq->cmd_flags & REQ_ELVPRIV)
575 elv_put_request(q, rq); 573 elv_put_request(q, rq);
576 mempool_free(rq, q->rq.rq_pool); 574 mempool_free(rq, q->rq.rq_pool);
@@ -1110,14 +1108,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1110{ 1108{
1111 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1109 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1112 1110
1113 /*
1114 * Debug stuff, kill later
1115 */
1116 if (!rq_mergeable(req)) {
1117 blk_dump_rq_flags(req, "back");
1118 return false;
1119 }
1120
1121 if (!ll_back_merge_fn(q, req, bio)) 1111 if (!ll_back_merge_fn(q, req, bio))
1122 return false; 1112 return false;
1123 1113
@@ -1132,6 +1122,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1132 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1122 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1133 1123
1134 drive_stat_acct(req, 0); 1124 drive_stat_acct(req, 0);
1125 elv_bio_merged(q, req, bio);
1135 return true; 1126 return true;
1136} 1127}
1137 1128
@@ -1141,14 +1132,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1141 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1132 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1142 sector_t sector; 1133 sector_t sector;
1143 1134
1144 /*
1145 * Debug stuff, kill later
1146 */
1147 if (!rq_mergeable(req)) {
1148 blk_dump_rq_flags(req, "front");
1149 return false;
1150 }
1151
1152 if (!ll_front_merge_fn(q, req, bio)) 1135 if (!ll_front_merge_fn(q, req, bio))
1153 return false; 1136 return false;
1154 1137
@@ -1173,6 +1156,7 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1173 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1156 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1174 1157
1175 drive_stat_acct(req, 0); 1158 drive_stat_acct(req, 0);
1159 elv_bio_merged(q, req, bio);
1176 return true; 1160 return true;
1177} 1161}
1178 1162
@@ -1258,14 +1242,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1258 1242
1259 el_ret = elv_merge(q, &req, bio); 1243 el_ret = elv_merge(q, &req, bio);
1260 if (el_ret == ELEVATOR_BACK_MERGE) { 1244 if (el_ret == ELEVATOR_BACK_MERGE) {
1261 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1262 if (bio_attempt_back_merge(q, req, bio)) { 1245 if (bio_attempt_back_merge(q, req, bio)) {
1263 if (!attempt_back_merge(q, req)) 1246 if (!attempt_back_merge(q, req))
1264 elv_merged_request(q, req, el_ret); 1247 elv_merged_request(q, req, el_ret);
1265 goto out_unlock; 1248 goto out_unlock;
1266 } 1249 }
1267 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 1250 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1268 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1269 if (bio_attempt_front_merge(q, req, bio)) { 1251 if (bio_attempt_front_merge(q, req, bio)) {
1270 if (!attempt_front_merge(q, req)) 1252 if (!attempt_front_merge(q, req))
1271 elv_merged_request(q, req, el_ret); 1253 elv_merged_request(q, req, el_ret);
@@ -1320,10 +1302,6 @@ get_rq:
1320 if (__rq->q != q) 1302 if (__rq->q != q)
1321 plug->should_sort = 1; 1303 plug->should_sort = 1;
1322 } 1304 }
1323 /*
1324 * Debug flag, kill later
1325 */
1326 req->cmd_flags |= REQ_ON_PLUG;
1327 list_add_tail(&req->queuelist, &plug->list); 1305 list_add_tail(&req->queuelist, &plug->list);
1328 drive_stat_acct(req, 1); 1306 drive_stat_acct(req, 1);
1329 } else { 1307 } else {
@@ -1550,7 +1528,8 @@ static inline void __generic_make_request(struct bio *bio)
1550 goto end_io; 1528 goto end_io;
1551 } 1529 }
1552 1530
1553 blk_throtl_bio(q, &bio); 1531 if (blk_throtl_bio(q, &bio))
1532 goto end_io;
1554 1533
1555 /* 1534 /*
1556 * If bio = NULL, bio has been throttled and will be submitted 1535 * If bio = NULL, bio has been throttled and will be submitted
@@ -2748,7 +2727,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2748 while (!list_empty(&list)) { 2727 while (!list_empty(&list)) {
2749 rq = list_entry_rq(list.next); 2728 rq = list_entry_rq(list.next);
2750 list_del_init(&rq->queuelist); 2729 list_del_init(&rq->queuelist);
2751 BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
2752 BUG_ON(!rq->q); 2730 BUG_ON(!rq->q);
2753 if (rq->q != q) { 2731 if (rq->q != q) {
2754 /* 2732 /*
@@ -2760,8 +2738,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2760 depth = 0; 2738 depth = 0;
2761 spin_lock(q->queue_lock); 2739 spin_lock(q->queue_lock);
2762 } 2740 }
2763 rq->cmd_flags &= ~REQ_ON_PLUG;
2764
2765 /* 2741 /*
2766 * rq is already accounted, so use raw insert 2742 * rq is already accounted, so use raw insert
2767 */ 2743 */
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 81e31819a597..8a0e7ec056e7 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -56,7 +56,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
56 spin_lock_irq(q->queue_lock); 56 spin_lock_irq(q->queue_lock);
57 __elv_add_request(q, rq, where); 57 __elv_add_request(q, rq, where);
58 __blk_run_queue(q); 58 __blk_run_queue(q);
59 /* the queue is stopped so it won't be plugged+unplugged */ 59 /* the queue is stopped so it won't be run */
60 if (rq->cmd_type == REQ_TYPE_PM_RESUME) 60 if (rq->cmd_type == REQ_TYPE_PM_RESUME)
61 q->request_fn(q); 61 q->request_fn(q);
62 spin_unlock_irq(q->queue_lock); 62 spin_unlock_irq(q->queue_lock);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 6c9b5e189e62..bb21e4c36f70 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error)
212 } 212 }
213 213
214 /* 214 /*
215 * Moving a request silently to empty queue_head may stall the 215 * Kick the queue to avoid stall for two cases:
216 * queue. Kick the queue in those cases. This function is called 216 * 1. Moving a request silently to empty queue_head may stall the
217 * from request completion path and calling directly into 217 * queue.
218 * request_fn may confuse the driver. Always use kblockd. 218 * 2. When flush request is running in non-queueable queue, the
219 * queue is hold. Restart the queue after flush request is finished
220 * to avoid stall.
221 * This function is called from request completion path and calling
222 * directly into request_fn may confuse the driver. Always use
223 * kblockd.
219 */ 224 */
220 if (queued) 225 if (queued || q->flush_queue_delayed)
221 blk_run_queue_async(q); 226 blk_run_queue_async(q);
227 q->flush_queue_delayed = 0;
222} 228}
223 229
224/** 230/**
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index b791022beef3..c898049dafd5 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -96,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
96 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); 96 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
97 INIT_HLIST_HEAD(&ret->cic_list); 97 INIT_HLIST_HEAD(&ret->cic_list);
98 ret->ioc_data = NULL; 98 ret->ioc_data = NULL;
99#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
100 ret->cgroup_changed = 0;
101#endif
99 } 102 }
100 103
101 return ret; 104 return ret;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 25de73e4759b..78e627e2581d 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,17 +9,20 @@
9 9
10#include "blk.h" 10#include "blk.h"
11 11
12static void blkdev_discard_end_io(struct bio *bio, int err) 12struct bio_batch {
13{ 13 atomic_t done;
14 if (err) { 14 unsigned long flags;
15 if (err == -EOPNOTSUPP) 15 struct completion *wait;
16 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 16};
17 clear_bit(BIO_UPTODATE, &bio->bi_flags);
18 }
19 17
20 if (bio->bi_private) 18static void bio_batch_end_io(struct bio *bio, int err)
21 complete(bio->bi_private); 19{
20 struct bio_batch *bb = bio->bi_private;
22 21
22 if (err && (err != -EOPNOTSUPP))
23 clear_bit(BIO_UPTODATE, &bb->flags);
24 if (atomic_dec_and_test(&bb->done))
25 complete(bb->wait);
23 bio_put(bio); 26 bio_put(bio);
24} 27}
25 28
@@ -41,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
41 struct request_queue *q = bdev_get_queue(bdev); 44 struct request_queue *q = bdev_get_queue(bdev);
42 int type = REQ_WRITE | REQ_DISCARD; 45 int type = REQ_WRITE | REQ_DISCARD;
43 unsigned int max_discard_sectors; 46 unsigned int max_discard_sectors;
47 struct bio_batch bb;
44 struct bio *bio; 48 struct bio *bio;
45 int ret = 0; 49 int ret = 0;
46 50
@@ -67,7 +71,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
67 type |= REQ_SECURE; 71 type |= REQ_SECURE;
68 } 72 }
69 73
70 while (nr_sects && !ret) { 74 atomic_set(&bb.done, 1);
75 bb.flags = 1 << BIO_UPTODATE;
76 bb.wait = &wait;
77
78 while (nr_sects) {
71 bio = bio_alloc(gfp_mask, 1); 79 bio = bio_alloc(gfp_mask, 1);
72 if (!bio) { 80 if (!bio) {
73 ret = -ENOMEM; 81 ret = -ENOMEM;
@@ -75,9 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
75 } 83 }
76 84
77 bio->bi_sector = sector; 85 bio->bi_sector = sector;
78 bio->bi_end_io = blkdev_discard_end_io; 86 bio->bi_end_io = bio_batch_end_io;
79 bio->bi_bdev = bdev; 87 bio->bi_bdev = bdev;
80 bio->bi_private = &wait; 88 bio->bi_private = &bb;
81 89
82 if (nr_sects > max_discard_sectors) { 90 if (nr_sects > max_discard_sectors) {
83 bio->bi_size = max_discard_sectors << 9; 91 bio->bi_size = max_discard_sectors << 9;
@@ -88,45 +96,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
88 nr_sects = 0; 96 nr_sects = 0;
89 } 97 }
90 98
91 bio_get(bio); 99 atomic_inc(&bb.done);
92 submit_bio(type, bio); 100 submit_bio(type, bio);
101 }
93 102
103 /* Wait for bios in-flight */
104 if (!atomic_dec_and_test(&bb.done))
94 wait_for_completion(&wait); 105 wait_for_completion(&wait);
95 106
96 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 107 if (!test_bit(BIO_UPTODATE, &bb.flags))
97 ret = -EOPNOTSUPP; 108 ret = -EIO;
98 else if (!bio_flagged(bio, BIO_UPTODATE))
99 ret = -EIO;
100 bio_put(bio);
101 }
102 109
103 return ret; 110 return ret;
104} 111}
105EXPORT_SYMBOL(blkdev_issue_discard); 112EXPORT_SYMBOL(blkdev_issue_discard);
106 113
107struct bio_batch
108{
109 atomic_t done;
110 unsigned long flags;
111 struct completion *wait;
112};
113
114static void bio_batch_end_io(struct bio *bio, int err)
115{
116 struct bio_batch *bb = bio->bi_private;
117
118 if (err) {
119 if (err == -EOPNOTSUPP)
120 set_bit(BIO_EOPNOTSUPP, &bb->flags);
121 else
122 clear_bit(BIO_UPTODATE, &bb->flags);
123 }
124 if (bb)
125 if (atomic_dec_and_test(&bb->done))
126 complete(bb->wait);
127 bio_put(bio);
128}
129
130/** 114/**
131 * blkdev_issue_zeroout - generate number of zero filed write bios 115 * blkdev_issue_zeroout - generate number of zero filed write bios
132 * @bdev: blockdev to issue 116 * @bdev: blockdev to issue
@@ -151,7 +135,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
151 bb.flags = 1 << BIO_UPTODATE; 135 bb.flags = 1 << BIO_UPTODATE;
152 bb.wait = &wait; 136 bb.wait = &wait;
153 137
154submit:
155 ret = 0; 138 ret = 0;
156 while (nr_sects != 0) { 139 while (nr_sects != 0) {
157 bio = bio_alloc(gfp_mask, 140 bio = bio_alloc(gfp_mask,
@@ -168,9 +151,6 @@ submit:
168 151
169 while (nr_sects != 0) { 152 while (nr_sects != 0) {
170 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); 153 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
171 if (sz == 0)
172 /* bio has maximum size possible */
173 break;
174 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); 154 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
175 nr_sects -= ret >> 9; 155 nr_sects -= ret >> 9;
176 sector += ret >> 9; 156 sector += ret >> 9;
@@ -190,16 +170,6 @@ submit:
190 /* One of bios in the batch was completed with error.*/ 170 /* One of bios in the batch was completed with error.*/
191 ret = -EIO; 171 ret = -EIO;
192 172
193 if (ret)
194 goto out;
195
196 if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
197 ret = -EOPNOTSUPP;
198 goto out;
199 }
200 if (nr_sects != 0)
201 goto submit;
202out:
203 return ret; 173 return ret;
204} 174}
205EXPORT_SYMBOL(blkdev_issue_zeroout); 175EXPORT_SYMBOL(blkdev_issue_zeroout);
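
blkdev_issue_discard() now reuses the bio_batch scheme shown above: the done counter starts at 1 on behalf of the submitter, each submitted bio adds one, every completion drops one, and the submitter drops its own reference before deciding whether it still needs to wait. A small pthread-based sketch of the same counting idea, with assumed names and not kernel code, is shown below:

/*
 * Sketch: a batch counter that starts at 1 for the submitter, so completions
 * that finish before the submitter starts waiting cannot signal too early.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct batch {
	atomic_int done;
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

static void batch_complete(struct batch *b)
{
	if (atomic_fetch_sub(&b->done, 1) == 1) {	/* dropped count to zero */
		pthread_mutex_lock(&b->lock);
		pthread_cond_signal(&b->cond);
		pthread_mutex_unlock(&b->lock);
	}
}

static void *worker(void *arg)
{
	batch_complete(arg);	/* plays the role of bio_batch_end_io() */
	return NULL;
}

int main(void)
{
	struct batch b = { 1, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };
	pthread_t tid[4];
	int i;

	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&b.done, 1);	/* one reference per submitted unit */
		pthread_create(&tid[i], NULL, worker, &b);
	}

	pthread_mutex_lock(&b.lock);
	atomic_fetch_sub(&b.done, 1);		/* drop the submitter's reference */
	while (atomic_load(&b.done) > 0)	/* wait for in-flight work */
		pthread_cond_wait(&b.cond, &b.lock);
	pthread_mutex_unlock(&b.lock);

	for (i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	printf("all work completed\n");
	return 0;
}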
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1fa769293597..fa1eb0449a05 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim)
120 lim->discard_granularity = 0; 120 lim->discard_granularity = 0;
121 lim->discard_alignment = 0; 121 lim->discard_alignment = 0;
122 lim->discard_misaligned = 0; 122 lim->discard_misaligned = 0;
123 lim->discard_zeroes_data = -1; 123 lim->discard_zeroes_data = 1;
124 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 124 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
125 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 125 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
126 lim->alignment_offset = 0; 126 lim->alignment_offset = 0;
@@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
166 166
167 blk_set_default_limits(&q->limits); 167 blk_set_default_limits(&q->limits);
168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
169 q->limits.discard_zeroes_data = 0;
169 170
170 /* 171 /*
171 * by default assume old behaviour and bounce for any highmem page 172 * by default assume old behaviour and bounce for any highmem page
@@ -790,6 +791,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush)
790} 791}
791EXPORT_SYMBOL_GPL(blk_queue_flush); 792EXPORT_SYMBOL_GPL(blk_queue_flush);
792 793
794void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
795{
796 q->flush_not_queueable = !queueable;
797}
798EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
799
793static int __init blk_settings_init(void) 800static int __init blk_settings_init(void)
794{ 801{
795 blk_max_low_pfn = max_low_pfn - 1; 802 blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index bd236313f35d..d935bd859c87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
152 152
153static ssize_t queue_discard_max_show(struct request_queue *q, char *page) 153static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
154{ 154{
155 return queue_var_show(q->limits.max_discard_sectors << 9, page); 155 return sprintf(page, "%llu\n",
156 (unsigned long long)q->limits.max_discard_sectors << 9);
156} 157}
157 158
158static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) 159static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 252a81a306f7..a62be8d0dc1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -78,6 +78,8 @@ struct throtl_grp {
78 78
79 /* Some throttle limits got updated for the group */ 79 /* Some throttle limits got updated for the group */
80 int limits_changed; 80 int limits_changed;
81
82 struct rcu_head rcu_head;
81}; 83};
82 84
83struct throtl_data 85struct throtl_data
@@ -88,7 +90,7 @@ struct throtl_data
88 /* service tree for active throtl groups */ 90 /* service tree for active throtl groups */
89 struct throtl_rb_root tg_service_tree; 91 struct throtl_rb_root tg_service_tree;
90 92
91 struct throtl_grp root_tg; 93 struct throtl_grp *root_tg;
92 struct request_queue *queue; 94 struct request_queue *queue;
93 95
94 /* Total Number of queued bios on READ and WRITE lists */ 96 /* Total Number of queued bios on READ and WRITE lists */
@@ -151,56 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
151 return tg; 153 return tg;
152} 154}
153 155
154static void throtl_put_tg(struct throtl_grp *tg) 156static void throtl_free_tg(struct rcu_head *head)
155{ 157{
156 BUG_ON(atomic_read(&tg->ref) <= 0); 158 struct throtl_grp *tg;
157 if (!atomic_dec_and_test(&tg->ref)) 159
158 return; 160 tg = container_of(head, struct throtl_grp, rcu_head);
161 free_percpu(tg->blkg.stats_cpu);
159 kfree(tg); 162 kfree(tg);
160} 163}
161 164
162static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, 165static void throtl_put_tg(struct throtl_grp *tg)
163 struct blkio_cgroup *blkcg)
164{ 166{
165 struct throtl_grp *tg = NULL; 167 BUG_ON(atomic_read(&tg->ref) <= 0);
166 void *key = td; 168 if (!atomic_dec_and_test(&tg->ref))
167 struct backing_dev_info *bdi = &td->queue->backing_dev_info; 169 return;
168 unsigned int major, minor;
169 170
170 /* 171 /*
171 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix 172 * A group is freed in rcu manner. But having an rcu lock does not
172 * tree of blkg (instead of traversing through hash list all 173 * mean that one can access all the fields of blkg and assume these
173 * the time. 174 * are valid. For example, don't try to follow throtl_data and
175 * request queue links.
176 *
177 * Having a reference to blkg under an rcu allows acess to only
178 * values local to groups like group stats and group rate limits
174 */ 179 */
180 call_rcu(&tg->rcu_head, throtl_free_tg);
181}
175 182
176 /* 183static void throtl_init_group(struct throtl_grp *tg)
177 * This is the common case when there are no blkio cgroups. 184{
178 * Avoid lookup in this case
179 */
180 if (blkcg == &blkio_root_cgroup)
181 tg = &td->root_tg;
182 else
183 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
184
185 /* Fill in device details for root group */
186 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
187 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
188 tg->blkg.dev = MKDEV(major, minor);
189 goto done;
190 }
191
192 if (tg)
193 goto done;
194
195 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
196 if (!tg)
197 goto done;
198
199 INIT_HLIST_NODE(&tg->tg_node); 185 INIT_HLIST_NODE(&tg->tg_node);
200 RB_CLEAR_NODE(&tg->rb_node); 186 RB_CLEAR_NODE(&tg->rb_node);
201 bio_list_init(&tg->bio_lists[0]); 187 bio_list_init(&tg->bio_lists[0]);
202 bio_list_init(&tg->bio_lists[1]); 188 bio_list_init(&tg->bio_lists[1]);
203 td->limits_changed = false; 189 tg->limits_changed = false;
190
191 /* Practically unlimited BW */
192 tg->bps[0] = tg->bps[1] = -1;
193 tg->iops[0] = tg->iops[1] = -1;
204 194
205 /* 195 /*
206 * Take the initial reference that will be released on destroy 196 * Take the initial reference that will be released on destroy
@@ -209,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
209 * exit or cgroup deletion path depending on who is exiting first. 199 * exit or cgroup deletion path depending on who is exiting first.
210 */ 200 */
211 atomic_set(&tg->ref, 1); 201 atomic_set(&tg->ref, 1);
202}
203
204/* Should be called with rcu read lock held (needed for blkcg) */
205static void
206throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
207{
208 hlist_add_head(&tg->tg_node, &td->tg_list);
209 td->nr_undestroyed_grps++;
210}
211
212static void
213__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
214{
215 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
216 unsigned int major, minor;
217
218 if (!tg || tg->blkg.dev)
219 return;
220
221 /*
222 * Fill in device details for a group which might not have been
223 * filled at group creation time as queue was being instantiated
224 * and driver had not attached a device yet
225 */
226 if (bdi->dev && dev_name(bdi->dev)) {
227 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
228 tg->blkg.dev = MKDEV(major, minor);
229 }
230}
231
232/*
233 * Should be called with without queue lock held. Here queue lock will be
234 * taken rarely. It will be taken only once during life time of a group
235 * if need be
236 */
237static void
238throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
239{
240 if (!tg || tg->blkg.dev)
241 return;
242
243 spin_lock_irq(td->queue->queue_lock);
244 __throtl_tg_fill_dev_details(td, tg);
245 spin_unlock_irq(td->queue->queue_lock);
246}
247
248static void throtl_init_add_tg_lists(struct throtl_data *td,
249 struct throtl_grp *tg, struct blkio_cgroup *blkcg)
250{
251 __throtl_tg_fill_dev_details(td, tg);
212 252
213 /* Add group onto cgroup list */ 253 /* Add group onto cgroup list */
214 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
215 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, 254 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
216 MKDEV(major, minor), BLKIO_POLICY_THROTL); 255 tg->blkg.dev, BLKIO_POLICY_THROTL);
217 256
218 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); 257 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
219 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); 258 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
220 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); 259 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
221 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); 260 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
222 261
223 hlist_add_head(&tg->tg_node, &td->tg_list); 262 throtl_add_group_to_td_list(td, tg);
224 td->nr_undestroyed_grps++; 263}
225done: 264
265/* Should be called without queue lock and outside of rcu period */
266static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
267{
268 struct throtl_grp *tg = NULL;
269 int ret;
270
271 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
272 if (!tg)
273 return NULL;
274
275 ret = blkio_alloc_blkg_stats(&tg->blkg);
276
277 if (ret) {
278 kfree(tg);
279 return NULL;
280 }
281
282 throtl_init_group(tg);
226 return tg; 283 return tg;
227} 284}
228 285
229static struct throtl_grp * throtl_get_tg(struct throtl_data *td) 286static struct
287throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
230{ 288{
231 struct throtl_grp *tg = NULL; 289 struct throtl_grp *tg = NULL;
290 void *key = td;
291
292 /*
293 * This is the common case when there are no blkio cgroups.
294 * Avoid lookup in this case
295 */
296 if (blkcg == &blkio_root_cgroup)
297 tg = td->root_tg;
298 else
299 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
300
301 __throtl_tg_fill_dev_details(td, tg);
302 return tg;
303}
304
305/*
306 * This function returns with queue lock unlocked in case of error, like
307 * request queue is no more
308 */
309static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
310{
311 struct throtl_grp *tg = NULL, *__tg = NULL;
232 struct blkio_cgroup *blkcg; 312 struct blkio_cgroup *blkcg;
313 struct request_queue *q = td->queue;
233 314
234 rcu_read_lock(); 315 rcu_read_lock();
235 blkcg = task_blkio_cgroup(current); 316 blkcg = task_blkio_cgroup(current);
236 tg = throtl_find_alloc_tg(td, blkcg); 317 tg = throtl_find_tg(td, blkcg);
237 if (!tg) 318 if (tg) {
238 tg = &td->root_tg; 319 rcu_read_unlock();
320 return tg;
321 }
322
323 /*
324 * Need to allocate a group. Allocation of group also needs allocation
325 * of per cpu stats which in-turn takes a mutex() and can block. Hence
326 * we need to drop rcu lock and queue_lock before we call alloc
327 *
328 * Take the request queue reference to make sure queue does not
329 * go away once we return from allocation.
330 */
331 blk_get_queue(q);
332 rcu_read_unlock();
333 spin_unlock_irq(q->queue_lock);
334
335 tg = throtl_alloc_tg(td);
336 /*
337 * We might have slept in group allocation. Make sure queue is not
338 * dead
339 */
340 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
341 blk_put_queue(q);
342 if (tg)
343 kfree(tg);
344
345 return ERR_PTR(-ENODEV);
346 }
347 blk_put_queue(q);
348
349 /* Group allocated and queue is still alive. take the lock */
350 spin_lock_irq(q->queue_lock);
351
352 /*
353 * Initialize the new group. After sleeping, read the blkcg again.
354 */
355 rcu_read_lock();
356 blkcg = task_blkio_cgroup(current);
357
358 /*
359 * If some other thread already allocated the group while we were
360 * not holding queue lock, free up the group
361 */
362 __tg = throtl_find_tg(td, blkcg);
363
364 if (__tg) {
365 kfree(tg);
366 rcu_read_unlock();
367 return __tg;
368 }
369
370 /* Group allocation failed. Account the IO to root group */
371 if (!tg) {
372 tg = td->root_tg;
373 return tg;
374 }
375
376 throtl_init_add_tg_lists(td, tg, blkcg);
239 rcu_read_unlock(); 377 rcu_read_unlock();
240 return tg; 378 return tg;
241} 379}
@@ -544,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
544 return 0; 682 return 0;
545} 683}
546 684
685static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
686 if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
687 return 1;
688 return 0;
689}
690
547/* 691/*
548 * Returns whether one can dispatch a bio or not. Also returns approx number 692 * Returns whether one can dispatch a bio or not. Also returns approx number
549 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 693 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
@@ -608,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
608 tg->bytes_disp[rw] += bio->bi_size; 752 tg->bytes_disp[rw] += bio->bi_size;
609 tg->io_disp[rw]++; 753 tg->io_disp[rw]++;
610 754
611 /*
612 * TODO: This will take blkg->stats_lock. Figure out a way
613 * to avoid this cost.
614 */
615 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); 755 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
616} 756}
617 757
@@ -989,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
989 struct throtl_grp *tg; 1129 struct throtl_grp *tg;
990 struct bio *bio = *biop; 1130 struct bio *bio = *biop;
991 bool rw = bio_data_dir(bio), update_disptime = true; 1131 bool rw = bio_data_dir(bio), update_disptime = true;
1132 struct blkio_cgroup *blkcg;
992 1133
993 if (bio->bi_rw & REQ_THROTTLED) { 1134 if (bio->bi_rw & REQ_THROTTLED) {
994 bio->bi_rw &= ~REQ_THROTTLED; 1135 bio->bi_rw &= ~REQ_THROTTLED;
995 return 0; 1136 return 0;
996 } 1137 }
997 1138
1139 /*
1140 * A throtl_grp pointer retrieved under rcu can be used to access
1141 * basic fields like stats and io rates. If a group has no rules,
1142 * just update the dispatch stats in lockless manner and return.
1143 */
1144
1145 rcu_read_lock();
1146 blkcg = task_blkio_cgroup(current);
1147 tg = throtl_find_tg(td, blkcg);
1148 if (tg) {
1149 throtl_tg_fill_dev_details(td, tg);
1150
1151 if (tg_no_rule_group(tg, rw)) {
1152 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
1153 rw, bio->bi_rw & REQ_SYNC);
1154 rcu_read_unlock();
1155 return 0;
1156 }
1157 }
1158 rcu_read_unlock();
1159
1160 /*
1161 * Either group has not been allocated yet or it is not an unlimited
1162 * IO group
1163 */
1164
998 spin_lock_irq(q->queue_lock); 1165 spin_lock_irq(q->queue_lock);
999 tg = throtl_get_tg(td); 1166 tg = throtl_get_tg(td);
1000 1167
1168 if (IS_ERR(tg)) {
1169 if (PTR_ERR(tg) == -ENODEV) {
1170 /*
1171 * Queue is gone. No queue lock held here.
1172 */
1173 return -ENODEV;
1174 }
1175 }
1176
1001 if (tg->nr_queued[rw]) { 1177 if (tg->nr_queued[rw]) {
1002 /* 1178 /*
1003 * There is already another bio queued in same dir. No 1179 * There is already another bio queued in same dir. No
@@ -1060,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q)
1060 INIT_HLIST_HEAD(&td->tg_list); 1236 INIT_HLIST_HEAD(&td->tg_list);
1061 td->tg_service_tree = THROTL_RB_ROOT; 1237 td->tg_service_tree = THROTL_RB_ROOT;
1062 td->limits_changed = false; 1238 td->limits_changed = false;
1239 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1063 1240
1064 /* Init root group */ 1241 /* alloc and Init root group. */
1065 tg = &td->root_tg; 1242 td->queue = q;
1066 INIT_HLIST_NODE(&tg->tg_node); 1243 tg = throtl_alloc_tg(td);
1067 RB_CLEAR_NODE(&tg->rb_node);
1068 bio_list_init(&tg->bio_lists[0]);
1069 bio_list_init(&tg->bio_lists[1]);
1070
1071 /* Practically unlimited BW */
1072 tg->bps[0] = tg->bps[1] = -1;
1073 tg->iops[0] = tg->iops[1] = -1;
1074 td->limits_changed = false;
1075 1244
1076 /* 1245 if (!tg) {
1077 * Set root group reference to 2. One reference will be dropped when 1246 kfree(td);
1078 * all groups on tg_list are being deleted during queue exit. Other 1247 return -ENOMEM;
1079 * reference will remain there as we don't want to delete this group 1248 }
1080 * as it is statically allocated and gets destroyed when throtl_data
1081 * goes away.
1082 */
1083 atomic_set(&tg->ref, 2);
1084 hlist_add_head(&tg->tg_node, &td->tg_list);
1085 td->nr_undestroyed_grps++;
1086 1249
1087 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 1250 td->root_tg = tg;
1088 1251
1089 rcu_read_lock(); 1252 rcu_read_lock();
1090 blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, 1253 throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1091 0, BLKIO_POLICY_THROTL);
1092 rcu_read_unlock(); 1254 rcu_read_unlock();
1093 1255
1094 /* Attach throtl data to request queue */ 1256 /* Attach throtl data to request queue */
1095 td->queue = q;
1096 q->td = td; 1257 q->td = td;
1097 return 0; 1258 return 0;
1098} 1259}
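
Two ideas drive the blk-throttle changes above: groups are now freed through call_rcu() so lockless readers may keep dereferencing a group for one grace period, and throtl_get_tg() allocates a group with the queue lock dropped (the per-cpu stats allocation can sleep), then retakes the lock, re-checks, and discards its copy if another thread won the race. The second pattern, reduced to a generic sketch with assumed names and not kernel code, looks like this:

/*
 * Sketch: look up under the lock; if missing, drop the lock to allocate
 * (allocation may sleep), then retake the lock, re-check and free the
 * duplicate if somebody else installed an object meanwhile.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *shared_obj;			/* stands in for td->root_tg etc. */

static int *get_obj(void)
{
	int *obj, *new_obj;

	pthread_mutex_lock(&lock);
	obj = shared_obj;
	pthread_mutex_unlock(&lock);	/* cannot sleep with the lock held */
	if (obj)
		return obj;

	new_obj = malloc(sizeof(*new_obj));	/* may "sleep" */
	if (!new_obj)
		return NULL;
	*new_obj = 42;

	pthread_mutex_lock(&lock);
	obj = shared_obj;
	if (obj) {			/* somebody else allocated meanwhile */
		pthread_mutex_unlock(&lock);
		free(new_obj);
		return obj;
	}
	shared_obj = new_obj;
	pthread_mutex_unlock(&lock);
	return new_obj;
}

int main(void)
{
	printf("obj value: %d\n", *get_obj());
	free(shared_obj);
	return 0;
}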
diff --git a/block/blk.h b/block/blk.h
index 61263463e38e..d6586287adc9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -62,7 +62,28 @@ static inline struct request *__elv_next_request(struct request_queue *q)
62 return rq; 62 return rq;
63 } 63 }
64 64
65 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 65 /*
66 * Flush request is running and flush request isn't queueable
67 * in the drive, we can hold the queue till flush request is
68 * finished. Even we don't do this, driver can't dispatch next
69 * requests and will requeue them. And this can improve
70 * throughput too. For example, we have request flush1, write1,
71 * flush 2. flush1 is dispatched, then queue is hold, write1
72 * isn't inserted to queue. After flush1 is finished, flush2
73 * will be dispatched. Since disk cache is already clean,
74 * flush2 will be finished very soon, so looks like flush2 is
75 * folded to flush1.
76 * Since the queue is hold, a flag is set to indicate the queue
77 * should be restarted later. Please see flush_end_io() for
78 * details.
79 */
80 if (q->flush_pending_idx != q->flush_running_idx &&
81 !queue_flush_queueable(q)) {
82 q->flush_queue_delayed = 1;
83 return NULL;
84 }
85 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
86 !q->elevator->ops->elevator_dispatch_fn(q, 0))
66 return NULL; 87 return NULL;
67 } 88 }
68} 89}
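
The new test in __elv_next_request() only fires when a flush is actually in flight (flush_pending_idx differs from flush_running_idx) on hardware that cannot queue flushes; it then records flush_queue_delayed so flush_end_io() knows to restart the queue. A toy model of just that decision, with a cut-down struct standing in for struct request_queue:

#include <stdio.h>
#include <stdbool.h>

/* Toy model of the queue fields the new check looks at. */
struct toy_queue {
	unsigned int flush_pending_idx:1;
	unsigned int flush_running_idx:1;
	unsigned int flush_not_queueable:1;
	unsigned int flush_queue_delayed:1;
};

/* Returns true when dispatch should be held until the running flush
 * completes; flush_end_io() would then restart the queue. */
static bool hold_for_flush(struct toy_queue *q)
{
	if (q->flush_pending_idx != q->flush_running_idx &&
	    q->flush_not_queueable) {
		q->flush_queue_delayed = 1;
		return true;
	}
	return false;
}

int main(void)
{
	struct toy_queue q = { .flush_pending_idx = 1, .flush_running_idx = 0,
			       .flush_not_queueable = 1 };

	printf("hold=%d delayed=%d\n", hold_for_flush(&q), q.flush_queue_delayed);
	return 0;
}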
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ab7a9e6a9b1c..7c52d6888924 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -300,7 +300,9 @@ struct cfq_data {
300 300
301 /* List of cfq groups being managed on this device*/ 301 /* List of cfq groups being managed on this device*/
302 struct hlist_head cfqg_list; 302 struct hlist_head cfqg_list;
303 struct rcu_head rcu; 303
304 /* Number of groups which are on blkcg->blkg_list */
305 unsigned int nr_blkcg_linked_grps;
304}; 306};
305 307
306static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 308static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -665,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
665 if (rq2 == NULL) 667 if (rq2 == NULL)
666 return rq1; 668 return rq1;
667 669
668 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 670 if (rq_is_sync(rq1) != rq_is_sync(rq2))
669 return rq1; 671 return rq_is_sync(rq1) ? rq1 : rq2;
670 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 672
671 return rq2; 673 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
672 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) 674 return rq1->cmd_flags & REQ_META ? rq1 : rq2;
673 return rq1;
674 else if ((rq2->cmd_flags & REQ_META) &&
675 !(rq1->cmd_flags & REQ_META))
676 return rq2;
677 675
678 s1 = blk_rq_pos(rq1); 676 s1 = blk_rq_pos(rq1);
679 s2 = blk_rq_pos(rq2); 677 s2 = blk_rq_pos(rq2);
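
The reworked checks in cfq_choose_req() rely on two identities: rq_is_sync(rq1) != rq_is_sync(rq2) is true exactly when one request is sync and the other is not, and (flags1 ^ flags2) & REQ_META is non-zero exactly when the REQ_META bits differ. The small program below brute-forces all sixteen sync/meta combinations and asserts the old and new formulations choose the same request; REQ_META here is an arbitrary stand-in bit, not the kernel's definition.

#include <assert.h>
#include <stdio.h>

#define REQ_META (1u << 0)	/* arbitrary stand-in flag bit */

struct rq { int sync; unsigned int cmd_flags; };

/* Old formulation: chained if/else on sync, then on REQ_META. */
static struct rq *old_pick(struct rq *rq1, struct rq *rq2)
{
	if (rq1->sync && !rq2->sync)
		return rq1;
	else if (rq2->sync && !rq1->sync)
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
		return rq2;
	return NULL;		/* fall through to position-based choice */
}

/* New formulation: difference tests via != and XOR. */
static struct rq *new_pick(struct rq *rq1, struct rq *rq2)
{
	if (rq1->sync != rq2->sync)
		return rq1->sync ? rq1 : rq2;
	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
		return rq1->cmd_flags & REQ_META ? rq1 : rq2;
	return NULL;
}

int main(void)
{
	for (int s1 = 0; s1 < 2; s1++)
	for (int s2 = 0; s2 < 2; s2++)
	for (int m1 = 0; m1 < 2; m1++)
	for (int m2 = 0; m2 < 2; m2++) {
		struct rq rq1 = { s1, m1 ? REQ_META : 0 };
		struct rq rq2 = { s2, m2 ? REQ_META : 0 };

		assert(old_pick(&rq1, &rq2) == new_pick(&rq1, &rq2));
	}
	printf("old and new cfq_choose_req() checks agree for all 16 cases\n");
	return 0;
}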
@@ -1014,28 +1012,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
1014 cfqg->needs_update = true; 1012 cfqg->needs_update = true;
1015} 1013}
1016 1014
1017static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, 1015static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
1018 struct blkio_cgroup *blkcg, int create) 1016 struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
1019{ 1017{
1020 struct cfq_group *cfqg = NULL;
1021 void *key = cfqd;
1022 int i, j;
1023 struct cfq_rb_root *st;
1024 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 1018 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1025 unsigned int major, minor; 1019 unsigned int major, minor;
1026 1020
1027 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 1021 /*
1028 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 1022 * Add group onto cgroup list. It might happen that bdi->dev is
1023 * not initialized yet. Initialize this new group without major
1024 * and minor info and this info will be filled in once a new thread
1025 * comes for IO.
1026 */
1027 if (bdi->dev) {
1029 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1028 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1030 cfqg->blkg.dev = MKDEV(major, minor); 1029 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1031 goto done; 1030 (void *)cfqd, MKDEV(major, minor));
1032 } 1031 } else
1033 if (cfqg || !create) 1032 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1034 goto done; 1033 (void *)cfqd, 0);
1034
1035 cfqd->nr_blkcg_linked_grps++;
1036 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1037
1038 /* Add group on cfqd list */
1039 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1040}
1041
1042/*
1043	 * Should be called from sleepable context. Do not hold the request queue
1044	 * lock: per cpu stats are allocated dynamically and alloc_percpu() needs
1045	 * to be called from sleepable context.
1046 */
1047static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1048{
1049 struct cfq_group *cfqg = NULL;
1050 int i, j, ret;
1051 struct cfq_rb_root *st;
1035 1052
1036 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 1053 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1037 if (!cfqg) 1054 if (!cfqg)
1038 goto done; 1055 return NULL;
1039 1056
1040 for_each_cfqg_st(cfqg, i, j, st) 1057 for_each_cfqg_st(cfqg, i, j, st)
1041 *st = CFQ_RB_ROOT; 1058 *st = CFQ_RB_ROOT;
@@ -1049,43 +1066,94 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
1049 */ 1066 */
1050 cfqg->ref = 1; 1067 cfqg->ref = 1;
1051 1068
1069 ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1070 if (ret) {
1071 kfree(cfqg);
1072 return NULL;
1073 }
1074
1075 return cfqg;
1076}
1077
1078static struct cfq_group *
1079cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
1080{
1081 struct cfq_group *cfqg = NULL;
1082 void *key = cfqd;
1083 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1084 unsigned int major, minor;
1085
1052 /* 1086 /*
1053 * Add group onto cgroup list. It might happen that bdi->dev is 1087 * This is the common case when there are no blkio cgroups.
1054 * not initialized yet. Initialize this new group without major 1088 * Avoid lookup in this case
1055 * and minor info and this info will be filled in once a new thread
1056 * comes for IO. See code above.
1057 */ 1089 */
1058 if (bdi->dev) { 1090 if (blkcg == &blkio_root_cgroup)
1059 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1091 cfqg = &cfqd->root_group;
1060 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1092 else
1061 MKDEV(major, minor)); 1093 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1062 } else
1063 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1064 0);
1065
1066 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1067 1094
1068 /* Add group on cfqd list */ 1095 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1069 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 1096 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1097 cfqg->blkg.dev = MKDEV(major, minor);
1098 }
1070 1099
1071done:
1072 return cfqg; 1100 return cfqg;
1073} 1101}
1074 1102
1075/* 1103/*
1076 * Search for the cfq group current task belongs to. If create = 1, then also 1104 * Search for the cfq group current task belongs to. request_queue lock must
1077 * create the cfq group if it does not exist. request_queue lock must be held. 1105 * be held.
1078 */ 1106 */
1079static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1107static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1080{ 1108{
1081 struct blkio_cgroup *blkcg; 1109 struct blkio_cgroup *blkcg;
1082 struct cfq_group *cfqg = NULL; 1110 struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1111 struct request_queue *q = cfqd->queue;
1083 1112
1084 rcu_read_lock(); 1113 rcu_read_lock();
1085 blkcg = task_blkio_cgroup(current); 1114 blkcg = task_blkio_cgroup(current);
1086 cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); 1115 cfqg = cfq_find_cfqg(cfqd, blkcg);
1087 if (!cfqg && create) 1116 if (cfqg) {
1117 rcu_read_unlock();
1118 return cfqg;
1119 }
1120
1121 /*
1122	 * Need to allocate a group. Allocating a group also requires allocating
1123	 * its per cpu stats, which in turn takes a mutex and can block. Hence
1124	 * we need to drop the rcu lock and queue_lock before calling the alloc.
1125 *
1126 * Not taking any queue reference here and assuming that queue is
1127 * around by the time we return. CFQ queue allocation code does
1128 * the same. It might be racy though.
1129 */
1130
1131 rcu_read_unlock();
1132 spin_unlock_irq(q->queue_lock);
1133
1134 cfqg = cfq_alloc_cfqg(cfqd);
1135
1136 spin_lock_irq(q->queue_lock);
1137
1138 rcu_read_lock();
1139 blkcg = task_blkio_cgroup(current);
1140
1141 /*
1142	 * If some other thread already allocated the group while we were
1143	 * not holding the queue lock, free up the one we just allocated
1144 */
1145 __cfqg = cfq_find_cfqg(cfqd, blkcg);
1146
1147 if (__cfqg) {
1148 kfree(cfqg);
1149 rcu_read_unlock();
1150 return __cfqg;
1151 }
1152
1153 if (!cfqg)
1088 cfqg = &cfqd->root_group; 1154 cfqg = &cfqd->root_group;
1155
1156 cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1089 rcu_read_unlock(); 1157 rcu_read_unlock();
1090 return cfqg; 1158 return cfqg;
1091} 1159}
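
The rewritten cfq_get_cfqg() drops the rcu lock and queue_lock, performs the (possibly sleeping) group allocation, re-takes the locks and then re-checks whether another thread created the group in the meantime, freeing its own copy if so. A compact pthread sketch of that allocate-unlocked/re-check-under-the-lock pattern; the names and the plain mutex standing in for queue_lock are invented for illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for queue_lock */
static struct group { int id; } *shared_group;			/* protected by lock */

static struct group *get_group(void)
{
	struct group *g, *winner;

	pthread_mutex_lock(&lock);
	if (shared_group) {		/* fast path: already there */
		g = shared_group;
		pthread_mutex_unlock(&lock);
		return g;
	}
	pthread_mutex_unlock(&lock);	/* drop the lock: allocation may block */

	g = malloc(sizeof(*g));		/* may "sleep" in the kernel analogue */

	pthread_mutex_lock(&lock);
	winner = shared_group;
	if (winner) {			/* somebody else allocated meanwhile */
		free(g);		/* throw ours away, use theirs */
		pthread_mutex_unlock(&lock);
		return winner;
	}
	if (g) {
		g->id = 1;
		shared_group = g;	/* publish under the lock */
	}
	pthread_mutex_unlock(&lock);
	return g;			/* NULL means fall back (root group in CFQ) */
}

int main(void)
{
	printf("group %p\n", (void *)get_group());
	return 0;
}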
@@ -1118,6 +1186,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
1118 return; 1186 return;
1119 for_each_cfqg_st(cfqg, i, j, st) 1187 for_each_cfqg_st(cfqg, i, j, st)
1120 BUG_ON(!RB_EMPTY_ROOT(&st->rb)); 1188 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1189 free_percpu(cfqg->blkg.stats_cpu);
1121 kfree(cfqg); 1190 kfree(cfqg);
1122} 1191}
1123 1192
@@ -1176,7 +1245,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1176} 1245}
1177 1246
1178#else /* GROUP_IOSCHED */ 1247#else /* GROUP_IOSCHED */
1179static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1248static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1180{ 1249{
1181 return &cfqd->root_group; 1250 return &cfqd->root_group;
1182} 1251}
@@ -1210,7 +1279,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1210 struct cfq_rb_root *service_tree; 1279 struct cfq_rb_root *service_tree;
1211 int left; 1280 int left;
1212 int new_cfqq = 1; 1281 int new_cfqq = 1;
1213 int group_changed = 0;
1214 1282
1215 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1283 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1216 cfqq_type(cfqq)); 1284 cfqq_type(cfqq));
@@ -1281,7 +1349,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1281 rb_link_node(&cfqq->rb_node, parent, p); 1349 rb_link_node(&cfqq->rb_node, parent, p);
1282 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1350 rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1283 service_tree->count++; 1351 service_tree->count++;
1284 if ((add_front || !new_cfqq) && !group_changed) 1352 if (add_front || !new_cfqq)
1285 return; 1353 return;
1286 cfq_group_notify_queue_add(cfqd, cfqq->cfqg); 1354 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1287} 1355}
@@ -2029,7 +2097,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2029 2097
2030 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2098 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2031 2099
2032 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 2100 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2033} 2101}
2034 2102
2035/* 2103/*
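
The cfq_prio_to_maxrq() change is pure algebra: 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - ioprio)) simplifies to 2 * base_rq * (CFQ_PRIO_LISTS - ioprio), and CFQ_PRIO_LISTS equals IOPRIO_BE_NR (both 8) in this tree, so the constant can be swapped. A quick standalone check over the valid ioprio range, with those constants assumed as stated:

#include <assert.h>
#include <stdio.h>

#define CFQ_PRIO_LISTS	8	/* assumed; matches IOPRIO_BE_NR here */
#define IOPRIO_BE_NR	8

int main(void)
{
	const int base_rq = 4;	/* arbitrary sample value of cfq_slice_async_rq */

	for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++) {
		int old_val = 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - ioprio));
		int new_val = 2 * base_rq * (IOPRIO_BE_NR - ioprio);

		assert(old_val == new_val);
		printf("ioprio=%d maxrq=%d\n", ioprio, new_val);
	}
	return 0;
}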
@@ -2911,7 +2979,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2911 struct cfq_group *cfqg; 2979 struct cfq_group *cfqg;
2912 2980
2913retry: 2981retry:
2914 cfqg = cfq_get_cfqg(cfqd, 1); 2982 cfqg = cfq_get_cfqg(cfqd);
2915 cic = cfq_cic_lookup(cfqd, ioc); 2983 cic = cfq_cic_lookup(cfqd, ioc);
2916 /* cic always exists here */ 2984 /* cic always exists here */
2917 cfqq = cic_to_cfqq(cic, is_sync); 2985 cfqq = cic_to_cfqq(cic, is_sync);
@@ -3815,15 +3883,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
3815 cfq_put_queue(cfqd->async_idle_cfqq); 3883 cfq_put_queue(cfqd->async_idle_cfqq);
3816} 3884}
3817 3885
3818static void cfq_cfqd_free(struct rcu_head *head)
3819{
3820 kfree(container_of(head, struct cfq_data, rcu));
3821}
3822
3823static void cfq_exit_queue(struct elevator_queue *e) 3886static void cfq_exit_queue(struct elevator_queue *e)
3824{ 3887{
3825 struct cfq_data *cfqd = e->elevator_data; 3888 struct cfq_data *cfqd = e->elevator_data;
3826 struct request_queue *q = cfqd->queue; 3889 struct request_queue *q = cfqd->queue;
3890 bool wait = false;
3827 3891
3828 cfq_shutdown_timer_wq(cfqd); 3892 cfq_shutdown_timer_wq(cfqd);
3829 3893
@@ -3842,7 +3906,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
3842 3906
3843 cfq_put_async_queues(cfqd); 3907 cfq_put_async_queues(cfqd);
3844 cfq_release_cfq_groups(cfqd); 3908 cfq_release_cfq_groups(cfqd);
3845 cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3909
3910 /*
3911	 * If there are groups which we could not unlink from the blkcg list,
3912	 * wait for an RCU grace period for them to be freed.
3913 */
3914 if (cfqd->nr_blkcg_linked_grps)
3915 wait = true;
3846 3916
3847 spin_unlock_irq(q->queue_lock); 3917 spin_unlock_irq(q->queue_lock);
3848 3918
@@ -3852,8 +3922,25 @@ static void cfq_exit_queue(struct elevator_queue *e)
3852 ida_remove(&cic_index_ida, cfqd->cic_index); 3922 ida_remove(&cic_index_ida, cfqd->cic_index);
3853 spin_unlock(&cic_index_lock); 3923 spin_unlock(&cic_index_lock);
3854 3924
3855 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3925 /*
3856 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3926 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3927 * Do this wait only if there are other unlinked groups out
3928	 * there. This can happen if the cgroup deletion path claimed
3929	 * responsibility for cleaning up a group before the queue cleanup
3930	 * code got to the group.
3931 *
3932 * Do not call synchronize_rcu() unconditionally as there are drivers
3933	 * which create/delete request queues hundreds of times during scan/boot
3934 * and synchronize_rcu() can take significant time and slow down boot.
3935 */
3936 if (wait)
3937 synchronize_rcu();
3938
3939#ifdef CONFIG_CFQ_GROUP_IOSCHED
3940 /* Free up per cpu stats for root group */
3941 free_percpu(cfqd->root_group.blkg.stats_cpu);
3942#endif
3943 kfree(cfqd);
3857} 3944}
3858 3945
3859static int cfq_alloc_cic_index(void) 3946static int cfq_alloc_cic_index(void)
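
cfq_exit_queue() now pays for synchronize_rcu() only when some groups were actually handed to blkcg-side readers, since drivers that create and destroy request queues hundreds of times during a scan would otherwise eat one grace period per queue. A trivial illustration of gating an expensive wait behind such a flag; sleep() is merely a stand-in for synchronize_rcu().

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* sleep() stands in for synchronize_rcu(): both are expensive waits that
 * should only be paid when readers might still hold references. */
static void teardown(bool handed_out_to_readers)
{
	if (handed_out_to_readers)
		sleep(1);	/* wait out the "grace period" */
	printf("freed (waited=%d)\n", handed_out_to_readers);
}

int main(void)
{
	teardown(false);	/* common scan/boot case: no wait */
	teardown(true);		/* groups were linked: wait once */
	return 0;
}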
@@ -3886,8 +3973,12 @@ static void *cfq_init_queue(struct request_queue *q)
3886 return NULL; 3973 return NULL;
3887 3974
3888 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3975 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3889 if (!cfqd) 3976 if (!cfqd) {
3977 spin_lock(&cic_index_lock);
3978 ida_remove(&cic_index_ida, i);
3979 spin_unlock(&cic_index_lock);
3890 return NULL; 3980 return NULL;
3981 }
3891 3982
3892 /* 3983 /*
3893	 * Don't need to take queue_lock in the routine, since we are	3984
@@ -3909,14 +4000,29 @@ static void *cfq_init_queue(struct request_queue *q)
3909 4000
3910#ifdef CONFIG_CFQ_GROUP_IOSCHED 4001#ifdef CONFIG_CFQ_GROUP_IOSCHED
3911 /* 4002 /*
3912 * Take a reference to root group which we never drop. This is just 4003 * Set root group reference to 2. One reference will be dropped when
3913 * to make sure that cfq_put_cfqg() does not try to kfree root group 4004 * all groups on cfqd->cfqg_list are being deleted during queue exit.
4005 * Other reference will remain there as we don't want to delete this
4006 * group as it is statically allocated and gets destroyed when
4007	 * cfq_data goes away.
3914 */ 4008 */
3915 cfqg->ref = 1; 4009 cfqg->ref = 2;
4010
4011 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
4012 kfree(cfqg);
4013 kfree(cfqd);
4014 return NULL;
4015 }
4016
3916 rcu_read_lock(); 4017 rcu_read_lock();
4018
3917 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 4019 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3918 (void *)cfqd, 0); 4020 (void *)cfqd, 0);
3919 rcu_read_unlock(); 4021 rcu_read_unlock();
4022 cfqd->nr_blkcg_linked_grps++;
4023
4024 /* Add group on cfqd->cfqg_list */
4025 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
3920#endif 4026#endif
3921 /* 4027 /*
3922 * Not strictly needed (since RB_ROOT just clears the node and we 4028 * Not strictly needed (since RB_ROOT just clears the node and we
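
cfq_init_queue() gains two extra failure points: a failed cfqd allocation must give back the cic index reserved just before it, and a failed blkio_alloc_blkg_stats() must free both cfqg and cfqd. In other words, every later failure unwinds every earlier step. A generic sketch of that staged-unwind idiom in plain C, using goto labels the way the kernel code usually does; the names are invented.

#include <stdlib.h>
#include <errno.h>

struct ctx { void *index; void *data; void *stats; };

static int staged_init(struct ctx *c)
{
	c->index = malloc(16);			/* step 1: reserve an index */
	if (!c->index)
		return -ENOMEM;

	c->data = malloc(64);			/* step 2: main structure */
	if (!c->data)
		goto fail_index;

	c->stats = malloc(32);			/* step 3: per-cpu-ish stats */
	if (!c->stats)
		goto fail_data;

	return 0;

fail_data:
	free(c->data);				/* undo step 2 */
fail_index:
	free(c->index);				/* undo step 1 */
	return -ENOMEM;
}

int main(void)
{
	struct ctx c = { 0 };

	return staged_init(&c) ? 1 : 0;
}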
diff --git a/block/elevator.c b/block/elevator.c
index 45ca1e34f582..b0b38ce0dcb6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name)
155 155
156 e = elevator_find(name); 156 e = elevator_find(name);
157 if (!e) { 157 if (!e) {
158 char elv[ELV_NAME_MAX + strlen("-iosched")];
159
160 spin_unlock(&elv_list_lock); 158 spin_unlock(&elv_list_lock);
161 159 request_module("%s-iosched", name);
162 snprintf(elv, sizeof(elv), "%s-iosched", name);
163
164 request_module("%s", elv);
165 spin_lock(&elv_list_lock); 160 spin_lock(&elv_list_lock);
166 e = elevator_find(name); 161 e = elevator_find(name);
167 } 162 }
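
The elevator_get() cleanup works because request_module() accepts a printf-style format, so the on-stack name buffer and the snprintf() call are unnecessary. A userspace analogue of the same simplification, with a variadic load_module() helper standing in for request_module():

#include <stdarg.h>
#include <stdio.h>

/* Variadic stand-in for request_module(): formats its own string. */
static void load_module(const char *fmt, ...)
{
	char name[64];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(name, sizeof(name), fmt, ap);
	va_end(ap);
	printf("modprobe %s\n", name);
}

int main(void)
{
	const char *elv = "cfq";
	char buf[32];

	/* before: build "cfq-iosched" by hand, then pass it as "%s" */
	snprintf(buf, sizeof(buf), "%s-iosched", elv);
	load_module("%s", buf);

	/* after: let the callee do the formatting */
	load_module("%s-iosched", elv);
	return 0;
}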
@@ -421,8 +416,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
421 struct list_head *entry; 416 struct list_head *entry;
422 int stop_flags; 417 int stop_flags;
423 418
424 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
425
426 if (q->last_merge == rq) 419 if (q->last_merge == rq)
427 q->last_merge = NULL; 420 q->last_merge = NULL;
428 421
@@ -661,8 +654,6 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
661 654
662 rq->q = q; 655 rq->q = q;
663 656
664 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
665
666 if (rq->cmd_flags & REQ_SOFTBARRIER) { 657 if (rq->cmd_flags & REQ_SOFTBARRIER) {
667 /* barriers are scheduling boundary, update end_sector */ 658 /* barriers are scheduling boundary, update end_sector */
668 if (rq->cmd_type == REQ_TYPE_FS || 659 if (rq->cmd_type == REQ_TYPE_FS ||
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 30ea95f43e79..d51f9795c064 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1089,21 +1089,21 @@ static int atapi_drain_needed(struct request *rq)
1089static int ata_scsi_dev_config(struct scsi_device *sdev, 1089static int ata_scsi_dev_config(struct scsi_device *sdev,
1090 struct ata_device *dev) 1090 struct ata_device *dev)
1091{ 1091{
1092 struct request_queue *q = sdev->request_queue;
1093
1092 if (!ata_id_has_unload(dev->id)) 1094 if (!ata_id_has_unload(dev->id))
1093 dev->flags |= ATA_DFLAG_NO_UNLOAD; 1095 dev->flags |= ATA_DFLAG_NO_UNLOAD;
1094 1096
1095 /* configure max sectors */ 1097 /* configure max sectors */
1096 blk_queue_max_hw_sectors(sdev->request_queue, dev->max_sectors); 1098 blk_queue_max_hw_sectors(q, dev->max_sectors);
1097 1099
1098 if (dev->class == ATA_DEV_ATAPI) { 1100 if (dev->class == ATA_DEV_ATAPI) {
1099 struct request_queue *q = sdev->request_queue;
1100 void *buf; 1101 void *buf;
1101 1102
1102 sdev->sector_size = ATA_SECT_SIZE; 1103 sdev->sector_size = ATA_SECT_SIZE;
1103 1104
1104 /* set DMA padding */ 1105 /* set DMA padding */
1105 blk_queue_update_dma_pad(sdev->request_queue, 1106 blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1);
1106 ATA_DMA_PAD_SZ - 1);
1107 1107
1108 /* configure draining */ 1108 /* configure draining */
1109 buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); 1109 buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL);
@@ -1131,8 +1131,7 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
1131 "sector_size=%u > PAGE_SIZE, PIO may malfunction\n", 1131 "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
1132 sdev->sector_size); 1132 sdev->sector_size);
1133 1133
1134 blk_queue_update_dma_alignment(sdev->request_queue, 1134 blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
1135 sdev->sector_size - 1);
1136 1135
1137 if (dev->flags & ATA_DFLAG_AN) 1136 if (dev->flags & ATA_DFLAG_AN)
1138 set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events); 1137 set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
@@ -1145,6 +1144,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
1145 scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth); 1144 scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth);
1146 } 1145 }
1147 1146
1147 blk_queue_flush_queueable(q, false);
1148
1148 dev->sdev = sdev; 1149 dev->sdev = sdev;
1149 return 0; 1150 return 0;
1150} 1151}
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 8690e31d9932..a0aabd904a51 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -320,6 +320,8 @@ static void pcd_init_units(void)
320 disk->first_minor = unit; 320 disk->first_minor = unit;
321 strcpy(disk->disk_name, cd->name); /* umm... */ 321 strcpy(disk->disk_name, cd->name); /* umm... */
322 disk->fops = &pcd_bdops; 322 disk->fops = &pcd_bdops;
323 disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
324 disk->events = DISK_EVENT_MEDIA_CHANGE;
323 } 325 }
324} 326}
325 327
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index e427fbe45999..ae15a4ddaa9b 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -625,7 +625,9 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
625 blk_queue_max_hw_sectors(q, 4096 / 512); 625 blk_queue_max_hw_sectors(q, 4096 / 512);
626 gendisk->queue = q; 626 gendisk->queue = q;
627 gendisk->fops = &viocd_fops; 627 gendisk->fops = &viocd_fops;
628 gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE; 628 gendisk->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE |
629 GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
630 gendisk->events = DISK_EVENT_MEDIA_CHANGE;
629 set_capacity(gendisk, 0); 631 set_capacity(gendisk, 0);
630 gendisk->private_data = d; 632 gendisk->private_data = d;
631 d->viocd_disk = gendisk; 633 d->viocd_disk = gendisk;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index a5ec5a7cb381..6e5123b1d341 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1781,7 +1781,8 @@ static int ide_cd_probe(ide_drive_t *drive)
1781 1781
1782 ide_cd_read_toc(drive, &sense); 1782 ide_cd_read_toc(drive, &sense);
1783 g->fops = &idecd_ops; 1783 g->fops = &idecd_ops;
1784 g->flags |= GENHD_FL_REMOVABLE; 1784 g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
1785 g->events = DISK_EVENT_MEDIA_CHANGE;
1785 add_disk(g); 1786 add_disk(g);
1786 return 0; 1787 return 0;
1787 1788
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 95019c747cc1..4778e2707168 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -636,7 +636,7 @@ static int sr_probe(struct device *dev)
636 disk->first_minor = minor; 636 disk->first_minor = minor;
637 sprintf(disk->disk_name, "sr%d", minor); 637 sprintf(disk->disk_name, "sr%d", minor);
638 disk->fops = &sr_bdops; 638 disk->fops = &sr_bdops;
639 disk->flags = GENHD_FL_CD; 639 disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
640 disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; 640 disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
641 641
642 blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT); 642 blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bf9c7a720371..1f2b19978333 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1238,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1238 res = __blkdev_get(bdev, mode, 0); 1238 res = __blkdev_get(bdev, mode, 0);
1239 1239
1240 if (whole) { 1240 if (whole) {
1241 struct gendisk *disk = whole->bd_disk;
1242
1241 /* finish claiming */ 1243 /* finish claiming */
1242 mutex_lock(&bdev->bd_mutex); 1244 mutex_lock(&bdev->bd_mutex);
1243 spin_lock(&bdev_lock); 1245 spin_lock(&bdev_lock);
@@ -1264,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1264 spin_unlock(&bdev_lock); 1266 spin_unlock(&bdev_lock);
1265 1267
1266 /* 1268 /*
1267 * Block event polling for write claims. Any write 1269 * Block event polling for write claims if requested. Any
1268 * holder makes the write_holder state stick until all 1270 * write holder makes the write_holder state stick until
1269 * are released. This is good enough and tracking 1271 * all are released. This is good enough and tracking
1270 * individual writeable reference is too fragile given 1272 * individual writeable reference is too fragile given the
1271 * the way @mode is used in blkdev_get/put(). 1273 * way @mode is used in blkdev_get/put().
1272 */ 1274 */
1273 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { 1275 if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
1276 !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
1274 bdev->bd_write_holder = true; 1277 bdev->bd_write_holder = true;
1275 disk_block_events(bdev->bd_disk); 1278 disk_block_events(disk);
1276 } 1279 }
1277 1280
1278 mutex_unlock(&bdev->bd_mutex); 1281 mutex_unlock(&bdev->bd_mutex);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d545e97d99c3..8ed4d3433199 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
255 struct device_attribute *attr, char *buf) 255 struct device_attribute *attr, char *buf)
256{ 256{
257 struct hd_struct *p = dev_to_part(dev); 257 struct hd_struct *p = dev_to_part(dev);
258 return sprintf(buf, "%u\n", p->discard_alignment); 258 struct gendisk *disk = dev_to_disk(dev);
259
260 return sprintf(buf, "%u\n",
261 queue_limit_discard_alignment(&disk->queue->limits,
262 p->start_sect));
259} 263}
260 264
261ssize_t part_stat_show(struct device *dev, 265ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
449 p->start_sect = start; 453 p->start_sect = start;
450 p->alignment_offset = 454 p->alignment_offset =
451 queue_limit_alignment_offset(&disk->queue->limits, start); 455 queue_limit_alignment_offset(&disk->queue->limits, start);
452 p->discard_alignment =
453 queue_limit_discard_alignment(&disk->queue->limits, start);
454 p->nr_sects = len; 456 p->nr_sects = len;
455 p->partno = partno; 457 p->partno = partno;
456 p->policy = get_disk_ro(disk); 458 p->policy = get_disk_ro(disk);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index be50d9e70a7d..2a7cea53ca0d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -151,7 +151,6 @@ enum rq_flag_bits {
151 __REQ_IO_STAT, /* account I/O stat */ 151 __REQ_IO_STAT, /* account I/O stat */
152 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 152 __REQ_MIXED_MERGE, /* merge of different types, fail separately */
153 __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ 153 __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */
154 __REQ_ON_PLUG, /* on plug list */
155 __REQ_NR_BITS, /* stops here */ 154 __REQ_NR_BITS, /* stops here */
156}; 155};
157 156
@@ -192,6 +191,5 @@ enum rq_flag_bits {
192#define REQ_IO_STAT (1 << __REQ_IO_STAT) 191#define REQ_IO_STAT (1 << __REQ_IO_STAT)
193#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 192#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
194#define REQ_SECURE (1 << __REQ_SECURE) 193#define REQ_SECURE (1 << __REQ_SECURE)
195#define REQ_ON_PLUG (1 << __REQ_ON_PLUG)
196 194
197#endif /* __LINUX_BLK_TYPES_H */ 195#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2ad95fa1d130..ae9091a68480 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -257,7 +257,7 @@ struct queue_limits {
257 unsigned char misaligned; 257 unsigned char misaligned;
258 unsigned char discard_misaligned; 258 unsigned char discard_misaligned;
259 unsigned char cluster; 259 unsigned char cluster;
260 signed char discard_zeroes_data; 260 unsigned char discard_zeroes_data;
261}; 261};
262 262
263struct request_queue 263struct request_queue
@@ -364,6 +364,8 @@ struct request_queue
364 * for flush operations 364 * for flush operations
365 */ 365 */
366 unsigned int flush_flags; 366 unsigned int flush_flags;
367 unsigned int flush_not_queueable:1;
368 unsigned int flush_queue_delayed:1;
367 unsigned int flush_pending_idx:1; 369 unsigned int flush_pending_idx:1;
368 unsigned int flush_running_idx:1; 370 unsigned int flush_running_idx:1;
369 unsigned long flush_pending_since; 371 unsigned long flush_pending_since;
@@ -843,6 +845,7 @@ extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
843extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); 845extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
844extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); 846extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
845extern void blk_queue_flush(struct request_queue *q, unsigned int flush); 847extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
848extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
846extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 849extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
847 850
848extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); 851extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
@@ -1066,13 +1069,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
1066{ 1069{
1067 unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); 1070 unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1);
1068 1071
1072 if (!lim->max_discard_sectors)
1073 return 0;
1074
1069 return (lim->discard_granularity + lim->discard_alignment - alignment) 1075 return (lim->discard_granularity + lim->discard_alignment - alignment)
1070 & (lim->discard_granularity - 1); 1076 & (lim->discard_granularity - 1);
1071} 1077}
1072 1078
1073static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) 1079static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
1074{ 1080{
1075 if (q->limits.discard_zeroes_data == 1) 1081 if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
1076 return 1; 1082 return 1;
1077 1083
1078 return 0; 1084 return 0;
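
queue_limit_discard_alignment() now returns 0 outright when the device advertises no discard support (max_discard_sectors == 0), and queue_discard_zeroes_data() gets the same guard; when discard is supported, the existing modular formula is unchanged. A standalone evaluation of that formula on made-up sample limits:

#include <stdio.h>

struct limits {
	unsigned int max_discard_sectors;
	unsigned int discard_granularity;	/* bytes, power of two */
	unsigned int discard_alignment;		/* bytes */
};

/* Mirrors the updated helper: bail out early if discard is unsupported. */
static unsigned int discard_alignment(const struct limits *lim,
				      unsigned long long sector)
{
	unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1);

	if (!lim->max_discard_sectors)
		return 0;

	return (lim->discard_granularity + lim->discard_alignment - alignment)
		& (lim->discard_granularity - 1);
}

int main(void)
{
	struct limits none = { 0, 4096, 0 };		/* no discard support */
	struct limits thin = { 65536, 65536, 0 };	/* 64KiB granularity */

	printf("no discard    -> %u\n", discard_alignment(&none, 63));
	printf("64KiB granule -> %u\n", discard_alignment(&thin, 63));
	return 0;
}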
@@ -1111,6 +1117,11 @@ static inline unsigned int block_size(struct block_device *bdev)
1111 return bdev->bd_block_size; 1117 return bdev->bd_block_size;
1112} 1118}
1113 1119
1120static inline bool queue_flush_queueable(struct request_queue *q)
1121{
1122 return !q->flush_not_queueable;
1123}
1124
1114typedef struct {struct page *v;} Sector; 1125typedef struct {struct page *v;} Sector;
1115 1126
1116unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); 1127unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index d764a426e9fd..b78956b3c2e7 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -100,7 +100,6 @@ struct hd_struct {
100 sector_t start_sect; 100 sector_t start_sect;
101 sector_t nr_sects; 101 sector_t nr_sects;
102 sector_t alignment_offset; 102 sector_t alignment_offset;
103 unsigned int discard_alignment;
104 struct device __dev; 103 struct device __dev;
105 struct kobject *holder_dir; 104 struct kobject *holder_dir;
106 int policy, partno; 105 int policy, partno;
@@ -127,6 +126,7 @@ struct hd_struct {
127#define GENHD_FL_SUPPRESS_PARTITION_INFO 32 126#define GENHD_FL_SUPPRESS_PARTITION_INFO 32
128#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ 127#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
129#define GENHD_FL_NATIVE_CAPACITY 128 128#define GENHD_FL_NATIVE_CAPACITY 128
129#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256
130 130
131enum { 131enum {
132 DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ 132 DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index befc87531e4f..f032e6e1e09a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -63,10 +63,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
63 unsigned long background_thresh; 63 unsigned long background_thresh;
64 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
65 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 66 unsigned long nr_dirty, nr_io, nr_more_io;
67 struct inode *inode; 67 struct inode *inode;
68 68
69 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 70 spin_lock(&inode_wb_list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 72 nr_dirty++;