 Documentation/ABI/testing/sysfs-block |  64
 block/blk-cgroup.c                    | 200
 block/blk-cgroup.h                    |  40
 block/blk-core.c                      |  32
 block/blk-exec.c                      |   2
 block/blk-flush.c                     |  16
 block/blk-ioc.c                       |   3
 block/blk-lib.c                       |  82
 block/blk-settings.c                  |   9
 block/blk-sysfs.c                     |   3
 block/blk-throttle.c                  | 313
 block/blk.h                           |  23
 block/cfq-iosched.c                   | 232
 block/elevator.c                      |  11
 drivers/ata/libata-scsi.c             |  13
 drivers/block/paride/pcd.c            |   2
 drivers/cdrom/viocd.c                 |   4
 drivers/ide/ide-cd.c                  |   3
 drivers/scsi/sr.c                     |   2
 fs/block_dev.c                        |  17
 fs/partitions/check.c                 |   8
 include/linux/blk_types.h             |   2
 include/linux/blkdev.h                |  15
 include/linux/genhd.h                 |   2
 mm/backing-dev.c                      |   4
 25 files changed, 785 insertions(+), 317 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 4873c759d535..c1eb41cb9876 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -142,3 +142,67 @@ Description:
 		with the previous I/O request are enabled. When set to 2,
 		all merge tries are disabled. The default value is 0 -
 		which enables all types of merge tries.
+
+What:		/sys/block/<disk>/discard_alignment
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may
+		internally allocate space in units that are bigger than
+		the exported logical block size. The discard_alignment
+		parameter indicates how many bytes the beginning of the
+		device is offset from the internal allocation unit's
+		natural alignment.
+
+What:		/sys/block/<disk>/<partition>/discard_alignment
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may
+		internally allocate space in units that are bigger than
+		the exported logical block size. The discard_alignment
+		parameter indicates how many bytes the beginning of the
+		partition is offset from the internal allocation unit's
+		natural alignment.
+
+What:		/sys/block/<disk>/queue/discard_granularity
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may
+		internally allocate space using units that are bigger
+		than the logical block size. The discard_granularity
+		parameter indicates the size of the internal allocation
+		unit in bytes if reported by the device. Otherwise the
+		discard_granularity will be set to match the device's
+		physical block size. A discard_granularity of 0 means
+		that the device does not support discard functionality.
+
+What:		/sys/block/<disk>/queue/discard_max_bytes
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may have
+		internal limits on the number of bytes that can be
+		trimmed or unmapped in a single operation. Some storage
+		protocols also have inherent limits on the number of
+		blocks that can be described in a single command. The
+		discard_max_bytes parameter is set by the device driver
+		to the maximum number of bytes that can be discarded in
+		a single operation. Discard requests issued to the
+		device must not exceed this limit. A discard_max_bytes
+		value of 0 means that the device does not support
+		discard functionality.
+
+What:		/sys/block/<disk>/queue/discard_zeroes_data
+Date:		May 2011
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Devices that support discard functionality may return
+		stale or random data when a previously discarded block
+		is read back. This can cause problems if the filesystem
+		expects discarded blocks to be explicitly cleared. If a
+		device reports that it deterministically returns zeroes
+		when a discarded area is read the discard_zeroes_data
+		parameter will be set to one. Otherwise it will be 0 and
+		the result of reading a discarded area is undefined.
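The four queue attributes documented above can be read like any other sysfs file. A minimal userspace sketch, assuming a disk named sda (the device name and the read_u64() helper are illustrative, not part of the patch):

/* Minimal sketch: query the discard limits documented above for one disk. */
#include <stdio.h>

static long long read_u64(const char *path)
{
	FILE *f = fopen(path, "r");
	long long v = -1;

	if (f) {
		if (fscanf(f, "%lld", &v) != 1)
			v = -1;
		fclose(f);
	}
	return v;
}

int main(void)
{
	long long granularity = read_u64("/sys/block/sda/queue/discard_granularity");
	long long max_bytes   = read_u64("/sys/block/sda/queue/discard_max_bytes");
	long long zeroes      = read_u64("/sys/block/sda/queue/discard_zeroes_data");

	if (granularity <= 0 || max_bytes <= 0) {
		/* 0 is the documented "no discard support" case. */
		printf("sda: discard not supported\n");
		return 0;
	}

	printf("sda: granularity %lld bytes, up to %lld bytes per discard, "
	       "discard_zeroes_data=%lld\n", granularity, max_bytes, zeroes);
	return 0;
}

blkdev_issue_discard() itself honours the per-command limit by splitting large requests at max_discard_sectors, as the blk-lib.c hunk further down shows.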
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 471fdcc5df85..07371cfdfae6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -385,25 +385,40 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, | |||
385 | 385 | ||
386 | spin_lock_irqsave(&blkg->stats_lock, flags); | 386 | spin_lock_irqsave(&blkg->stats_lock, flags); |
387 | blkg->stats.time += time; | 387 | blkg->stats.time += time; |
388 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
388 | blkg->stats.unaccounted_time += unaccounted_time; | 389 | blkg->stats.unaccounted_time += unaccounted_time; |
390 | #endif | ||
389 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 391 | spin_unlock_irqrestore(&blkg->stats_lock, flags); |
390 | } | 392 | } |
391 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | 393 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); |
392 | 394 | ||
395 | /* | ||
396 | * should be called under rcu read lock or queue lock to make sure blkg pointer | ||
397 | * is valid. | ||
398 | */ | ||
393 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | 399 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
394 | uint64_t bytes, bool direction, bool sync) | 400 | uint64_t bytes, bool direction, bool sync) |
395 | { | 401 | { |
396 | struct blkio_group_stats *stats; | 402 | struct blkio_group_stats_cpu *stats_cpu; |
397 | unsigned long flags; | 403 | unsigned long flags; |
398 | 404 | ||
399 | spin_lock_irqsave(&blkg->stats_lock, flags); | 405 | /* |
400 | stats = &blkg->stats; | 406 | * Disabling interrupts to provide mutual exclusion between two |
401 | stats->sectors += bytes >> 9; | 407 | * writes on same cpu. It probably is not needed for 64bit. Not |
402 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, | 408 | * optimizing that case yet. |
403 | sync); | 409 | */ |
404 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, | 410 | local_irq_save(flags); |
405 | direction, sync); | 411 | |
406 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 412 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); |
413 | |||
414 | u64_stats_update_begin(&stats_cpu->syncp); | ||
415 | stats_cpu->sectors += bytes >> 9; | ||
416 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], | ||
417 | 1, direction, sync); | ||
418 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], | ||
419 | bytes, direction, sync); | ||
420 | u64_stats_update_end(&stats_cpu->syncp); | ||
421 | local_irq_restore(flags); | ||
407 | } | 422 | } |
408 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); | 423 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); |
409 | 424 | ||
@@ -426,18 +441,44 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, | |||
426 | } | 441 | } |
427 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); | 442 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); |
428 | 443 | ||
444 | /* Merged stats are per cpu. */ | ||
429 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | 445 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, |
430 | bool sync) | 446 | bool sync) |
431 | { | 447 | { |
448 | struct blkio_group_stats_cpu *stats_cpu; | ||
432 | unsigned long flags; | 449 | unsigned long flags; |
433 | 450 | ||
434 | spin_lock_irqsave(&blkg->stats_lock, flags); | 451 | /* |
435 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, | 452 | * Disabling interrupts to provide mutual exclusion between two |
436 | sync); | 453 | * writes on same cpu. It probably is not needed for 64bit. Not |
437 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 454 | * optimizing that case yet. |
455 | */ | ||
456 | local_irq_save(flags); | ||
457 | |||
458 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); | ||
459 | |||
460 | u64_stats_update_begin(&stats_cpu->syncp); | ||
461 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, | ||
462 | direction, sync); | ||
463 | u64_stats_update_end(&stats_cpu->syncp); | ||
464 | local_irq_restore(flags); | ||
438 | } | 465 | } |
439 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); | 466 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); |
440 | 467 | ||
468 | /* | ||
469 | * This function allocates the per cpu stats for blkio_group. Should be called | ||
470 | * from sleepable context as alloc_per_cpu() requires that. | ||
471 | */ | ||
472 | int blkio_alloc_blkg_stats(struct blkio_group *blkg) | ||
473 | { | ||
474 | /* Allocate memory for per cpu stats */ | ||
475 | blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); | ||
476 | if (!blkg->stats_cpu) | ||
477 | return -ENOMEM; | ||
478 | return 0; | ||
479 | } | ||
480 | EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); | ||
481 | |||
441 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 482 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
442 | struct blkio_group *blkg, void *key, dev_t dev, | 483 | struct blkio_group *blkg, void *key, dev_t dev, |
443 | enum blkio_policy_id plid) | 484 | enum blkio_policy_id plid) |
@@ -508,6 +549,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) | |||
508 | } | 549 | } |
509 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); | 550 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); |
510 | 551 | ||
552 | static void blkio_reset_stats_cpu(struct blkio_group *blkg) | ||
553 | { | ||
554 | struct blkio_group_stats_cpu *stats_cpu; | ||
555 | int i, j, k; | ||
556 | /* | ||
557 | * Note: On 64 bit arch this should not be an issue. This has the | ||
558 | * possibility of returning some inconsistent value on 32bit arch | ||
559 | * as 64bit update on 32bit is non atomic. Taking care of this | ||
560 | * corner case makes code very complicated, like sending IPIs to | ||
561 | * cpus, taking care of stats of offline cpus etc. | ||
562 | * | ||
563 | * reset stats is anyway more of a debug feature and this sounds a | ||
564 | * corner case. So I am not complicating the code yet until and | ||
565 | * unless this becomes a real issue. | ||
566 | */ | ||
567 | for_each_possible_cpu(i) { | ||
568 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); | ||
569 | stats_cpu->sectors = 0; | ||
570 | for(j = 0; j < BLKIO_STAT_CPU_NR; j++) | ||
571 | for (k = 0; k < BLKIO_STAT_TOTAL; k++) | ||
572 | stats_cpu->stat_arr_cpu[j][k] = 0; | ||
573 | } | ||
574 | } | ||
575 | |||
511 | static int | 576 | static int |
512 | blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) | 577 | blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) |
513 | { | 578 | { |
@@ -552,7 +617,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
552 | } | 617 | } |
553 | #endif | 618 | #endif |
554 | spin_unlock(&blkg->stats_lock); | 619 | spin_unlock(&blkg->stats_lock); |
620 | |||
621 | /* Reset Per cpu stats which don't take blkg->stats_lock */ | ||
622 | blkio_reset_stats_cpu(blkg); | ||
555 | } | 623 | } |
624 | |||
556 | spin_unlock_irq(&blkcg->lock); | 625 | spin_unlock_irq(&blkcg->lock); |
557 | return 0; | 626 | return 0; |
558 | } | 627 | } |
@@ -598,6 +667,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, | |||
598 | return val; | 667 | return val; |
599 | } | 668 | } |
600 | 669 | ||
670 | |||
671 | static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, | ||
672 | enum stat_type_cpu type, enum stat_sub_type sub_type) | ||
673 | { | ||
674 | int cpu; | ||
675 | struct blkio_group_stats_cpu *stats_cpu; | ||
676 | u64 val = 0, tval; | ||
677 | |||
678 | for_each_possible_cpu(cpu) { | ||
679 | unsigned int start; | ||
680 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); | ||
681 | |||
682 | do { | ||
683 | start = u64_stats_fetch_begin(&stats_cpu->syncp); | ||
684 | if (type == BLKIO_STAT_CPU_SECTORS) | ||
685 | tval = stats_cpu->sectors; | ||
686 | else | ||
687 | tval = stats_cpu->stat_arr_cpu[type][sub_type]; | ||
688 | } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); | ||
689 | |||
690 | val += tval; | ||
691 | } | ||
692 | |||
693 | return val; | ||
694 | } | ||
695 | |||
696 | static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, | ||
697 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) | ||
698 | { | ||
699 | uint64_t disk_total, val; | ||
700 | char key_str[MAX_KEY_LEN]; | ||
701 | enum stat_sub_type sub_type; | ||
702 | |||
703 | if (type == BLKIO_STAT_CPU_SECTORS) { | ||
704 | val = blkio_read_stat_cpu(blkg, type, 0); | ||
705 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); | ||
706 | } | ||
707 | |||
708 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | ||
709 | sub_type++) { | ||
710 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
711 | val = blkio_read_stat_cpu(blkg, type, sub_type); | ||
712 | cb->fill(cb, key_str, val); | ||
713 | } | ||
714 | |||
715 | disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + | ||
716 | blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); | ||
717 | |||
718 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
719 | cb->fill(cb, key_str, disk_total); | ||
720 | return disk_total; | ||
721 | } | ||
722 | |||
601 | /* This should be called with blkg->stats_lock held */ | 723 | /* This should be called with blkg->stats_lock held */ |
602 | static uint64_t blkio_get_stat(struct blkio_group *blkg, | 724 | static uint64_t blkio_get_stat(struct blkio_group *blkg, |
603 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) | 725 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) |
@@ -609,9 +731,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, | |||
609 | if (type == BLKIO_STAT_TIME) | 731 | if (type == BLKIO_STAT_TIME) |
610 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | 732 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, |
611 | blkg->stats.time, cb, dev); | 733 | blkg->stats.time, cb, dev); |
612 | if (type == BLKIO_STAT_SECTORS) | ||
613 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
614 | blkg->stats.sectors, cb, dev); | ||
615 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 734 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
616 | if (type == BLKIO_STAT_UNACCOUNTED_TIME) | 735 | if (type == BLKIO_STAT_UNACCOUNTED_TIME) |
617 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | 736 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, |
@@ -1075,8 +1194,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, | |||
1075 | } | 1194 | } |
1076 | 1195 | ||
1077 | static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, | 1196 | static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, |
1078 | struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, | 1197 | struct cftype *cft, struct cgroup_map_cb *cb, |
1079 | bool show_total) | 1198 | enum stat_type type, bool show_total, bool pcpu) |
1080 | { | 1199 | { |
1081 | struct blkio_group *blkg; | 1200 | struct blkio_group *blkg; |
1082 | struct hlist_node *n; | 1201 | struct hlist_node *n; |
@@ -1087,10 +1206,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, | |||
1087 | if (blkg->dev) { | 1206 | if (blkg->dev) { |
1088 | if (!cftype_blkg_same_policy(cft, blkg)) | 1207 | if (!cftype_blkg_same_policy(cft, blkg)) |
1089 | continue; | 1208 | continue; |
1090 | spin_lock_irq(&blkg->stats_lock); | 1209 | if (pcpu) |
1091 | cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, | 1210 | cgroup_total += blkio_get_stat_cpu(blkg, cb, |
1092 | type); | 1211 | blkg->dev, type); |
1093 | spin_unlock_irq(&blkg->stats_lock); | 1212 | else { |
1213 | spin_lock_irq(&blkg->stats_lock); | ||
1214 | cgroup_total += blkio_get_stat(blkg, cb, | ||
1215 | blkg->dev, type); | ||
1216 | spin_unlock_irq(&blkg->stats_lock); | ||
1217 | } | ||
1094 | } | 1218 | } |
1095 | } | 1219 | } |
1096 | if (show_total) | 1220 | if (show_total) |
@@ -1114,47 +1238,47 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, | |||
1114 | switch(name) { | 1238 | switch(name) { |
1115 | case BLKIO_PROP_time: | 1239 | case BLKIO_PROP_time: |
1116 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1240 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1117 | BLKIO_STAT_TIME, 0); | 1241 | BLKIO_STAT_TIME, 0, 0); |
1118 | case BLKIO_PROP_sectors: | 1242 | case BLKIO_PROP_sectors: |
1119 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1243 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1120 | BLKIO_STAT_SECTORS, 0); | 1244 | BLKIO_STAT_CPU_SECTORS, 0, 1); |
1121 | case BLKIO_PROP_io_service_bytes: | 1245 | case BLKIO_PROP_io_service_bytes: |
1122 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1246 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1123 | BLKIO_STAT_SERVICE_BYTES, 1); | 1247 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); |
1124 | case BLKIO_PROP_io_serviced: | 1248 | case BLKIO_PROP_io_serviced: |
1125 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1249 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1126 | BLKIO_STAT_SERVICED, 1); | 1250 | BLKIO_STAT_CPU_SERVICED, 1, 1); |
1127 | case BLKIO_PROP_io_service_time: | 1251 | case BLKIO_PROP_io_service_time: |
1128 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1252 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1129 | BLKIO_STAT_SERVICE_TIME, 1); | 1253 | BLKIO_STAT_SERVICE_TIME, 1, 0); |
1130 | case BLKIO_PROP_io_wait_time: | 1254 | case BLKIO_PROP_io_wait_time: |
1131 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1255 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1132 | BLKIO_STAT_WAIT_TIME, 1); | 1256 | BLKIO_STAT_WAIT_TIME, 1, 0); |
1133 | case BLKIO_PROP_io_merged: | 1257 | case BLKIO_PROP_io_merged: |
1134 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1258 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1135 | BLKIO_STAT_MERGED, 1); | 1259 | BLKIO_STAT_CPU_MERGED, 1, 1); |
1136 | case BLKIO_PROP_io_queued: | 1260 | case BLKIO_PROP_io_queued: |
1137 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1261 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1138 | BLKIO_STAT_QUEUED, 1); | 1262 | BLKIO_STAT_QUEUED, 1, 0); |
1139 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1263 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1140 | case BLKIO_PROP_unaccounted_time: | 1264 | case BLKIO_PROP_unaccounted_time: |
1141 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1265 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1142 | BLKIO_STAT_UNACCOUNTED_TIME, 0); | 1266 | BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); |
1143 | case BLKIO_PROP_dequeue: | 1267 | case BLKIO_PROP_dequeue: |
1144 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1268 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1145 | BLKIO_STAT_DEQUEUE, 0); | 1269 | BLKIO_STAT_DEQUEUE, 0, 0); |
1146 | case BLKIO_PROP_avg_queue_size: | 1270 | case BLKIO_PROP_avg_queue_size: |
1147 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1271 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1148 | BLKIO_STAT_AVG_QUEUE_SIZE, 0); | 1272 | BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); |
1149 | case BLKIO_PROP_group_wait_time: | 1273 | case BLKIO_PROP_group_wait_time: |
1150 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1274 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1151 | BLKIO_STAT_GROUP_WAIT_TIME, 0); | 1275 | BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); |
1152 | case BLKIO_PROP_idle_time: | 1276 | case BLKIO_PROP_idle_time: |
1153 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1277 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1154 | BLKIO_STAT_IDLE_TIME, 0); | 1278 | BLKIO_STAT_IDLE_TIME, 0, 0); |
1155 | case BLKIO_PROP_empty_time: | 1279 | case BLKIO_PROP_empty_time: |
1156 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1280 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1157 | BLKIO_STAT_EMPTY_TIME, 0); | 1281 | BLKIO_STAT_EMPTY_TIME, 0, 0); |
1158 | #endif | 1282 | #endif |
1159 | default: | 1283 | default: |
1160 | BUG(); | 1284 | BUG(); |
@@ -1164,10 +1288,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, | |||
1164 | switch(name){ | 1288 | switch(name){ |
1165 | case BLKIO_THROTL_io_service_bytes: | 1289 | case BLKIO_THROTL_io_service_bytes: |
1166 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1290 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1167 | BLKIO_STAT_SERVICE_BYTES, 1); | 1291 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); |
1168 | case BLKIO_THROTL_io_serviced: | 1292 | case BLKIO_THROTL_io_serviced: |
1169 | return blkio_read_blkg_stats(blkcg, cft, cb, | 1293 | return blkio_read_blkg_stats(blkcg, cft, cb, |
1170 | BLKIO_STAT_SERVICED, 1); | 1294 | BLKIO_STAT_CPU_SERVICED, 1, 1); |
1171 | default: | 1295 | default: |
1172 | BUG(); | 1296 | BUG(); |
1173 | } | 1297 | } |
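The blk-cgroup.c changes above move the dispatch, service-byte, sector and merge counters out of the spinlock-protected blkio_group_stats and into per-cpu blkio_group_stats_cpu entries guarded by u64_stats_sync: writers bump a sequence counter around each update, and readers sum all CPUs, retrying any slot a writer touched meanwhile. A rough userspace sketch of that pattern only, with invented names (pcpu_stat, NR_SLOTS); it is not the kernel u64_stats_sync API:

/* Userspace sketch of the per-cpu stat pattern adopted above. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SLOTS 4	/* stands in for the number of possible CPUs */

struct pcpu_stat {
	atomic_uint seq;	/* even: stable, odd: update in progress */
	uint64_t sectors;
	uint64_t service_bytes;
};

static struct pcpu_stat stats[NR_SLOTS];

/* Writer side: assumes a single writer per slot, which the kernel gets from
 * per-cpu data plus disabled interrupts. */
static void stat_add(struct pcpu_stat *s, uint64_t bytes)
{
	atomic_fetch_add(&s->seq, 1);	/* like u64_stats_update_begin() */
	s->sectors += bytes >> 9;
	s->service_bytes += bytes;
	atomic_fetch_add(&s->seq, 1);	/* like u64_stats_update_end() */
}

/* Reader side: like blkio_read_stat_cpu(), sum over all slots and retry a
 * slot if a writer was active while it was sampled. */
static uint64_t read_total_bytes(void)
{
	uint64_t total = 0;

	for (int i = 0; i < NR_SLOTS; i++) {
		unsigned int start;
		uint64_t val;

		do {
			start = atomic_load(&stats[i].seq);
			val = stats[i].service_bytes;
		} while ((start & 1) || atomic_load(&stats[i].seq) != start);

		total += val;
	}
	return total;
}

int main(void)
{
	stat_add(&stats[0], 4096);
	stat_add(&stats[1], 8192);
	printf("service_bytes total: %llu\n",
	       (unsigned long long)read_total_bytes());
	return 0;
}

The point of the design is that the hot update path touches only CPU-local memory instead of taking blkg->stats_lock on every dispatched IO; the comparatively rare read and reset paths pay the cost of walking all CPUs.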
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c774930cc206..a71d2904ffb9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/u64_stats_sync.h> | ||
17 | 18 | ||
18 | enum blkio_policy_id { | 19 | enum blkio_policy_id { |
19 | BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ | 20 | BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ |
@@ -36,22 +37,15 @@ enum stat_type { | |||
36 | * request completion for IOs doen by this cgroup. This may not be | 37 | * request completion for IOs doen by this cgroup. This may not be |
37 | * accurate when NCQ is turned on. */ | 38 | * accurate when NCQ is turned on. */ |
38 | BLKIO_STAT_SERVICE_TIME = 0, | 39 | BLKIO_STAT_SERVICE_TIME = 0, |
39 | /* Total bytes transferred */ | ||
40 | BLKIO_STAT_SERVICE_BYTES, | ||
41 | /* Total IOs serviced, post merge */ | ||
42 | BLKIO_STAT_SERVICED, | ||
43 | /* Total time spent waiting in scheduler queue in ns */ | 40 | /* Total time spent waiting in scheduler queue in ns */ |
44 | BLKIO_STAT_WAIT_TIME, | 41 | BLKIO_STAT_WAIT_TIME, |
45 | /* Number of IOs merged */ | ||
46 | BLKIO_STAT_MERGED, | ||
47 | /* Number of IOs queued up */ | 42 | /* Number of IOs queued up */ |
48 | BLKIO_STAT_QUEUED, | 43 | BLKIO_STAT_QUEUED, |
49 | /* All the single valued stats go below this */ | 44 | /* All the single valued stats go below this */ |
50 | BLKIO_STAT_TIME, | 45 | BLKIO_STAT_TIME, |
51 | BLKIO_STAT_SECTORS, | 46 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
52 | /* Time not charged to this cgroup */ | 47 | /* Time not charged to this cgroup */ |
53 | BLKIO_STAT_UNACCOUNTED_TIME, | 48 | BLKIO_STAT_UNACCOUNTED_TIME, |
54 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
55 | BLKIO_STAT_AVG_QUEUE_SIZE, | 49 | BLKIO_STAT_AVG_QUEUE_SIZE, |
56 | BLKIO_STAT_IDLE_TIME, | 50 | BLKIO_STAT_IDLE_TIME, |
57 | BLKIO_STAT_EMPTY_TIME, | 51 | BLKIO_STAT_EMPTY_TIME, |
@@ -60,6 +54,18 @@ enum stat_type { | |||
60 | #endif | 54 | #endif |
61 | }; | 55 | }; |
62 | 56 | ||
57 | /* Per cpu stats */ | ||
58 | enum stat_type_cpu { | ||
59 | BLKIO_STAT_CPU_SECTORS, | ||
60 | /* Total bytes transferred */ | ||
61 | BLKIO_STAT_CPU_SERVICE_BYTES, | ||
62 | /* Total IOs serviced, post merge */ | ||
63 | BLKIO_STAT_CPU_SERVICED, | ||
64 | /* Number of IOs merged */ | ||
65 | BLKIO_STAT_CPU_MERGED, | ||
66 | BLKIO_STAT_CPU_NR | ||
67 | }; | ||
68 | |||
63 | enum stat_sub_type { | 69 | enum stat_sub_type { |
64 | BLKIO_STAT_READ = 0, | 70 | BLKIO_STAT_READ = 0, |
65 | BLKIO_STAT_WRITE, | 71 | BLKIO_STAT_WRITE, |
@@ -116,11 +122,11 @@ struct blkio_cgroup { | |||
116 | struct blkio_group_stats { | 122 | struct blkio_group_stats { |
117 | /* total disk time and nr sectors dispatched by this group */ | 123 | /* total disk time and nr sectors dispatched by this group */ |
118 | uint64_t time; | 124 | uint64_t time; |
119 | uint64_t sectors; | ||
120 | /* Time not charged to this cgroup */ | ||
121 | uint64_t unaccounted_time; | ||
122 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; | 125 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; |
123 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 126 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
127 | /* Time not charged to this cgroup */ | ||
128 | uint64_t unaccounted_time; | ||
129 | |||
124 | /* Sum of number of IOs queued across all samples */ | 130 | /* Sum of number of IOs queued across all samples */ |
125 | uint64_t avg_queue_size_sum; | 131 | uint64_t avg_queue_size_sum; |
126 | /* Count of samples taken for average */ | 132 | /* Count of samples taken for average */ |
@@ -145,6 +151,13 @@ struct blkio_group_stats { | |||
145 | #endif | 151 | #endif |
146 | }; | 152 | }; |
147 | 153 | ||
154 | /* Per cpu blkio group stats */ | ||
155 | struct blkio_group_stats_cpu { | ||
156 | uint64_t sectors; | ||
157 | uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; | ||
158 | struct u64_stats_sync syncp; | ||
159 | }; | ||
160 | |||
148 | struct blkio_group { | 161 | struct blkio_group { |
149 | /* An rcu protected unique identifier for the group */ | 162 | /* An rcu protected unique identifier for the group */ |
150 | void *key; | 163 | void *key; |
@@ -160,6 +173,8 @@ struct blkio_group { | |||
160 | /* Need to serialize the stats in the case of reset/update */ | 173 | /* Need to serialize the stats in the case of reset/update */ |
161 | spinlock_t stats_lock; | 174 | spinlock_t stats_lock; |
162 | struct blkio_group_stats stats; | 175 | struct blkio_group_stats stats; |
176 | /* Per cpu stats pointer */ | ||
177 | struct blkio_group_stats_cpu __percpu *stats_cpu; | ||
163 | }; | 178 | }; |
164 | 179 | ||
165 | struct blkio_policy_node { | 180 | struct blkio_policy_node { |
@@ -295,6 +310,7 @@ extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); | |||
295 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 310 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
296 | struct blkio_group *blkg, void *key, dev_t dev, | 311 | struct blkio_group *blkg, void *key, dev_t dev, |
297 | enum blkio_policy_id plid); | 312 | enum blkio_policy_id plid); |
313 | extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); | ||
298 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | 314 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); |
299 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | 315 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, |
300 | void *key); | 316 | void *key); |
@@ -322,6 +338,8 @@ static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | |||
322 | struct blkio_group *blkg, void *key, dev_t dev, | 338 | struct blkio_group *blkg, void *key, dev_t dev, |
323 | enum blkio_policy_id plid) {} | 339 | enum blkio_policy_id plid) {} |
324 | 340 | ||
341 | static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } | ||
342 | |||
325 | static inline int | 343 | static inline int |
326 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | 344 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } |
327 | 345 | ||
diff --git a/block/blk-core.c b/block/blk-core.c
index 3fe00a14822a..c8303e9d919d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -569,8 +569,6 @@ int blk_get_queue(struct request_queue *q)
 
 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
-	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
 	if (rq->cmd_flags & REQ_ELVPRIV)
 		elv_put_request(q, rq);
 	mempool_free(rq, q->rq.rq_pool);
@@ -1110,14 +1108,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 {
 	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 
-	/*
-	 * Debug stuff, kill later
-	 */
-	if (!rq_mergeable(req)) {
-		blk_dump_rq_flags(req, "back");
-		return false;
-	}
-
 	if (!ll_back_merge_fn(q, req, bio))
 		return false;
 
@@ -1132,6 +1122,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
 	drive_stat_acct(req, 0);
+	elv_bio_merged(q, req, bio);
 	return true;
 }
 
@@ -1141,14 +1132,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
 	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	sector_t sector;
 
-	/*
-	 * Debug stuff, kill later
-	 */
-	if (!rq_mergeable(req)) {
-		blk_dump_rq_flags(req, "front");
-		return false;
-	}
-
 	if (!ll_front_merge_fn(q, req, bio))
 		return false;
 
@@ -1173,6 +1156,7 @@ static bool bio_attempt_front_merge(struct request_queue *q,
 	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
 	drive_stat_acct(req, 0);
+	elv_bio_merged(q, req, bio);
 	return true;
 }
 
@@ -1258,14 +1242,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	el_ret = elv_merge(q, &req, bio);
 	if (el_ret == ELEVATOR_BACK_MERGE) {
-		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
 		if (bio_attempt_back_merge(q, req, bio)) {
 			if (!attempt_back_merge(q, req))
 				elv_merged_request(q, req, el_ret);
 			goto out_unlock;
 		}
 	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
-		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
 		if (bio_attempt_front_merge(q, req, bio)) {
 			if (!attempt_front_merge(q, req))
 				elv_merged_request(q, req, el_ret);
@@ -1320,10 +1302,6 @@ get_rq:
 			if (__rq->q != q)
 				plug->should_sort = 1;
 		}
-		/*
-		 * Debug flag, kill later
-		 */
-		req->cmd_flags |= REQ_ON_PLUG;
 		list_add_tail(&req->queuelist, &plug->list);
 		drive_stat_acct(req, 1);
 	} else {
@@ -1550,7 +1528,8 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
-		blk_throtl_bio(q, &bio);
+		if (blk_throtl_bio(q, &bio))
+			goto end_io;
 
 		/*
 		 * If bio = NULL, bio has been throttled and will be submitted
@@ -2748,7 +2727,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	while (!list_empty(&list)) {
 		rq = list_entry_rq(list.next);
 		list_del_init(&rq->queuelist);
-		BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
 		BUG_ON(!rq->q);
 		if (rq->q != q) {
 			/*
@@ -2760,8 +2738,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 			depth = 0;
 			spin_lock(q->queue_lock);
 		}
-		rq->cmd_flags &= ~REQ_ON_PLUG;
-
 		/*
 		 * rq is already accounted, so use raw insert
 		 */
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 81e31819a597..8a0e7ec056e7 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -56,7 +56,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	spin_lock_irq(q->queue_lock);
 	__elv_add_request(q, rq, where);
 	__blk_run_queue(q);
-	/* the queue is stopped so it won't be plugged+unplugged */
+	/* the queue is stopped so it won't be run */
 	if (rq->cmd_type == REQ_TYPE_PM_RESUME)
 		q->request_fn(q);
 	spin_unlock_irq(q->queue_lock);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 6c9b5e189e62..bb21e4c36f70 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error)
 	}
 
 	/*
-	 * Moving a request silently to empty queue_head may stall the
-	 * queue. Kick the queue in those cases. This function is called
-	 * from request completion path and calling directly into
-	 * request_fn may confuse the driver. Always use kblockd.
+	 * Kick the queue to avoid stall for two cases:
+	 * 1. Moving a request silently to empty queue_head may stall the
+	 * queue.
+	 * 2. When flush request is running in non-queueable queue, the
+	 * queue is hold. Restart the queue after flush request is finished
+	 * to avoid stall.
+	 * This function is called from request completion path and calling
+	 * directly into request_fn may confuse the driver. Always use
+	 * kblockd.
 	 */
-	if (queued)
+	if (queued || q->flush_queue_delayed)
 		blk_run_queue_async(q);
+	q->flush_queue_delayed = 0;
 }
 
 /**
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index b791022beef3..c898049dafd5 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -96,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
 		INIT_HLIST_HEAD(&ret->cic_list);
 		ret->ioc_data = NULL;
+#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
+		ret->cgroup_changed = 0;
+#endif
 	}
 
 	return ret;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 25de73e4759b..78e627e2581d 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,17 +9,20 @@ | |||
9 | 9 | ||
10 | #include "blk.h" | 10 | #include "blk.h" |
11 | 11 | ||
12 | static void blkdev_discard_end_io(struct bio *bio, int err) | 12 | struct bio_batch { |
13 | { | 13 | atomic_t done; |
14 | if (err) { | 14 | unsigned long flags; |
15 | if (err == -EOPNOTSUPP) | 15 | struct completion *wait; |
16 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 16 | }; |
17 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
18 | } | ||
19 | 17 | ||
20 | if (bio->bi_private) | 18 | static void bio_batch_end_io(struct bio *bio, int err) |
21 | complete(bio->bi_private); | 19 | { |
20 | struct bio_batch *bb = bio->bi_private; | ||
22 | 21 | ||
22 | if (err && (err != -EOPNOTSUPP)) | ||
23 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
24 | if (atomic_dec_and_test(&bb->done)) | ||
25 | complete(bb->wait); | ||
23 | bio_put(bio); | 26 | bio_put(bio); |
24 | } | 27 | } |
25 | 28 | ||
@@ -41,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
41 | struct request_queue *q = bdev_get_queue(bdev); | 44 | struct request_queue *q = bdev_get_queue(bdev); |
42 | int type = REQ_WRITE | REQ_DISCARD; | 45 | int type = REQ_WRITE | REQ_DISCARD; |
43 | unsigned int max_discard_sectors; | 46 | unsigned int max_discard_sectors; |
47 | struct bio_batch bb; | ||
44 | struct bio *bio; | 48 | struct bio *bio; |
45 | int ret = 0; | 49 | int ret = 0; |
46 | 50 | ||
@@ -67,7 +71,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
67 | type |= REQ_SECURE; | 71 | type |= REQ_SECURE; |
68 | } | 72 | } |
69 | 73 | ||
70 | while (nr_sects && !ret) { | 74 | atomic_set(&bb.done, 1); |
75 | bb.flags = 1 << BIO_UPTODATE; | ||
76 | bb.wait = &wait; | ||
77 | |||
78 | while (nr_sects) { | ||
71 | bio = bio_alloc(gfp_mask, 1); | 79 | bio = bio_alloc(gfp_mask, 1); |
72 | if (!bio) { | 80 | if (!bio) { |
73 | ret = -ENOMEM; | 81 | ret = -ENOMEM; |
@@ -75,9 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
75 | } | 83 | } |
76 | 84 | ||
77 | bio->bi_sector = sector; | 85 | bio->bi_sector = sector; |
78 | bio->bi_end_io = blkdev_discard_end_io; | 86 | bio->bi_end_io = bio_batch_end_io; |
79 | bio->bi_bdev = bdev; | 87 | bio->bi_bdev = bdev; |
80 | bio->bi_private = &wait; | 88 | bio->bi_private = &bb; |
81 | 89 | ||
82 | if (nr_sects > max_discard_sectors) { | 90 | if (nr_sects > max_discard_sectors) { |
83 | bio->bi_size = max_discard_sectors << 9; | 91 | bio->bi_size = max_discard_sectors << 9; |
@@ -88,45 +96,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
88 | nr_sects = 0; | 96 | nr_sects = 0; |
89 | } | 97 | } |
90 | 98 | ||
91 | bio_get(bio); | 99 | atomic_inc(&bb.done); |
92 | submit_bio(type, bio); | 100 | submit_bio(type, bio); |
101 | } | ||
93 | 102 | ||
103 | /* Wait for bios in-flight */ | ||
104 | if (!atomic_dec_and_test(&bb.done)) | ||
94 | wait_for_completion(&wait); | 105 | wait_for_completion(&wait); |
95 | 106 | ||
96 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 107 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
97 | ret = -EOPNOTSUPP; | 108 | ret = -EIO; |
98 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
99 | ret = -EIO; | ||
100 | bio_put(bio); | ||
101 | } | ||
102 | 109 | ||
103 | return ret; | 110 | return ret; |
104 | } | 111 | } |
105 | EXPORT_SYMBOL(blkdev_issue_discard); | 112 | EXPORT_SYMBOL(blkdev_issue_discard); |
106 | 113 | ||
107 | struct bio_batch | ||
108 | { | ||
109 | atomic_t done; | ||
110 | unsigned long flags; | ||
111 | struct completion *wait; | ||
112 | }; | ||
113 | |||
114 | static void bio_batch_end_io(struct bio *bio, int err) | ||
115 | { | ||
116 | struct bio_batch *bb = bio->bi_private; | ||
117 | |||
118 | if (err) { | ||
119 | if (err == -EOPNOTSUPP) | ||
120 | set_bit(BIO_EOPNOTSUPP, &bb->flags); | ||
121 | else | ||
122 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
123 | } | ||
124 | if (bb) | ||
125 | if (atomic_dec_and_test(&bb->done)) | ||
126 | complete(bb->wait); | ||
127 | bio_put(bio); | ||
128 | } | ||
129 | |||
130 | /** | 114 | /** |
131 | * blkdev_issue_zeroout - generate number of zero filed write bios | 115 | * blkdev_issue_zeroout - generate number of zero filed write bios |
132 | * @bdev: blockdev to issue | 116 | * @bdev: blockdev to issue |
@@ -151,7 +135,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | |||
151 | bb.flags = 1 << BIO_UPTODATE; | 135 | bb.flags = 1 << BIO_UPTODATE; |
152 | bb.wait = &wait; | 136 | bb.wait = &wait; |
153 | 137 | ||
154 | submit: | ||
155 | ret = 0; | 138 | ret = 0; |
156 | while (nr_sects != 0) { | 139 | while (nr_sects != 0) { |
157 | bio = bio_alloc(gfp_mask, | 140 | bio = bio_alloc(gfp_mask, |
@@ -168,9 +151,6 @@ submit: | |||
168 | 151 | ||
169 | while (nr_sects != 0) { | 152 | while (nr_sects != 0) { |
170 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); | 153 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); |
171 | if (sz == 0) | ||
172 | /* bio has maximum size possible */ | ||
173 | break; | ||
174 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); | 154 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); |
175 | nr_sects -= ret >> 9; | 155 | nr_sects -= ret >> 9; |
176 | sector += ret >> 9; | 156 | sector += ret >> 9; |
@@ -190,16 +170,6 @@ submit: | |||
190 | /* One of bios in the batch was completed with error.*/ | 170 | /* One of bios in the batch was completed with error.*/ |
191 | ret = -EIO; | 171 | ret = -EIO; |
192 | 172 | ||
193 | if (ret) | ||
194 | goto out; | ||
195 | |||
196 | if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { | ||
197 | ret = -EOPNOTSUPP; | ||
198 | goto out; | ||
199 | } | ||
200 | if (nr_sects != 0) | ||
201 | goto submit; | ||
202 | out: | ||
203 | return ret; | 173 | return ret; |
204 | } | 174 | } |
205 | EXPORT_SYMBOL(blkdev_issue_zeroout); | 175 | EXPORT_SYMBOL(blkdev_issue_zeroout); |
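With this change blkdev_issue_discard() batches its bios the way blkdev_issue_zeroout() already did: the bio_batch counter starts at one for the submitter, each submitted bio takes another reference, every completion drops one, and the submitter sleeps only if completions are still outstanding after dropping its own reference. A hedged userspace sketch of just that counting scheme; the threading and names here are invented for illustration and this is not the kernel bio API:

/* Userspace sketch of the bio_batch completion counting used above. */
#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct batch {
	atomic_int done;	/* outstanding references */
	atomic_bool uptodate;	/* cleared if any completion reports an error */
	sem_t wait;
};

/* Plays the role of bio_batch_end_io(): drop one reference, record errors,
 * wake the waiter when the last reference goes away. */
static void batch_end_io(struct batch *bb, int err)
{
	if (err)
		atomic_store(&bb->uptodate, false);
	if (atomic_fetch_sub(&bb->done, 1) == 1)
		sem_post(&bb->wait);
}

static void *fake_completion(void *arg)
{
	batch_end_io(arg, 0);	/* simulate an asynchronous completion */
	return NULL;
}

int main(void)
{
	struct batch bb;
	pthread_t threads[3];

	atomic_init(&bb.done, 1);	/* the submitter's own reference */
	atomic_init(&bb.uptodate, true);
	sem_init(&bb.wait, 0, 0);

	/* "Submit" three asynchronous operations, one extra reference each. */
	for (int i = 0; i < 3; i++) {
		atomic_fetch_add(&bb.done, 1);
		pthread_create(&threads[i], NULL, fake_completion, &bb);
	}

	/* Drop the submitter's reference; wait only if work is in flight,
	 * mirroring "if (!atomic_dec_and_test(&bb.done)) wait_for_completion()". */
	if (atomic_fetch_sub(&bb.done, 1) != 1)
		sem_wait(&bb.wait);

	for (int i = 0; i < 3; i++)
		pthread_join(threads[i], NULL);

	printf("batch finished %s\n",
	       atomic_load(&bb.uptodate) ? "cleanly" : "with errors");
	return 0;
}

The initial extra reference is what prevents the waiter from racing an early completion: the counter cannot hit zero until the submitter has finished issuing everything and dropped its own reference.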
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1fa769293597..fa1eb0449a05 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
-	lim->discard_zeroes_data = -1;
+	lim->discard_zeroes_data = 1;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 
 	blk_set_default_limits(&q->limits);
 	blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
+	q->limits.discard_zeroes_data = 0;
 
 	/*
 	 * by default assume old behaviour and bounce for any highmem page
@@ -790,6 +791,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush)
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush);
 
+void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
+{
+	q->flush_not_queueable = !queueable;
+}
+EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
+
 static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index bd236313f35d..d935bd859c87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
 
 static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(q->limits.max_discard_sectors << 9, page);
+	return sprintf(page, "%llu\n",
+		       (unsigned long long)q->limits.max_discard_sectors << 9);
 }
 
 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
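The queue_discard_max_show() change above widens max_discard_sectors to 64 bits before shifting; since the field is a 32-bit quantity, shifting it by 9 in 32-bit arithmetic can wrap for large limits. That motivation is inferred from the cast rather than spelled out in the patch; a small standalone illustration:

#include <stdio.h>

int main(void)
{
	unsigned int max_discard_sectors = 0xffffffffu;	/* a very large limit */

	/* Shift performed in 32-bit arithmetic wraps around ... */
	unsigned long long wrapped = max_discard_sectors << 9;
	/* ... widening first, as the sysfs change does, keeps the full value. */
	unsigned long long widened = (unsigned long long)max_discard_sectors << 9;

	printf("wrapped: %llu bytes\nwidened: %llu bytes\n", wrapped, widened);
	return 0;
}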
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 252a81a306f7..a62be8d0dc1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -78,6 +78,8 @@ struct throtl_grp { | |||
78 | 78 | ||
79 | /* Some throttle limits got updated for the group */ | 79 | /* Some throttle limits got updated for the group */ |
80 | int limits_changed; | 80 | int limits_changed; |
81 | |||
82 | struct rcu_head rcu_head; | ||
81 | }; | 83 | }; |
82 | 84 | ||
83 | struct throtl_data | 85 | struct throtl_data |
@@ -88,7 +90,7 @@ struct throtl_data | |||
88 | /* service tree for active throtl groups */ | 90 | /* service tree for active throtl groups */ |
89 | struct throtl_rb_root tg_service_tree; | 91 | struct throtl_rb_root tg_service_tree; |
90 | 92 | ||
91 | struct throtl_grp root_tg; | 93 | struct throtl_grp *root_tg; |
92 | struct request_queue *queue; | 94 | struct request_queue *queue; |
93 | 95 | ||
94 | /* Total Number of queued bios on READ and WRITE lists */ | 96 | /* Total Number of queued bios on READ and WRITE lists */ |
@@ -151,56 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) | |||
151 | return tg; | 153 | return tg; |
152 | } | 154 | } |
153 | 155 | ||
154 | static void throtl_put_tg(struct throtl_grp *tg) | 156 | static void throtl_free_tg(struct rcu_head *head) |
155 | { | 157 | { |
156 | BUG_ON(atomic_read(&tg->ref) <= 0); | 158 | struct throtl_grp *tg; |
157 | if (!atomic_dec_and_test(&tg->ref)) | 159 | |
158 | return; | 160 | tg = container_of(head, struct throtl_grp, rcu_head); |
161 | free_percpu(tg->blkg.stats_cpu); | ||
159 | kfree(tg); | 162 | kfree(tg); |
160 | } | 163 | } |
161 | 164 | ||
162 | static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, | 165 | static void throtl_put_tg(struct throtl_grp *tg) |
163 | struct blkio_cgroup *blkcg) | ||
164 | { | 166 | { |
165 | struct throtl_grp *tg = NULL; | 167 | BUG_ON(atomic_read(&tg->ref) <= 0); |
166 | void *key = td; | 168 | if (!atomic_dec_and_test(&tg->ref)) |
167 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | 169 | return; |
168 | unsigned int major, minor; | ||
169 | 170 | ||
170 | /* | 171 | /* |
171 | * TODO: Speed up blkiocg_lookup_group() by maintaining a radix | 172 | * A group is freed in rcu manner. But having an rcu lock does not |
172 | * tree of blkg (instead of traversing through hash list all | 173 | * mean that one can access all the fields of blkg and assume these |
173 | * the time. | 174 | * are valid. For example, don't try to follow throtl_data and |
175 | * request queue links. | ||
176 | * | ||
177 | * Having a reference to blkg under an rcu allows acess to only | ||
178 | * values local to groups like group stats and group rate limits | ||
174 | */ | 179 | */ |
180 | call_rcu(&tg->rcu_head, throtl_free_tg); | ||
181 | } | ||
175 | 182 | ||
176 | /* | 183 | static void throtl_init_group(struct throtl_grp *tg) |
177 | * This is the common case when there are no blkio cgroups. | 184 | { |
178 | * Avoid lookup in this case | ||
179 | */ | ||
180 | if (blkcg == &blkio_root_cgroup) | ||
181 | tg = &td->root_tg; | ||
182 | else | ||
183 | tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
184 | |||
185 | /* Fill in device details for root group */ | ||
186 | if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | ||
187 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
188 | tg->blkg.dev = MKDEV(major, minor); | ||
189 | goto done; | ||
190 | } | ||
191 | |||
192 | if (tg) | ||
193 | goto done; | ||
194 | |||
195 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | ||
196 | if (!tg) | ||
197 | goto done; | ||
198 | |||
199 | INIT_HLIST_NODE(&tg->tg_node); | 185 | INIT_HLIST_NODE(&tg->tg_node); |
200 | RB_CLEAR_NODE(&tg->rb_node); | 186 | RB_CLEAR_NODE(&tg->rb_node); |
201 | bio_list_init(&tg->bio_lists[0]); | 187 | bio_list_init(&tg->bio_lists[0]); |
202 | bio_list_init(&tg->bio_lists[1]); | 188 | bio_list_init(&tg->bio_lists[1]); |
203 | td->limits_changed = false; | 189 | tg->limits_changed = false; |
190 | |||
191 | /* Practically unlimited BW */ | ||
192 | tg->bps[0] = tg->bps[1] = -1; | ||
193 | tg->iops[0] = tg->iops[1] = -1; | ||
204 | 194 | ||
205 | /* | 195 | /* |
206 | * Take the initial reference that will be released on destroy | 196 | * Take the initial reference that will be released on destroy |
@@ -209,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, | |||
209 | * exit or cgroup deletion path depending on who is exiting first. | 199 | * exit or cgroup deletion path depending on who is exiting first. |
210 | */ | 200 | */ |
211 | atomic_set(&tg->ref, 1); | 201 | atomic_set(&tg->ref, 1); |
202 | } | ||
203 | |||
204 | /* Should be called with rcu read lock held (needed for blkcg) */ | ||
205 | static void | ||
206 | throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) | ||
207 | { | ||
208 | hlist_add_head(&tg->tg_node, &td->tg_list); | ||
209 | td->nr_undestroyed_grps++; | ||
210 | } | ||
211 | |||
212 | static void | ||
213 | __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
214 | { | ||
215 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | ||
216 | unsigned int major, minor; | ||
217 | |||
218 | if (!tg || tg->blkg.dev) | ||
219 | return; | ||
220 | |||
221 | /* | ||
222 | * Fill in device details for a group which might not have been | ||
223 | * filled at group creation time as queue was being instantiated | ||
224 | * and driver had not attached a device yet | ||
225 | */ | ||
226 | if (bdi->dev && dev_name(bdi->dev)) { | ||
227 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
228 | tg->blkg.dev = MKDEV(major, minor); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Should be called with without queue lock held. Here queue lock will be | ||
234 | * taken rarely. It will be taken only once during life time of a group | ||
235 | * if need be | ||
236 | */ | ||
237 | static void | ||
238 | throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
239 | { | ||
240 | if (!tg || tg->blkg.dev) | ||
241 | return; | ||
242 | |||
243 | spin_lock_irq(td->queue->queue_lock); | ||
244 | __throtl_tg_fill_dev_details(td, tg); | ||
245 | spin_unlock_irq(td->queue->queue_lock); | ||
246 | } | ||
247 | |||
248 | static void throtl_init_add_tg_lists(struct throtl_data *td, | ||
249 | struct throtl_grp *tg, struct blkio_cgroup *blkcg) | ||
250 | { | ||
251 | __throtl_tg_fill_dev_details(td, tg); | ||
212 | 252 | ||
213 | /* Add group onto cgroup list */ | 253 | /* Add group onto cgroup list */ |
214 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
215 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | 254 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, |
216 | MKDEV(major, minor), BLKIO_POLICY_THROTL); | 255 | tg->blkg.dev, BLKIO_POLICY_THROTL); |
217 | 256 | ||
218 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | 257 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); |
219 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | 258 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); |
220 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | 259 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); |
221 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | 260 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); |
222 | 261 | ||
223 | hlist_add_head(&tg->tg_node, &td->tg_list); | 262 | throtl_add_group_to_td_list(td, tg); |
224 | td->nr_undestroyed_grps++; | 263 | } |
225 | done: | 264 | |
265 | /* Should be called without queue lock and outside of rcu period */ | ||
266 | static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) | ||
267 | { | ||
268 | struct throtl_grp *tg = NULL; | ||
269 | int ret; | ||
270 | |||
271 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | ||
272 | if (!tg) | ||
273 | return NULL; | ||
274 | |||
275 | ret = blkio_alloc_blkg_stats(&tg->blkg); | ||
276 | |||
277 | if (ret) { | ||
278 | kfree(tg); | ||
279 | return NULL; | ||
280 | } | ||
281 | |||
282 | throtl_init_group(tg); | ||
226 | return tg; | 283 | return tg; |
227 | } | 284 | } |
228 | 285 | ||
229 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | 286 | static struct |
287 | throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) | ||
230 | { | 288 | { |
231 | struct throtl_grp *tg = NULL; | 289 | struct throtl_grp *tg = NULL; |
290 | void *key = td; | ||
291 | |||
292 | /* | ||
293 | * This is the common case when there are no blkio cgroups. | ||
294 | * Avoid lookup in this case | ||
295 | */ | ||
296 | if (blkcg == &blkio_root_cgroup) | ||
297 | tg = td->root_tg; | ||
298 | else | ||
299 | tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
300 | |||
301 | __throtl_tg_fill_dev_details(td, tg); | ||
302 | return tg; | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * This function returns with queue lock unlocked in case of error, like | ||
307 | * request queue is no more | ||
308 | */ | ||
309 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | ||
310 | { | ||
311 | struct throtl_grp *tg = NULL, *__tg = NULL; | ||
232 | struct blkio_cgroup *blkcg; | 312 | struct blkio_cgroup *blkcg; |
313 | struct request_queue *q = td->queue; | ||
233 | 314 | ||
234 | rcu_read_lock(); | 315 | rcu_read_lock(); |
235 | blkcg = task_blkio_cgroup(current); | 316 | blkcg = task_blkio_cgroup(current); |
236 | tg = throtl_find_alloc_tg(td, blkcg); | 317 | tg = throtl_find_tg(td, blkcg); |
237 | if (!tg) | 318 | if (tg) { |
238 | tg = &td->root_tg; | 319 | rcu_read_unlock(); |
320 | return tg; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Need to allocate a group. Allocation of group also needs allocation | ||
325 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
326 | * we need to drop rcu lock and queue_lock before we call alloc | ||
327 | * | ||
328 | * Take the request queue reference to make sure queue does not | ||
329 | * go away once we return from allocation. | ||
330 | */ | ||
331 | blk_get_queue(q); | ||
332 | rcu_read_unlock(); | ||
333 | spin_unlock_irq(q->queue_lock); | ||
334 | |||
335 | tg = throtl_alloc_tg(td); | ||
336 | /* | ||
337 | * We might have slept in group allocation. Make sure queue is not | ||
338 | * dead | ||
339 | */ | ||
340 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | ||
341 | blk_put_queue(q); | ||
342 | if (tg) | ||
343 | kfree(tg); | ||
344 | |||
345 | return ERR_PTR(-ENODEV); | ||
346 | } | ||
347 | blk_put_queue(q); | ||
348 | |||
349 | /* Group allocated and queue is still alive. take the lock */ | ||
350 | spin_lock_irq(q->queue_lock); | ||
351 | |||
352 | /* | ||
353 | * Initialize the new group. After sleeping, read the blkcg again. | ||
354 | */ | ||
355 | rcu_read_lock(); | ||
356 | blkcg = task_blkio_cgroup(current); | ||
357 | |||
358 | /* | ||
359 | * If some other thread already allocated the group while we were | ||
360 | * not holding queue lock, free up the group | ||
361 | */ | ||
362 | __tg = throtl_find_tg(td, blkcg); | ||
363 | |||
364 | if (__tg) { | ||
365 | kfree(tg); | ||
366 | rcu_read_unlock(); | ||
367 | return __tg; | ||
368 | } | ||
369 | |||
370 | /* Group allocation failed. Account the IO to the root group. */ | ||
371 | if (!tg) { | ||
372 | tg = td->root_tg; | ||
373 | return tg; | ||
374 | } | ||
375 | |||
376 | throtl_init_add_tg_lists(td, tg, blkcg); | ||
239 | rcu_read_unlock(); | 377 | rcu_read_unlock(); |
240 | return tg; | 378 | return tg; |
241 | } | 379 | } |
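
The throtl_get_tg() path above illustrates a recurring pattern: look the group up under the lock, drop the locks that forbid sleeping (rcu and queue_lock), perform the blocking allocation, then re-take the lock and re-check whether another thread allocated the group in the meantime. The following is a minimal userspace sketch of that pattern only; the group struct, the find_group()/get_group() helpers and the single pthread mutex are illustrative stand-ins, not the kernel's types, and the sketch deliberately ignores the RCU and queue-liveness handling done above.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct group {
	char key[32];
	struct group *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct group *groups;		/* list head, protected by 'lock' */

/* Caller must hold 'lock'. */
static struct group *find_group(const char *key)
{
	struct group *g;

	for (g = groups; g; g = g->next)
		if (strcmp(g->key, key) == 0)
			return g;
	return NULL;
}

/* Look the group up; allocate it with no lock held if it does not exist yet. */
static struct group *get_group(const char *key)
{
	struct group *g, *new_g;

	pthread_mutex_lock(&lock);
	g = find_group(key);
	pthread_mutex_unlock(&lock);
	if (g)
		return g;

	/* The blocking allocation happens with no lock held. */
	new_g = calloc(1, sizeof(*new_g));
	if (!new_g)
		return NULL;
	strncpy(new_g->key, key, sizeof(new_g->key) - 1);

	pthread_mutex_lock(&lock);
	g = find_group(key);		/* someone may have raced with us */
	if (g) {
		free(new_g);		/* we lost the race: drop our copy */
	} else {
		new_g->next = groups;
		groups = new_g;
		g = new_g;
	}
	pthread_mutex_unlock(&lock);
	return g;
}

int main(void)
{
	printf("group %p\n", (void *)get_group("root"));
	return 0;
}

Compile with -lpthread; the important property is that the blocking calloc() runs with no lock held and the loser of the race frees its private copy, which is exactly what the kfree(tg) in the kernel path above does.
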
@@ -544,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, | |||
544 | return 0; | 682 | return 0; |
545 | } | 683 | } |
546 | 684 | ||
685 | static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) | ||
686 | { | ||
687 | /* -1 for both bps and iops means no throttling rule is set */ | ||
688 | return tg->bps[rw] == -1 && tg->iops[rw] == -1; | ||
689 | } | ||
690 | |||
547 | /* | 691 | /* |
548 | * Returns whether one can dispatch a bio or not. Also returns approx number | 692 | * Returns whether one can dispatch a bio or not. Also returns approx number |
549 | * of jiffies to wait before this bio is within the IO rate and can be dispatched | 693 |
@@ -608,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | |||
608 | tg->bytes_disp[rw] += bio->bi_size; | 752 | tg->bytes_disp[rw] += bio->bi_size; |
609 | tg->io_disp[rw]++; | 753 | tg->io_disp[rw]++; |
610 | 754 | ||
611 | /* | ||
612 | * TODO: This will take blkg->stats_lock. Figure out a way | ||
613 | * to avoid this cost. | ||
614 | */ | ||
615 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); | 755 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); |
616 | } | 756 | } |
617 | 757 | ||
@@ -989,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) | |||
989 | struct throtl_grp *tg; | 1129 | struct throtl_grp *tg; |
990 | struct bio *bio = *biop; | 1130 | struct bio *bio = *biop; |
991 | bool rw = bio_data_dir(bio), update_disptime = true; | 1131 | bool rw = bio_data_dir(bio), update_disptime = true; |
1132 | struct blkio_cgroup *blkcg; | ||
992 | 1133 | ||
993 | if (bio->bi_rw & REQ_THROTTLED) { | 1134 | if (bio->bi_rw & REQ_THROTTLED) { |
994 | bio->bi_rw &= ~REQ_THROTTLED; | 1135 | bio->bi_rw &= ~REQ_THROTTLED; |
995 | return 0; | 1136 | return 0; |
996 | } | 1137 | } |
997 | 1138 | ||
1139 | /* | ||
1140 | * A throtl_grp pointer retrieved under rcu can be used to access | ||
1141 | * basic fields like stats and io rates. If a group has no rules, | ||
1142 | * just update the dispatch stats in a lockless manner and return. | ||
1143 | */ | ||
1144 | |||
1145 | rcu_read_lock(); | ||
1146 | blkcg = task_blkio_cgroup(current); | ||
1147 | tg = throtl_find_tg(td, blkcg); | ||
1148 | if (tg) { | ||
1149 | throtl_tg_fill_dev_details(td, tg); | ||
1150 | |||
1151 | if (tg_no_rule_group(tg, rw)) { | ||
1152 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, | ||
1153 | rw, bio->bi_rw & REQ_SYNC); | ||
1154 | rcu_read_unlock(); | ||
1155 | return 0; | ||
1156 | } | ||
1157 | } | ||
1158 | rcu_read_unlock(); | ||
1159 | |||
1160 | /* | ||
1161 | * Either the group has not been allocated yet or it is not an unlimited | ||
1162 | * IO group. | ||
1163 | */ | ||
1164 | |||
998 | spin_lock_irq(q->queue_lock); | 1165 | spin_lock_irq(q->queue_lock); |
999 | tg = throtl_get_tg(td); | 1166 | tg = throtl_get_tg(td); |
1000 | 1167 | ||
1168 | if (IS_ERR(tg)) { | ||
1169 | if (PTR_ERR(tg) == -ENODEV) { | ||
1170 | /* | ||
1171 | * Queue is gone. No queue lock held here. | ||
1172 | */ | ||
1173 | return -ENODEV; | ||
1174 | } | ||
1175 | } | ||
1176 | |||
1001 | if (tg->nr_queued[rw]) { | 1177 | if (tg->nr_queued[rw]) { |
1002 | /* | 1178 | /* |
1003 | * There is already another bio queued in same dir. No | 1179 | * There is already another bio queued in same dir. No |
@@ -1060,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q) | |||
1060 | INIT_HLIST_HEAD(&td->tg_list); | 1236 | INIT_HLIST_HEAD(&td->tg_list); |
1061 | td->tg_service_tree = THROTL_RB_ROOT; | 1237 | td->tg_service_tree = THROTL_RB_ROOT; |
1062 | td->limits_changed = false; | 1238 | td->limits_changed = false; |
1239 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | ||
1063 | 1240 | ||
1064 | /* Init root group */ | 1241 | /* Allocate and init the root group. */ |
1065 | tg = &td->root_tg; | 1242 | td->queue = q; |
1066 | INIT_HLIST_NODE(&tg->tg_node); | 1243 | tg = throtl_alloc_tg(td); |
1067 | RB_CLEAR_NODE(&tg->rb_node); | ||
1068 | bio_list_init(&tg->bio_lists[0]); | ||
1069 | bio_list_init(&tg->bio_lists[1]); | ||
1070 | |||
1071 | /* Practically unlimited BW */ | ||
1072 | tg->bps[0] = tg->bps[1] = -1; | ||
1073 | tg->iops[0] = tg->iops[1] = -1; | ||
1074 | td->limits_changed = false; | ||
1075 | 1244 | ||
1076 | /* | 1245 | if (!tg) { |
1077 | * Set root group reference to 2. One reference will be dropped when | 1246 | kfree(td); |
1078 | * all groups on tg_list are being deleted during queue exit. Other | 1247 | return -ENOMEM; |
1079 | * reference will remain there as we don't want to delete this group | 1248 | } |
1080 | * as it is statically allocated and gets destroyed when throtl_data | ||
1081 | * goes away. | ||
1082 | */ | ||
1083 | atomic_set(&tg->ref, 2); | ||
1084 | hlist_add_head(&tg->tg_node, &td->tg_list); | ||
1085 | td->nr_undestroyed_grps++; | ||
1086 | 1249 | ||
1087 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | 1250 | td->root_tg = tg; |
1088 | 1251 | ||
1089 | rcu_read_lock(); | 1252 | rcu_read_lock(); |
1090 | blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, | 1253 | throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); |
1091 | 0, BLKIO_POLICY_THROTL); | ||
1092 | rcu_read_unlock(); | 1254 | rcu_read_unlock(); |
1093 | 1255 | ||
1094 | /* Attach throtl data to request queue */ | 1256 | /* Attach throtl data to request queue */ |
1095 | td->queue = q; | ||
1096 | q->td = td; | 1257 | q->td = td; |
1097 | return 0; | 1258 | return 0; |
1098 | } | 1259 | } |
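
The blk_throtl_bio() hunk above adds a lockless fast path: when a group has no bps/iops rule, only the dispatch statistics are updated and the queue lock is never taken. A rough userspace analogue of "account without a lock" is sketched below using C11 atomics; the kernel uses rcu plus per-cpu counters instead, and struct mock_group and its fields are invented names used purely for illustration.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative stand-in for a throttle group; the names are not the kernel's. */
struct mock_group {
	atomic_ullong bytes_dispatched;	/* bumped locklessly on the fast path */
	atomic_ullong ios_dispatched;
	long long bps_limit;		/* -1 == unlimited, i.e. no rule */
};

/* Fast path: no throttling rule applies, so only account the IO, lock-free. */
static void account_io_lockless(struct mock_group *g, unsigned long long bytes)
{
	atomic_fetch_add(&g->bytes_dispatched, bytes);
	atomic_fetch_add(&g->ios_dispatched, 1);
}

int main(void)
{
	struct mock_group g = { .bps_limit = -1 };

	account_io_lockless(&g, 4096);
	printf("bytes=%llu ios=%llu\n",
	       (unsigned long long)atomic_load(&g.bytes_dispatched),
	       (unsigned long long)atomic_load(&g.ios_dispatched));
	return 0;
}
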
diff --git a/block/blk.h b/block/blk.h index 61263463e38e..d6586287adc9 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -62,7 +62,28 @@ static inline struct request *__elv_next_request(struct request_queue *q) | |||
62 | return rq; | 62 | return rq; |
63 | } | 63 | } |
64 | 64 | ||
65 | if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) | 65 | /* |
66 | * If a flush request is running and flush requests aren't queueable | ||
67 | * in the drive, we can hold the queue until the flush request is | ||
68 | * finished. Even if we don't do this, the driver can't dispatch the | ||
69 | * next requests and will requeue them, and holding the queue can | ||
70 | * improve throughput too. For example, take requests flush1, write1 | ||
71 | * and flush2: flush1 is dispatched, then the queue is held, so | ||
72 | * write1 isn't inserted into the queue. After flush1 finishes, | ||
73 | * flush2 is dispatched. Since the disk cache is already clean, | ||
74 | * flush2 finishes very quickly, so it looks as if flush2 was | ||
75 | * folded into flush1. | ||
76 | * Since the queue is held, a flag is set to indicate that the | ||
77 | * queue should be restarted later. See flush_end_io() for | ||
78 | * details. | ||
79 | */ | ||
80 | if (q->flush_pending_idx != q->flush_running_idx && | ||
81 | !queue_flush_queueable(q)) { | ||
82 | q->flush_queue_delayed = 1; | ||
83 | return NULL; | ||
84 | } | ||
85 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || | ||
86 | !q->elevator->ops->elevator_dispatch_fn(q, 0)) | ||
66 | return NULL; | 87 | return NULL; |
67 | } | 88 | } |
68 | } | 89 | } |
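
The __elv_next_request() change in block/blk.h holds back dispatch while a flush is in flight on a device whose flushes are not queueable, marking the queue for a later restart. A toy model of just that decision is sketched below; struct mock_queue and hold_for_flush() are made-up names, and the two 1-bit indices only mimic how flush_pending_idx and flush_running_idx differ while a flush is in flight.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the queue state consulted above, not the kernel's struct. */
struct mock_queue {
	unsigned int flush_pending_idx:1;
	unsigned int flush_running_idx:1;
	unsigned int flush_not_queueable:1;
	unsigned int flush_queue_delayed:1;
};

static bool hold_for_flush(struct mock_queue *q)
{
	/* A flush is in flight when the two indices differ. */
	if (q->flush_pending_idx != q->flush_running_idx &&
	    q->flush_not_queueable) {
		q->flush_queue_delayed = 1;	/* restart later, see flush_end_io() */
		return true;
	}
	return false;
}

int main(void)
{
	struct mock_queue q = { .flush_pending_idx = 1, .flush_running_idx = 0,
				.flush_not_queueable = 1 };

	printf("hold queue: %s\n", hold_for_flush(&q) ? "yes" : "no");
	return 0;
}
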
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ab7a9e6a9b1c..7c52d6888924 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -300,7 +300,9 @@ struct cfq_data { | |||
300 | 300 | ||
301 | /* List of cfq groups being managed on this device*/ | 301 | /* List of cfq groups being managed on this device*/ |
302 | struct hlist_head cfqg_list; | 302 | struct hlist_head cfqg_list; |
303 | struct rcu_head rcu; | 303 | |
304 | /* Number of groups which are on blkcg->blkg_list */ | ||
305 | unsigned int nr_blkcg_linked_grps; | ||
304 | }; | 306 | }; |
305 | 307 | ||
306 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | 308 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); |
@@ -665,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, | |||
665 | if (rq2 == NULL) | 667 | if (rq2 == NULL) |
666 | return rq1; | 668 | return rq1; |
667 | 669 | ||
668 | if (rq_is_sync(rq1) && !rq_is_sync(rq2)) | 670 | if (rq_is_sync(rq1) != rq_is_sync(rq2)) |
669 | return rq1; | 671 | return rq_is_sync(rq1) ? rq1 : rq2; |
670 | else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) | 672 | |
671 | return rq2; | 673 | if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) |
672 | if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) | 674 | return rq1->cmd_flags & REQ_META ? rq1 : rq2; |
673 | return rq1; | ||
674 | else if ((rq2->cmd_flags & REQ_META) && | ||
675 | !(rq1->cmd_flags & REQ_META)) | ||
676 | return rq2; | ||
677 | 675 | ||
678 | s1 = blk_rq_pos(rq1); | 676 | s1 = blk_rq_pos(rq1); |
679 | s2 = blk_rq_pos(rq2); | 677 | s2 = blk_rq_pos(rq2); |
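
The cfq_choose_req() rewrite above collapses two if/else-if pairs into XOR tests: (flags1 ^ flags2) & FLAG is non-zero exactly when one request carries the flag and the other does not, which is the only case the old branches distinguished. A tiny standalone check of that identity follows; the bit value standing in for REQ_META is arbitrary, not the kernel's.

#include <stdio.h>

#define REQ_META (1u << 4)	/* arbitrary bit for the demo, not the kernel value */

int main(void)
{
	unsigned int flags1 = REQ_META, flags2 = 0;

	/*
	 * Non-zero exactly when one of the two requests has REQ_META set
	 * and the other does not.
	 */
	if ((flags1 ^ flags2) & REQ_META)
		printf("prefer the request carrying REQ_META\n");
	else
		printf("both or neither carry REQ_META: fall through\n");
	return 0;
}
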
@@ -1014,28 +1012,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, | |||
1014 | cfqg->needs_update = true; | 1012 | cfqg->needs_update = true; |
1015 | } | 1013 | } |
1016 | 1014 | ||
1017 | static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, | 1015 | static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, |
1018 | struct blkio_cgroup *blkcg, int create) | 1016 | struct cfq_group *cfqg, struct blkio_cgroup *blkcg) |
1019 | { | 1017 | { |
1020 | struct cfq_group *cfqg = NULL; | ||
1021 | void *key = cfqd; | ||
1022 | int i, j; | ||
1023 | struct cfq_rb_root *st; | ||
1024 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | 1018 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; |
1025 | unsigned int major, minor; | 1019 | unsigned int major, minor; |
1026 | 1020 | ||
1027 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); | 1021 | /* |
1028 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | 1022 | * Add group onto cgroup list. It might happen that bdi->dev is |
1023 | * not initialized yet. Initialize this new group without major and | ||
1024 | * minor info; that info will be filled in once a new thread comes | ||
1025 | * in for IO. | ||
1026 | */ | ||
1027 | if (bdi->dev) { | ||
1029 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1028 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1030 | cfqg->blkg.dev = MKDEV(major, minor); | 1029 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1031 | goto done; | 1030 | (void *)cfqd, MKDEV(major, minor)); |
1032 | } | 1031 | } else |
1033 | if (cfqg || !create) | 1032 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1034 | goto done; | 1033 | (void *)cfqd, 0); |
1034 | |||
1035 | cfqd->nr_blkcg_linked_grps++; | ||
1036 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1037 | |||
1038 | /* Add group on cfqd list */ | ||
1039 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
1040 | } | ||
1041 | |||
1042 | /* | ||
1043 | * Should be called from a sleepable context, with no request queue lock | ||
1044 | * held, as per-cpu stats are allocated dynamically and alloc_percpu() | ||
1045 | * needs to be called from a sleepable context. | ||
1046 | */ | ||
1047 | static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) | ||
1048 | { | ||
1049 | struct cfq_group *cfqg = NULL; | ||
1050 | int i, j, ret; | ||
1051 | struct cfq_rb_root *st; | ||
1035 | 1052 | ||
1036 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); | 1053 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); |
1037 | if (!cfqg) | 1054 | if (!cfqg) |
1038 | goto done; | 1055 | return NULL; |
1039 | 1056 | ||
1040 | for_each_cfqg_st(cfqg, i, j, st) | 1057 | for_each_cfqg_st(cfqg, i, j, st) |
1041 | *st = CFQ_RB_ROOT; | 1058 | *st = CFQ_RB_ROOT; |
@@ -1049,43 +1066,94 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, | |||
1049 | */ | 1066 | */ |
1050 | cfqg->ref = 1; | 1067 | cfqg->ref = 1; |
1051 | 1068 | ||
1069 | ret = blkio_alloc_blkg_stats(&cfqg->blkg); | ||
1070 | if (ret) { | ||
1071 | kfree(cfqg); | ||
1072 | return NULL; | ||
1073 | } | ||
1074 | |||
1075 | return cfqg; | ||
1076 | } | ||
1077 | |||
1078 | static struct cfq_group * | ||
1079 | cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) | ||
1080 | { | ||
1081 | struct cfq_group *cfqg = NULL; | ||
1082 | void *key = cfqd; | ||
1083 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1084 | unsigned int major, minor; | ||
1085 | |||
1052 | /* | 1086 | /* |
1053 | * Add group onto cgroup list. It might happen that bdi->dev is | 1087 | * This is the common case when there are no blkio cgroups. |
1054 | * not initialized yet. Initialize this new group without major | 1088 | * Avoid lookup in this case |
1055 | * and minor info and this info will be filled in once a new thread | ||
1056 | * comes for IO. See code above. | ||
1057 | */ | 1089 | */ |
1058 | if (bdi->dev) { | 1090 | if (blkcg == &blkio_root_cgroup) |
1059 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1091 | cfqg = &cfqd->root_group; |
1060 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | 1092 | else |
1061 | MKDEV(major, minor)); | 1093 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); |
1062 | } else | ||
1063 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | ||
1064 | 0); | ||
1065 | |||
1066 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1067 | 1094 | ||
1068 | /* Add group on cfqd list */ | 1095 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { |
1069 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | 1096 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1097 | cfqg->blkg.dev = MKDEV(major, minor); | ||
1098 | } | ||
1070 | 1099 | ||
1071 | done: | ||
1072 | return cfqg; | 1100 | return cfqg; |
1073 | } | 1101 | } |
1074 | 1102 | ||
1075 | /* | 1103 | /* |
1076 | * Search for the cfq group current task belongs to. If create = 1, then also | 1104 | * Search for the cfq group current task belongs to. request_queue lock must |
1077 | * create the cfq group if it does not exist. request_queue lock must be held. | 1105 | * be held. |
1078 | */ | 1106 | */ |
1079 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | 1107 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1080 | { | 1108 | { |
1081 | struct blkio_cgroup *blkcg; | 1109 | struct blkio_cgroup *blkcg; |
1082 | struct cfq_group *cfqg = NULL; | 1110 | struct cfq_group *cfqg = NULL, *__cfqg = NULL; |
1111 | struct request_queue *q = cfqd->queue; | ||
1083 | 1112 | ||
1084 | rcu_read_lock(); | 1113 | rcu_read_lock(); |
1085 | blkcg = task_blkio_cgroup(current); | 1114 | blkcg = task_blkio_cgroup(current); |
1086 | cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); | 1115 | cfqg = cfq_find_cfqg(cfqd, blkcg); |
1087 | if (!cfqg && create) | 1116 | if (cfqg) { |
1117 | rcu_read_unlock(); | ||
1118 | return cfqg; | ||
1119 | } | ||
1120 | |||
1121 | /* | ||
1122 | * Need to allocate a group. Allocating the group also requires allocating | ||
1123 | * per-cpu stats, which in turn takes a mutex and can block. Hence we | ||
1124 | * need to drop the rcu lock and queue_lock before calling the allocator. | ||
1125 | * | ||
1126 | * Not taking any queue reference here and assuming that queue is | ||
1127 | * around by the time we return. CFQ queue allocation code does | ||
1128 | * the same. It might be racy though. | ||
1129 | */ | ||
1130 | |||
1131 | rcu_read_unlock(); | ||
1132 | spin_unlock_irq(q->queue_lock); | ||
1133 | |||
1134 | cfqg = cfq_alloc_cfqg(cfqd); | ||
1135 | |||
1136 | spin_lock_irq(q->queue_lock); | ||
1137 | |||
1138 | rcu_read_lock(); | ||
1139 | blkcg = task_blkio_cgroup(current); | ||
1140 | |||
1141 | /* | ||
1142 | * If some other thread already allocated the group while we were | ||
1143 | * not holding the queue lock, free the one we just allocated. | ||
1144 | */ | ||
1145 | __cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1146 | |||
1147 | if (__cfqg) { | ||
1148 | kfree(cfqg); | ||
1149 | rcu_read_unlock(); | ||
1150 | return __cfqg; | ||
1151 | } | ||
1152 | |||
1153 | if (!cfqg) | ||
1088 | cfqg = &cfqd->root_group; | 1154 | cfqg = &cfqd->root_group; |
1155 | |||
1156 | cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); | ||
1089 | rcu_read_unlock(); | 1157 | rcu_read_unlock(); |
1090 | return cfqg; | 1158 | return cfqg; |
1091 | } | 1159 | } |
@@ -1118,6 +1186,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) | |||
1118 | return; | 1186 | return; |
1119 | for_each_cfqg_st(cfqg, i, j, st) | 1187 | for_each_cfqg_st(cfqg, i, j, st) |
1120 | BUG_ON(!RB_EMPTY_ROOT(&st->rb)); | 1188 | BUG_ON(!RB_EMPTY_ROOT(&st->rb)); |
1189 | free_percpu(cfqg->blkg.stats_cpu); | ||
1121 | kfree(cfqg); | 1190 | kfree(cfqg); |
1122 | } | 1191 | } |
1123 | 1192 | ||
@@ -1176,7 +1245,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) | |||
1176 | } | 1245 | } |
1177 | 1246 | ||
1178 | #else /* GROUP_IOSCHED */ | 1247 | #else /* GROUP_IOSCHED */ |
1179 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | 1248 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1180 | { | 1249 | { |
1181 | return &cfqd->root_group; | 1250 | return &cfqd->root_group; |
1182 | } | 1251 | } |
@@ -1210,7 +1279,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1210 | struct cfq_rb_root *service_tree; | 1279 | struct cfq_rb_root *service_tree; |
1211 | int left; | 1280 | int left; |
1212 | int new_cfqq = 1; | 1281 | int new_cfqq = 1; |
1213 | int group_changed = 0; | ||
1214 | 1282 | ||
1215 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), | 1283 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), |
1216 | cfqq_type(cfqq)); | 1284 | cfqq_type(cfqq)); |
@@ -1281,7 +1349,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1281 | rb_link_node(&cfqq->rb_node, parent, p); | 1349 | rb_link_node(&cfqq->rb_node, parent, p); |
1282 | rb_insert_color(&cfqq->rb_node, &service_tree->rb); | 1350 | rb_insert_color(&cfqq->rb_node, &service_tree->rb); |
1283 | service_tree->count++; | 1351 | service_tree->count++; |
1284 | if ((add_front || !new_cfqq) && !group_changed) | 1352 | if (add_front || !new_cfqq) |
1285 | return; | 1353 | return; |
1286 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); | 1354 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); |
1287 | } | 1355 | } |
@@ -2029,7 +2097,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2029 | 2097 | ||
2030 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); | 2098 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); |
2031 | 2099 | ||
2032 | return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); | 2100 | return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); |
2033 | } | 2101 | } |
2034 | 2102 | ||
2035 | /* | 2103 | /* |
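
The cfq_prio_to_maxrq() hunk above replaces the old expression with a shorter one. Assuming CFQ_PRIO_LISTS is defined as IOPRIO_BE_NR (as cfq-iosched.c does), the two forms are algebraically identical: 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - ioprio)) = 2 * base_rq * (CFQ_PRIO_LISTS - ioprio). The small standalone check below verifies this for every priority level; the base_rq value is arbitrary.

#include <assert.h>
#include <stdio.h>

/* Assumed values: CFQ_PRIO_LISTS is defined as IOPRIO_BE_NR (8) in cfq-iosched.c. */
#define IOPRIO_BE_NR	8
#define CFQ_PRIO_LISTS	IOPRIO_BE_NR

int main(void)
{
	const unsigned int base_rq = 4;	/* arbitrary stand-in value */
	unsigned int ioprio;

	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++) {
		unsigned int old_form =
			2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - ioprio));
		unsigned int new_form = 2 * base_rq * (IOPRIO_BE_NR - ioprio);

		assert(old_form == new_form);
		printf("ioprio=%u -> max_rq=%u\n", ioprio, new_form);
	}
	return 0;
}
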
@@ -2911,7 +2979,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, | |||
2911 | struct cfq_group *cfqg; | 2979 | struct cfq_group *cfqg; |
2912 | 2980 | ||
2913 | retry: | 2981 | retry: |
2914 | cfqg = cfq_get_cfqg(cfqd, 1); | 2982 | cfqg = cfq_get_cfqg(cfqd); |
2915 | cic = cfq_cic_lookup(cfqd, ioc); | 2983 | cic = cfq_cic_lookup(cfqd, ioc); |
2916 | /* cic always exists here */ | 2984 | /* cic always exists here */ |
2917 | cfqq = cic_to_cfqq(cic, is_sync); | 2985 | cfqq = cic_to_cfqq(cic, is_sync); |
@@ -3815,15 +3883,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) | |||
3815 | cfq_put_queue(cfqd->async_idle_cfqq); | 3883 | cfq_put_queue(cfqd->async_idle_cfqq); |
3816 | } | 3884 | } |
3817 | 3885 | ||
3818 | static void cfq_cfqd_free(struct rcu_head *head) | ||
3819 | { | ||
3820 | kfree(container_of(head, struct cfq_data, rcu)); | ||
3821 | } | ||
3822 | |||
3823 | static void cfq_exit_queue(struct elevator_queue *e) | 3886 | static void cfq_exit_queue(struct elevator_queue *e) |
3824 | { | 3887 | { |
3825 | struct cfq_data *cfqd = e->elevator_data; | 3888 | struct cfq_data *cfqd = e->elevator_data; |
3826 | struct request_queue *q = cfqd->queue; | 3889 | struct request_queue *q = cfqd->queue; |
3890 | bool wait = false; | ||
3827 | 3891 | ||
3828 | cfq_shutdown_timer_wq(cfqd); | 3892 | cfq_shutdown_timer_wq(cfqd); |
3829 | 3893 | ||
@@ -3842,7 +3906,13 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3842 | 3906 | ||
3843 | cfq_put_async_queues(cfqd); | 3907 | cfq_put_async_queues(cfqd); |
3844 | cfq_release_cfq_groups(cfqd); | 3908 | cfq_release_cfq_groups(cfqd); |
3845 | cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); | 3909 | |
3910 | /* | ||
3911 | * If there are groups which we could not unlink from blkcg list, | ||
3912 | * wait for a rcu period for them to be freed. | ||
3913 | */ | ||
3914 | if (cfqd->nr_blkcg_linked_grps) | ||
3915 | wait = true; | ||
3846 | 3916 | ||
3847 | spin_unlock_irq(q->queue_lock); | 3917 | spin_unlock_irq(q->queue_lock); |
3848 | 3918 | ||
@@ -3852,8 +3922,25 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3852 | ida_remove(&cic_index_ida, cfqd->cic_index); | 3922 | ida_remove(&cic_index_ida, cfqd->cic_index); |
3853 | spin_unlock(&cic_index_lock); | 3923 | spin_unlock(&cic_index_lock); |
3854 | 3924 | ||
3855 | /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ | 3925 | /* |
3856 | call_rcu(&cfqd->rcu, cfq_cfqd_free); | 3926 | * Wait for cfqg->blkg->key accessors to exit their grace periods. |
3927 | * Do this wait only if there are other unlinked groups out | ||
3928 | * there. This can happen if the cgroup deletion path claimed the | ||
3929 | * responsibility of cleaning up a group before the queue cleanup code | ||
3930 | * got to the group. | ||
3931 | * | ||
3932 | * Do not call synchronize_rcu() unconditionally, as there are drivers | ||
3933 | * which create/delete request queues hundreds of times during scan/boot, | ||
3934 | * and synchronize_rcu() can take significant time and slow down boot. | ||
3935 | */ | ||
3936 | if (wait) | ||
3937 | synchronize_rcu(); | ||
3938 | |||
3939 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3940 | /* Free up per cpu stats for root group */ | ||
3941 | free_percpu(cfqd->root_group.blkg.stats_cpu); | ||
3942 | #endif | ||
3943 | kfree(cfqd); | ||
3857 | } | 3944 | } |
3858 | 3945 | ||
3859 | static int cfq_alloc_cic_index(void) | 3946 | static int cfq_alloc_cic_index(void) |
@@ -3886,8 +3973,12 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3886 | return NULL; | 3973 | return NULL; |
3887 | 3974 | ||
3888 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); | 3975 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); |
3889 | if (!cfqd) | 3976 | if (!cfqd) { |
3977 | spin_lock(&cic_index_lock); | ||
3978 | ida_remove(&cic_index_ida, i); | ||
3979 | spin_unlock(&cic_index_lock); | ||
3890 | return NULL; | 3980 | return NULL; |
3981 | } | ||
3891 | 3982 | ||
3892 | /* | 3983 | /* |
3893 | * Don't need take queue_lock in the routine, since we are | 3984 | * Don't need take queue_lock in the routine, since we are |
@@ -3909,14 +4000,29 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3909 | 4000 | ||
3910 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4001 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3911 | /* | 4002 | /* |
3912 | * Take a reference to root group which we never drop. This is just | 4003 | * Set root group reference to 2. One reference will be dropped when |
3913 | * to make sure that cfq_put_cfqg() does not try to kfree root group | 4004 | * all groups on cfqd->cfqg_list are being deleted during queue exit. |
4005 | * The other reference will remain, because we don't want to delete this | ||
4006 | * group: it is statically allocated and gets destroyed when the | ||
4007 | * cfq_data goes away. | ||
3914 | */ | 4008 | */ |
3915 | cfqg->ref = 1; | 4009 | cfqg->ref = 2; |
4010 | |||
4011 | if (blkio_alloc_blkg_stats(&cfqg->blkg)) { | ||
4012 | kfree(cfqg); | ||
4013 | kfree(cfqd); | ||
4014 | return NULL; | ||
4015 | } | ||
4016 | |||
3916 | rcu_read_lock(); | 4017 | rcu_read_lock(); |
4018 | |||
3917 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, | 4019 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, |
3918 | (void *)cfqd, 0); | 4020 | (void *)cfqd, 0); |
3919 | rcu_read_unlock(); | 4021 | rcu_read_unlock(); |
4022 | cfqd->nr_blkcg_linked_grps++; | ||
4023 | |||
4024 | /* Add group on cfqd->cfqg_list */ | ||
4025 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
3920 | #endif | 4026 | #endif |
3921 | /* | 4027 | /* |
3922 | * Not strictly needed (since RB_ROOT just clears the node and we | 4028 | * Not strictly needed (since RB_ROOT just clears the node and we |
diff --git a/block/elevator.c b/block/elevator.c index 45ca1e34f582..b0b38ce0dcb6 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name) | |||
155 | 155 | ||
156 | e = elevator_find(name); | 156 | e = elevator_find(name); |
157 | if (!e) { | 157 | if (!e) { |
158 | char elv[ELV_NAME_MAX + strlen("-iosched")]; | ||
159 | |||
160 | spin_unlock(&elv_list_lock); | 158 | spin_unlock(&elv_list_lock); |
161 | 159 | request_module("%s-iosched", name); | |
162 | snprintf(elv, sizeof(elv), "%s-iosched", name); | ||
163 | |||
164 | request_module("%s", elv); | ||
165 | spin_lock(&elv_list_lock); | 160 | spin_lock(&elv_list_lock); |
166 | e = elevator_find(name); | 161 | e = elevator_find(name); |
167 | } | 162 | } |
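
The elevator_get() hunk above drops the on-stack name buffer because request_module() already accepts a printf-style format string. The userspace sketch below shows the same calling convention with a hypothetical load_module() helper built on vsnprintf(); it is only meant to illustrate why the intermediate snprintf() was unnecessary.

#include <stdarg.h>
#include <stdio.h>

/*
 * Toy stand-in for request_module(): it accepts a printf-style format,
 * so callers don't need to pre-build the module name in a stack buffer.
 */
static int load_module(const char *fmt, ...)
{
	char name[64];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(name, sizeof(name), fmt, ap);
	va_end(ap);

	printf("would load module '%s'\n", name);
	return 0;
}

int main(void)
{
	/* Equivalent of request_module("%s-iosched", name) after the change. */
	return load_module("%s-iosched", "cfq");
}
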
@@ -421,8 +416,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) | |||
421 | struct list_head *entry; | 416 | struct list_head *entry; |
422 | int stop_flags; | 417 | int stop_flags; |
423 | 418 | ||
424 | BUG_ON(rq->cmd_flags & REQ_ON_PLUG); | ||
425 | |||
426 | if (q->last_merge == rq) | 419 | if (q->last_merge == rq) |
427 | q->last_merge = NULL; | 420 | q->last_merge = NULL; |
428 | 421 | ||
@@ -661,8 +654,6 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) | |||
661 | 654 | ||
662 | rq->q = q; | 655 | rq->q = q; |
663 | 656 | ||
664 | BUG_ON(rq->cmd_flags & REQ_ON_PLUG); | ||
665 | |||
666 | if (rq->cmd_flags & REQ_SOFTBARRIER) { | 657 | if (rq->cmd_flags & REQ_SOFTBARRIER) { |
667 | /* barriers are scheduling boundary, update end_sector */ | 658 | /* barriers are scheduling boundary, update end_sector */ |
668 | if (rq->cmd_type == REQ_TYPE_FS || | 659 | if (rq->cmd_type == REQ_TYPE_FS || |
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 30ea95f43e79..d51f9795c064 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c | |||
@@ -1089,21 +1089,21 @@ static int atapi_drain_needed(struct request *rq) | |||
1089 | static int ata_scsi_dev_config(struct scsi_device *sdev, | 1089 | static int ata_scsi_dev_config(struct scsi_device *sdev, |
1090 | struct ata_device *dev) | 1090 | struct ata_device *dev) |
1091 | { | 1091 | { |
1092 | struct request_queue *q = sdev->request_queue; | ||
1093 | |||
1092 | if (!ata_id_has_unload(dev->id)) | 1094 | if (!ata_id_has_unload(dev->id)) |
1093 | dev->flags |= ATA_DFLAG_NO_UNLOAD; | 1095 | dev->flags |= ATA_DFLAG_NO_UNLOAD; |
1094 | 1096 | ||
1095 | /* configure max sectors */ | 1097 | /* configure max sectors */ |
1096 | blk_queue_max_hw_sectors(sdev->request_queue, dev->max_sectors); | 1098 | blk_queue_max_hw_sectors(q, dev->max_sectors); |
1097 | 1099 | ||
1098 | if (dev->class == ATA_DEV_ATAPI) { | 1100 | if (dev->class == ATA_DEV_ATAPI) { |
1099 | struct request_queue *q = sdev->request_queue; | ||
1100 | void *buf; | 1101 | void *buf; |
1101 | 1102 | ||
1102 | sdev->sector_size = ATA_SECT_SIZE; | 1103 | sdev->sector_size = ATA_SECT_SIZE; |
1103 | 1104 | ||
1104 | /* set DMA padding */ | 1105 | /* set DMA padding */ |
1105 | blk_queue_update_dma_pad(sdev->request_queue, | 1106 | blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1); |
1106 | ATA_DMA_PAD_SZ - 1); | ||
1107 | 1107 | ||
1108 | /* configure draining */ | 1108 | /* configure draining */ |
1109 | buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); | 1109 | buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); |
@@ -1131,8 +1131,7 @@ static int ata_scsi_dev_config(struct scsi_device *sdev, | |||
1131 | "sector_size=%u > PAGE_SIZE, PIO may malfunction\n", | 1131 | "sector_size=%u > PAGE_SIZE, PIO may malfunction\n", |
1132 | sdev->sector_size); | 1132 | sdev->sector_size); |
1133 | 1133 | ||
1134 | blk_queue_update_dma_alignment(sdev->request_queue, | 1134 | blk_queue_update_dma_alignment(q, sdev->sector_size - 1); |
1135 | sdev->sector_size - 1); | ||
1136 | 1135 | ||
1137 | if (dev->flags & ATA_DFLAG_AN) | 1136 | if (dev->flags & ATA_DFLAG_AN) |
1138 | set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events); | 1137 | set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events); |
@@ -1145,6 +1144,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev, | |||
1145 | scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth); | 1144 | scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth); |
1146 | } | 1145 | } |
1147 | 1146 | ||
1147 | blk_queue_flush_queueable(q, false); | ||
1148 | |||
1148 | dev->sdev = sdev; | 1149 | dev->sdev = sdev; |
1149 | return 0; | 1150 | return 0; |
1150 | } | 1151 | } |
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 8690e31d9932..a0aabd904a51 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c | |||
@@ -320,6 +320,8 @@ static void pcd_init_units(void) | |||
320 | disk->first_minor = unit; | 320 | disk->first_minor = unit; |
321 | strcpy(disk->disk_name, cd->name); /* umm... */ | 321 | strcpy(disk->disk_name, cd->name); /* umm... */ |
322 | disk->fops = &pcd_bdops; | 322 | disk->fops = &pcd_bdops; |
323 | disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; | ||
324 | disk->events = DISK_EVENT_MEDIA_CHANGE; | ||
323 | } | 325 | } |
324 | } | 326 | } |
325 | 327 | ||
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index e427fbe45999..ae15a4ddaa9b 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c | |||
@@ -625,7 +625,9 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id) | |||
625 | blk_queue_max_hw_sectors(q, 4096 / 512); | 625 | blk_queue_max_hw_sectors(q, 4096 / 512); |
626 | gendisk->queue = q; | 626 | gendisk->queue = q; |
627 | gendisk->fops = &viocd_fops; | 627 | gendisk->fops = &viocd_fops; |
628 | gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE; | 628 | gendisk->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE | |
629 | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; | ||
630 | gendisk->events = DISK_EVENT_MEDIA_CHANGE; | ||
629 | set_capacity(gendisk, 0); | 631 | set_capacity(gendisk, 0); |
630 | gendisk->private_data = d; | 632 | gendisk->private_data = d; |
631 | d->viocd_disk = gendisk; | 633 | d->viocd_disk = gendisk; |
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index a5ec5a7cb381..6e5123b1d341 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c | |||
@@ -1781,7 +1781,8 @@ static int ide_cd_probe(ide_drive_t *drive) | |||
1781 | 1781 | ||
1782 | ide_cd_read_toc(drive, &sense); | 1782 | ide_cd_read_toc(drive, &sense); |
1783 | g->fops = &idecd_ops; | 1783 | g->fops = &idecd_ops; |
1784 | g->flags |= GENHD_FL_REMOVABLE; | 1784 | g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; |
1785 | g->events = DISK_EVENT_MEDIA_CHANGE; | ||
1785 | add_disk(g); | 1786 | add_disk(g); |
1786 | return 0; | 1787 | return 0; |
1787 | 1788 | ||
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 95019c747cc1..4778e2707168 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c | |||
@@ -636,7 +636,7 @@ static int sr_probe(struct device *dev) | |||
636 | disk->first_minor = minor; | 636 | disk->first_minor = minor; |
637 | sprintf(disk->disk_name, "sr%d", minor); | 637 | sprintf(disk->disk_name, "sr%d", minor); |
638 | disk->fops = &sr_bdops; | 638 | disk->fops = &sr_bdops; |
639 | disk->flags = GENHD_FL_CD; | 639 | disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; |
640 | disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; | 640 | disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; |
641 | 641 | ||
642 | blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT); | 642 | blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT); |
diff --git a/fs/block_dev.c b/fs/block_dev.c index bf9c7a720371..1f2b19978333 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -1238,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) | |||
1238 | res = __blkdev_get(bdev, mode, 0); | 1238 | res = __blkdev_get(bdev, mode, 0); |
1239 | 1239 | ||
1240 | if (whole) { | 1240 | if (whole) { |
1241 | struct gendisk *disk = whole->bd_disk; | ||
1242 | |||
1241 | /* finish claiming */ | 1243 | /* finish claiming */ |
1242 | mutex_lock(&bdev->bd_mutex); | 1244 | mutex_lock(&bdev->bd_mutex); |
1243 | spin_lock(&bdev_lock); | 1245 | spin_lock(&bdev_lock); |
@@ -1264,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) | |||
1264 | spin_unlock(&bdev_lock); | 1266 | spin_unlock(&bdev_lock); |
1265 | 1267 | ||
1266 | /* | 1268 | /* |
1267 | * Block event polling for write claims. Any write | 1269 | * Block event polling for write claims if requested. Any |
1268 | * holder makes the write_holder state stick until all | 1270 | * write holder makes the write_holder state stick until |
1269 | * are released. This is good enough and tracking | 1271 | * all are released. This is good enough and tracking |
1270 | * individual writeable reference is too fragile given | 1272 | * individual writeable reference is too fragile given the |
1271 | * the way @mode is used in blkdev_get/put(). | 1273 | * way @mode is used in blkdev_get/put(). |
1272 | */ | 1274 | */ |
1273 | if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { | 1275 | if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) && |
1276 | !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { | ||
1274 | bdev->bd_write_holder = true; | 1277 | bdev->bd_write_holder = true; |
1275 | disk_block_events(bdev->bd_disk); | 1278 | disk_block_events(disk); |
1276 | } | 1279 | } |
1277 | 1280 | ||
1278 | mutex_unlock(&bdev->bd_mutex); | 1281 | mutex_unlock(&bdev->bd_mutex); |
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index d545e97d99c3..8ed4d3433199 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c | |||
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev, | |||
255 | struct device_attribute *attr, char *buf) | 255 | struct device_attribute *attr, char *buf) |
256 | { | 256 | { |
257 | struct hd_struct *p = dev_to_part(dev); | 257 | struct hd_struct *p = dev_to_part(dev); |
258 | return sprintf(buf, "%u\n", p->discard_alignment); | 258 | struct gendisk *disk = dev_to_disk(dev); |
259 | |||
260 | return sprintf(buf, "%u\n", | ||
261 | queue_limit_discard_alignment(&disk->queue->limits, | ||
262 | p->start_sect)); | ||
259 | } | 263 | } |
260 | 264 | ||
261 | ssize_t part_stat_show(struct device *dev, | 265 | ssize_t part_stat_show(struct device *dev, |
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, | |||
449 | p->start_sect = start; | 453 | p->start_sect = start; |
450 | p->alignment_offset = | 454 | p->alignment_offset = |
451 | queue_limit_alignment_offset(&disk->queue->limits, start); | 455 | queue_limit_alignment_offset(&disk->queue->limits, start); |
452 | p->discard_alignment = | ||
453 | queue_limit_discard_alignment(&disk->queue->limits, start); | ||
454 | p->nr_sects = len; | 456 | p->nr_sects = len; |
455 | p->partno = partno; | 457 | p->partno = partno; |
456 | p->policy = get_disk_ro(disk); | 458 | p->policy = get_disk_ro(disk); |
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index be50d9e70a7d..2a7cea53ca0d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h | |||
@@ -151,7 +151,6 @@ enum rq_flag_bits { | |||
151 | __REQ_IO_STAT, /* account I/O stat */ | 151 | __REQ_IO_STAT, /* account I/O stat */ |
152 | __REQ_MIXED_MERGE, /* merge of different types, fail separately */ | 152 | __REQ_MIXED_MERGE, /* merge of different types, fail separately */ |
153 | __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ | 153 | __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ |
154 | __REQ_ON_PLUG, /* on plug list */ | ||
155 | __REQ_NR_BITS, /* stops here */ | 154 | __REQ_NR_BITS, /* stops here */ |
156 | }; | 155 | }; |
157 | 156 | ||
@@ -192,6 +191,5 @@ enum rq_flag_bits { | |||
192 | #define REQ_IO_STAT (1 << __REQ_IO_STAT) | 191 | #define REQ_IO_STAT (1 << __REQ_IO_STAT) |
193 | #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) | 192 | #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) |
194 | #define REQ_SECURE (1 << __REQ_SECURE) | 193 | #define REQ_SECURE (1 << __REQ_SECURE) |
195 | #define REQ_ON_PLUG (1 << __REQ_ON_PLUG) | ||
196 | 194 | ||
197 | #endif /* __LINUX_BLK_TYPES_H */ | 195 | #endif /* __LINUX_BLK_TYPES_H */ |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2ad95fa1d130..ae9091a68480 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -257,7 +257,7 @@ struct queue_limits { | |||
257 | unsigned char misaligned; | 257 | unsigned char misaligned; |
258 | unsigned char discard_misaligned; | 258 | unsigned char discard_misaligned; |
259 | unsigned char cluster; | 259 | unsigned char cluster; |
260 | signed char discard_zeroes_data; | 260 | unsigned char discard_zeroes_data; |
261 | }; | 261 | }; |
262 | 262 | ||
263 | struct request_queue | 263 | struct request_queue |
@@ -364,6 +364,8 @@ struct request_queue | |||
364 | * for flush operations | 364 | * for flush operations |
365 | */ | 365 | */ |
366 | unsigned int flush_flags; | 366 | unsigned int flush_flags; |
367 | unsigned int flush_not_queueable:1; | ||
368 | unsigned int flush_queue_delayed:1; | ||
367 | unsigned int flush_pending_idx:1; | 369 | unsigned int flush_pending_idx:1; |
368 | unsigned int flush_running_idx:1; | 370 | unsigned int flush_running_idx:1; |
369 | unsigned long flush_pending_since; | 371 | unsigned long flush_pending_since; |
@@ -843,6 +845,7 @@ extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); | |||
843 | extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); | 845 | extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); |
844 | extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); | 846 | extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); |
845 | extern void blk_queue_flush(struct request_queue *q, unsigned int flush); | 847 | extern void blk_queue_flush(struct request_queue *q, unsigned int flush); |
848 | extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); | ||
846 | extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); | 849 | extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); |
847 | 850 | ||
848 | extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); | 851 | extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); |
@@ -1066,13 +1069,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector | |||
1066 | { | 1069 | { |
1067 | unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); | 1070 | unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); |
1068 | 1071 | ||
1072 | if (!lim->max_discard_sectors) | ||
1073 | return 0; | ||
1074 | |||
1069 | return (lim->discard_granularity + lim->discard_alignment - alignment) | 1075 | return (lim->discard_granularity + lim->discard_alignment - alignment) |
1070 | & (lim->discard_granularity - 1); | 1076 | & (lim->discard_granularity - 1); |
1071 | } | 1077 | } |
1072 | 1078 | ||
1073 | static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) | 1079 | static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) |
1074 | { | 1080 | { |
1075 | if (q->limits.discard_zeroes_data == 1) | 1081 | if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) |
1076 | return 1; | 1082 | return 1; |
1077 | 1083 | ||
1078 | return 0; | 1084 | return 0; |
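
The queue_limit_discard_alignment() change above now returns 0 when max_discard_sectors is zero, and the partition code computes the alignment dynamically from start_sect instead of caching it in hd_struct. The standalone program below reproduces the same arithmetic with assumed limits (1 MiB granularity, zero device offset); the helper name and the sample partition starts are illustrative only.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Illustrative values only: 1 MiB discard granularity, no device offset. */
static const unsigned int discard_granularity = 1024 * 1024;	/* bytes, power of two */
static const unsigned int discard_alignment = 0;		/* bytes */
static const unsigned int max_discard_sectors = 8192;

/* Mirrors the inline helper above for power-of-two granularities. */
static unsigned int part_discard_alignment(sector_t start_sector)
{
	unsigned int offset = (start_sector << 9) & (discard_granularity - 1);

	if (!max_discard_sectors)
		return 0;	/* device does not support discard at all */

	return (discard_granularity + discard_alignment - offset)
		& (discard_granularity - 1);
}

int main(void)
{
	/* A partition starting at sector 2048 (1 MiB) is perfectly aligned ... */
	printf("start 2048 -> %u\n", part_discard_alignment(2048));
	/* ... while the classic DOS start at sector 63 is not. */
	printf("start   63 -> %u\n", part_discard_alignment(63));
	return 0;
}

With these values a partition starting at sector 2048 needs no adjustment, while one starting at the legacy sector 63 would have to skip 1016320 bytes to reach the next discard boundary.
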
@@ -1111,6 +1117,11 @@ static inline unsigned int block_size(struct block_device *bdev) | |||
1111 | return bdev->bd_block_size; | 1117 | return bdev->bd_block_size; |
1112 | } | 1118 | } |
1113 | 1119 | ||
1120 | static inline bool queue_flush_queueable(struct request_queue *q) | ||
1121 | { | ||
1122 | return !q->flush_not_queueable; | ||
1123 | } | ||
1124 | |||
1114 | typedef struct {struct page *v;} Sector; | 1125 | typedef struct {struct page *v;} Sector; |
1115 | 1126 | ||
1116 | unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); | 1127 | unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); |
diff --git a/include/linux/genhd.h b/include/linux/genhd.h index d764a426e9fd..b78956b3c2e7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h | |||
@@ -100,7 +100,6 @@ struct hd_struct { | |||
100 | sector_t start_sect; | 100 | sector_t start_sect; |
101 | sector_t nr_sects; | 101 | sector_t nr_sects; |
102 | sector_t alignment_offset; | 102 | sector_t alignment_offset; |
103 | unsigned int discard_alignment; | ||
104 | struct device __dev; | 103 | struct device __dev; |
105 | struct kobject *holder_dir; | 104 | struct kobject *holder_dir; |
106 | int policy, partno; | 105 | int policy, partno; |
@@ -127,6 +126,7 @@ struct hd_struct { | |||
127 | #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 | 126 | #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 |
128 | #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ | 127 | #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ |
129 | #define GENHD_FL_NATIVE_CAPACITY 128 | 128 | #define GENHD_FL_NATIVE_CAPACITY 128 |
129 | #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 | ||
130 | 130 | ||
131 | enum { | 131 | enum { |
132 | DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ | 132 | DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index befc87531e4f..f032e6e1e09a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -63,10 +63,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
63 | unsigned long background_thresh; | 63 | unsigned long background_thresh; |
64 | unsigned long dirty_thresh; | 64 | unsigned long dirty_thresh; |
65 | unsigned long bdi_thresh; | 65 | unsigned long bdi_thresh; |
66 | unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; | 66 | unsigned long nr_dirty, nr_io, nr_more_io; |
67 | struct inode *inode; | 67 | struct inode *inode; |
68 | 68 | ||
69 | nr_wb = nr_dirty = nr_io = nr_more_io = 0; | 69 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 70 | spin_lock(&inode_wb_list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 72 | nr_dirty++; |