authorAnton Vorontsov <avorontsov@ru.mvista.com>2009-09-22 19:49:27 -0400
committerAnton Vorontsov <avorontsov@ru.mvista.com>2009-09-22 19:49:27 -0400
commitf056878332a91ed984a116bad4e7d49aefff9e6e (patch)
tree572f4757c8e7811d45e0be0c2ae529c78fb63441 /block
parent3961f7c3cf247eee5df7fabadc7a40f2deeb98f3 (diff)
parent7fa07729e439a6184bd824746d06a49cca553f15 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: drivers/power/wm97xx_battery.c
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig         |  11
-rw-r--r--  block/Makefile        |   2
-rw-r--r--  block/as-iosched.c    |  10
-rw-r--r--  block/blk-barrier.c   |  31
-rw-r--r--  block/blk-core.c      | 198
-rw-r--r--  block/blk-integrity.c |   1
-rw-r--r--  block/blk-iopoll.c    | 227
-rw-r--r--  block/blk-merge.c     |  45
-rw-r--r--  block/blk-settings.c  | 105
-rw-r--r--  block/blk-sysfs.c     |  20
-rw-r--r--  block/blk.h           |   1
-rw-r--r--  block/bsg.c           |   6
-rw-r--r--  block/cfq-iosched.c   | 260
-rw-r--r--  block/cmd-filter.c    | 233
-rw-r--r--  block/elevator.c      |   3
-rw-r--r--  block/genhd.c         |  32
-rw-r--r--  block/ioctl.c         |  49
-rw-r--r--  block/scsi_ioctl.c    |  44
18 files changed, 735 insertions, 543 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 95a86adc33a..9be0b56eaee 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -48,9 +48,9 @@ config LBDAF
48 If unsure, say Y. 48 If unsure, say Y.
49 49
50config BLK_DEV_BSG 50config BLK_DEV_BSG
51 bool "Block layer SG support v4 (EXPERIMENTAL)" 51 bool "Block layer SG support v4"
52 depends on EXPERIMENTAL 52 default y
53 ---help--- 53 help
54 Saying Y here will enable generic SG (SCSI generic) v4 support 54 Saying Y here will enable generic SG (SCSI generic) v4 support
55 for any block device. 55 for any block device.
56 56
@@ -60,7 +60,10 @@ config BLK_DEV_BSG
60 protocols (e.g. Task Management Functions and SMP in Serial 60 protocols (e.g. Task Management Functions and SMP in Serial
61 Attached SCSI). 61 Attached SCSI).
62 62
63 If unsure, say N. 63 This option is required by recent UDEV versions to properly
64 access device serial numbers, etc.
65
66 If unsure, say Y.
64 67
65config BLK_DEV_INTEGRITY 68config BLK_DEV_INTEGRITY
66 bool "Block layer data integrity support" 69 bool "Block layer data integrity support"
diff --git a/block/Makefile b/block/Makefile
index e9fa4dd690f..ba74ca6bfa1 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 ioctl.o genhd.o scsi_ioctl.o cmd-filter.o 8 blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 7a12cf6ee1d..ce8ba57c655 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -146,7 +146,7 @@ enum arq_state {
146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) 146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2)
147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) 147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state)
148 148
149static DEFINE_PER_CPU(unsigned long, ioc_count); 149static DEFINE_PER_CPU(unsigned long, as_ioc_count);
150static struct completion *ioc_gone; 150static struct completion *ioc_gone;
151static DEFINE_SPINLOCK(ioc_gone_lock); 151static DEFINE_SPINLOCK(ioc_gone_lock);
152 152
@@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad);
161static void free_as_io_context(struct as_io_context *aic) 161static void free_as_io_context(struct as_io_context *aic)
162{ 162{
163 kfree(aic); 163 kfree(aic);
164 elv_ioc_count_dec(ioc_count); 164 elv_ioc_count_dec(as_ioc_count);
165 if (ioc_gone) { 165 if (ioc_gone) {
166 /* 166 /*
167 * AS scheduler is exiting, grab exit lock and check 167 * AS scheduler is exiting, grab exit lock and check
@@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic)
169 * complete ioc_gone and set it back to NULL. 169 * complete ioc_gone and set it back to NULL.
170 */ 170 */
171 spin_lock(&ioc_gone_lock); 171 spin_lock(&ioc_gone_lock);
172 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 172 if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
173 complete(ioc_gone); 173 complete(ioc_gone);
174 ioc_gone = NULL; 174 ioc_gone = NULL;
175 } 175 }
@@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void)
211 ret->seek_total = 0; 211 ret->seek_total = 0;
212 ret->seek_samples = 0; 212 ret->seek_samples = 0;
213 ret->seek_mean = 0; 213 ret->seek_mean = 0;
214 elv_ioc_count_inc(ioc_count); 214 elv_ioc_count_inc(as_ioc_count);
215 } 215 }
216 216
217 return ret; 217 return ret;
@@ -1507,7 +1507,7 @@ static void __exit as_exit(void)
1507 ioc_gone = &all_gone; 1507 ioc_gone = &all_gone;
1508 /* ioc_gone's update must be visible before reading ioc_count */ 1508 /* ioc_gone's update must be visible before reading ioc_count */
1509 smp_wmb(); 1509 smp_wmb();
1510 if (elv_ioc_count_read(ioc_count)) 1510 if (elv_ioc_count_read(as_ioc_count))
1511 wait_for_completion(&all_gone); 1511 wait_for_completion(&all_gone);
1512 synchronize_rcu(); 1512 synchronize_rcu();
1513} 1513}
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 30022b4e2f6..6593ab39cfe 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
348 clear_bit(BIO_UPTODATE, &bio->bi_flags); 348 clear_bit(BIO_UPTODATE, &bio->bi_flags);
349 } 349 }
350 350
351 if (bio->bi_private)
352 complete(bio->bi_private);
353
351 bio_put(bio); 354 bio_put(bio);
352} 355}
353 356
@@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
357 * @sector: start sector 360 * @sector: start sector
358 * @nr_sects: number of sectors to discard 361 * @nr_sects: number of sectors to discard
359 * @gfp_mask: memory allocation flags (for bio_alloc) 362 * @gfp_mask: memory allocation flags (for bio_alloc)
363 * @flags: DISCARD_FL_* flags to control behaviour
360 * 364 *
361 * Description: 365 * Description:
362 * Issue a discard request for the sectors in question. Does not wait. 366 * Issue a discard request for the sectors in question.
363 */ 367 */
364int blkdev_issue_discard(struct block_device *bdev, 368int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
365 sector_t sector, sector_t nr_sects, gfp_t gfp_mask) 369 sector_t nr_sects, gfp_t gfp_mask, int flags)
366{ 370{
367 struct request_queue *q; 371 DECLARE_COMPLETION_ONSTACK(wait);
368 struct bio *bio; 372 struct request_queue *q = bdev_get_queue(bdev);
373 int type = flags & DISCARD_FL_BARRIER ?
374 DISCARD_BARRIER : DISCARD_NOBARRIER;
369 int ret = 0; 375 int ret = 0;
370 376
371 if (bdev->bd_disk == NULL)
372 return -ENXIO;
373
374 q = bdev_get_queue(bdev);
375 if (!q) 377 if (!q)
376 return -ENXIO; 378 return -ENXIO;
377 379
@@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev,
379 return -EOPNOTSUPP; 381 return -EOPNOTSUPP;
380 382
381 while (nr_sects && !ret) { 383 while (nr_sects && !ret) {
382 bio = bio_alloc(gfp_mask, 0); 384 struct bio *bio = bio_alloc(gfp_mask, 0);
383 if (!bio) 385 if (!bio)
384 return -ENOMEM; 386 return -ENOMEM;
385 387
386 bio->bi_end_io = blkdev_discard_end_io; 388 bio->bi_end_io = blkdev_discard_end_io;
387 bio->bi_bdev = bdev; 389 bio->bi_bdev = bdev;
390 if (flags & DISCARD_FL_WAIT)
391 bio->bi_private = &wait;
388 392
389 bio->bi_sector = sector; 393 bio->bi_sector = sector;
390 394
@@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev,
396 bio->bi_size = nr_sects << 9; 400 bio->bi_size = nr_sects << 9;
397 nr_sects = 0; 401 nr_sects = 0;
398 } 402 }
403
399 bio_get(bio); 404 bio_get(bio);
400 submit_bio(DISCARD_BARRIER, bio); 405 submit_bio(type, bio);
406
407 if (flags & DISCARD_FL_WAIT)
408 wait_for_completion(&wait);
401 409
402 /* Check if it failed immediately */
403 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 410 if (bio_flagged(bio, BIO_EOPNOTSUPP))
404 ret = -EOPNOTSUPP; 411 ret = -EOPNOTSUPP;
405 else if (!bio_flagged(bio, BIO_UPTODATE)) 412 else if (!bio_flagged(bio, BIO_UPTODATE))
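The reworked blkdev_issue_discard() above trades the old hard-coded barrier behaviour for an explicit flags argument (DISCARD_FL_BARRIER, DISCARD_FL_WAIT). Below is a minimal sketch of how an in-kernel caller might use the new signature on a kernel carrying this patch (the helper name example_trim_range is hypothetical):

#include <linux/blkdev.h>

/* Hypothetical caller: discard a range and wait for the discard bios to complete. */
static int example_trim_range(struct block_device *bdev, sector_t start,
			      sector_t nr_sects)
{
	return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
				    DISCARD_FL_WAIT | DISCARD_FL_BARRIER);
}

Whether DISCARD_FL_BARRIER is wanted depends on the caller's ordering requirements; DISCARD_FL_WAIT simply makes the call synchronous via the on-stack completion added above.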
diff --git a/block/blk-core.c b/block/blk-core.c
index b06cf5c2a82..8135228e4b2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
69 part_stat_inc(cpu, part, merges[rw]); 69 part_stat_inc(cpu, part, merges[rw]);
70 else { 70 else {
71 part_round_stats(cpu, part); 71 part_round_stats(cpu, part);
72 part_inc_in_flight(part); 72 part_inc_in_flight(part, rw);
73 } 73 }
74 74
75 part_stat_unlock(); 75 part_stat_unlock();
@@ -501,6 +501,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
501 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 501 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
502 q->backing_dev_info.state = 0; 502 q->backing_dev_info.state = 0;
503 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 503 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
504 q->backing_dev_info.name = "block";
504 505
505 err = bdi_init(&q->backing_dev_info); 506 err = bdi_init(&q->backing_dev_info);
506 if (err) { 507 if (err) {
@@ -575,13 +576,6 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
575 return NULL; 576 return NULL;
576 } 577 }
577 578
578 /*
579 * if caller didn't supply a lock, they get per-queue locking with
580 * our embedded lock
581 */
582 if (!lock)
583 lock = &q->__queue_lock;
584
585 q->request_fn = rfn; 579 q->request_fn = rfn;
586 q->prep_rq_fn = NULL; 580 q->prep_rq_fn = NULL;
587 q->unplug_fn = generic_unplug_device; 581 q->unplug_fn = generic_unplug_device;
@@ -595,8 +589,6 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
595 589
596 q->sg_reserved_size = INT_MAX; 590 q->sg_reserved_size = INT_MAX;
597 591
598 blk_set_cmd_filter_defaults(&q->cmd_filter);
599
600 /* 592 /*
601 * all done 593 * all done
602 */ 594 */
@@ -1039,7 +1031,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
1039 1031
1040 if (part->in_flight) { 1032 if (part->in_flight) {
1041 __part_stat_add(cpu, part, time_in_queue, 1033 __part_stat_add(cpu, part, time_in_queue,
1042 part->in_flight * (now - part->stamp)); 1034 part_in_flight(part) * (now - part->stamp));
1043 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1035 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1044 } 1036 }
1045 part->stamp = now; 1037 part->stamp = now;
@@ -1120,31 +1112,27 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1120 req->cmd_type = REQ_TYPE_FS; 1112 req->cmd_type = REQ_TYPE_FS;
1121 1113
1122 /* 1114 /*
1123 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 1115 * Inherit FAILFAST from bio (for read-ahead, and explicit
1116 * FAILFAST). FAILFAST flags are identical for req and bio.
1124 */ 1117 */
1125 if (bio_rw_ahead(bio)) 1118 if (bio_rw_flagged(bio, BIO_RW_AHEAD))
1126 req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | 1119 req->cmd_flags |= REQ_FAILFAST_MASK;
1127 REQ_FAILFAST_DRIVER); 1120 else
1128 if (bio_failfast_dev(bio)) 1121 req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
1129 req->cmd_flags |= REQ_FAILFAST_DEV; 1122
1130 if (bio_failfast_transport(bio)) 1123 if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
1131 req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
1132 if (bio_failfast_driver(bio))
1133 req->cmd_flags |= REQ_FAILFAST_DRIVER;
1134
1135 if (unlikely(bio_discard(bio))) {
1136 req->cmd_flags |= REQ_DISCARD; 1124 req->cmd_flags |= REQ_DISCARD;
1137 if (bio_barrier(bio)) 1125 if (bio_rw_flagged(bio, BIO_RW_BARRIER))
1138 req->cmd_flags |= REQ_SOFTBARRIER; 1126 req->cmd_flags |= REQ_SOFTBARRIER;
1139 req->q->prepare_discard_fn(req->q, req); 1127 req->q->prepare_discard_fn(req->q, req);
1140 } else if (unlikely(bio_barrier(bio))) 1128 } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
1141 req->cmd_flags |= REQ_HARDBARRIER; 1129 req->cmd_flags |= REQ_HARDBARRIER;
1142 1130
1143 if (bio_sync(bio)) 1131 if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
1144 req->cmd_flags |= REQ_RW_SYNC; 1132 req->cmd_flags |= REQ_RW_SYNC;
1145 if (bio_rw_meta(bio)) 1133 if (bio_rw_flagged(bio, BIO_RW_META))
1146 req->cmd_flags |= REQ_RW_META; 1134 req->cmd_flags |= REQ_RW_META;
1147 if (bio_noidle(bio)) 1135 if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
1148 req->cmd_flags |= REQ_NOIDLE; 1136 req->cmd_flags |= REQ_NOIDLE;
1149 1137
1150 req->errors = 0; 1138 req->errors = 0;
@@ -1159,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1159 */ 1147 */
1160static inline bool queue_should_plug(struct request_queue *q) 1148static inline bool queue_should_plug(struct request_queue *q)
1161{ 1149{
1162 return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); 1150 return !(blk_queue_nonrot(q) && blk_queue_queuing(q));
1163} 1151}
1164 1152
1165static int __make_request(struct request_queue *q, struct bio *bio) 1153static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1168,10 +1156,16 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1168 int el_ret; 1156 int el_ret;
1169 unsigned int bytes = bio->bi_size; 1157 unsigned int bytes = bio->bi_size;
1170 const unsigned short prio = bio_prio(bio); 1158 const unsigned short prio = bio_prio(bio);
1171 const int sync = bio_sync(bio); 1159 const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
1172 const int unplug = bio_unplug(bio); 1160 const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
1161 const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1173 int rw_flags; 1162 int rw_flags;
1174 1163
1164 if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) &&
1165 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1166 bio_endio(bio, -EOPNOTSUPP);
1167 return 0;
1168 }
1175 /* 1169 /*
1176 * low level driver can indicate that it wants pages above a 1170 * low level driver can indicate that it wants pages above a
1177 * certain limit bounced to low memory (ie for highmem, or even 1171 * certain limit bounced to low memory (ie for highmem, or even
@@ -1181,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1181 1175
1182 spin_lock_irq(q->queue_lock); 1176 spin_lock_irq(q->queue_lock);
1183 1177
1184 if (unlikely(bio_barrier(bio)) || elv_queue_empty(q)) 1178 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
1185 goto get_rq; 1179 goto get_rq;
1186 1180
1187 el_ret = elv_merge(q, &req, bio); 1181 el_ret = elv_merge(q, &req, bio);
@@ -1194,6 +1188,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1194 1188
1195 trace_block_bio_backmerge(q, bio); 1189 trace_block_bio_backmerge(q, bio);
1196 1190
1191 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1192 blk_rq_set_mixed_merge(req);
1193
1197 req->biotail->bi_next = bio; 1194 req->biotail->bi_next = bio;
1198 req->biotail = bio; 1195 req->biotail = bio;
1199 req->__data_len += bytes; 1196 req->__data_len += bytes;
@@ -1213,6 +1210,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1213 1210
1214 trace_block_bio_frontmerge(q, bio); 1211 trace_block_bio_frontmerge(q, bio);
1215 1212
1213 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
1214 blk_rq_set_mixed_merge(req);
1215 req->cmd_flags &= ~REQ_FAILFAST_MASK;
1216 req->cmd_flags |= ff;
1217 }
1218
1216 bio->bi_next = req->bio; 1219 bio->bi_next = req->bio;
1217 req->bio = bio; 1220 req->bio = bio;
1218 1221
@@ -1460,24 +1463,20 @@ static inline void __generic_make_request(struct bio *bio)
1460 if (old_sector != -1) 1463 if (old_sector != -1)
1461 trace_block_remap(q, bio, old_dev, old_sector); 1464 trace_block_remap(q, bio, old_dev, old_sector);
1462 1465
1463 trace_block_bio_queue(q, bio);
1464
1465 old_sector = bio->bi_sector; 1466 old_sector = bio->bi_sector;
1466 old_dev = bio->bi_bdev->bd_dev; 1467 old_dev = bio->bi_bdev->bd_dev;
1467 1468
1468 if (bio_check_eod(bio, nr_sectors)) 1469 if (bio_check_eod(bio, nr_sectors))
1469 goto end_io; 1470 goto end_io;
1470 1471
1471 if (bio_discard(bio) && !q->prepare_discard_fn) { 1472 if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
1472 err = -EOPNOTSUPP; 1473 !q->prepare_discard_fn) {
1473 goto end_io;
1474 }
1475 if (bio_barrier(bio) && bio_has_data(bio) &&
1476 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1477 err = -EOPNOTSUPP; 1474 err = -EOPNOTSUPP;
1478 goto end_io; 1475 goto end_io;
1479 } 1476 }
1480 1477
1478 trace_block_bio_queue(q, bio);
1479
1481 ret = q->make_request_fn(q, bio); 1480 ret = q->make_request_fn(q, bio);
1482 } while (ret); 1481 } while (ret);
1483 1482
@@ -1662,6 +1661,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1662} 1661}
1663EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 1662EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
1664 1663
1664/**
1665 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
1666 * @rq: request to examine
1667 *
1668 * Description:
 1669 * A request could be a merge of IOs which require different failure
 1670 * handling. This function determines the number of bytes which
 1671 * can be failed from the beginning of the request without
 1672 * crossing into an area which needs to be retried further.
1673 *
1674 * Return:
1675 * The number of bytes to fail.
1676 *
1677 * Context:
1678 * queue_lock must be held.
1679 */
1680unsigned int blk_rq_err_bytes(const struct request *rq)
1681{
1682 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
1683 unsigned int bytes = 0;
1684 struct bio *bio;
1685
1686 if (!(rq->cmd_flags & REQ_MIXED_MERGE))
1687 return blk_rq_bytes(rq);
1688
1689 /*
1690 * Currently the only 'mixing' which can happen is between
 1691 * different failfast types. We can safely fail portions
1692 * which have all the failfast bits that the first one has -
1693 * the ones which are at least as eager to fail as the first
1694 * one.
1695 */
1696 for (bio = rq->bio; bio; bio = bio->bi_next) {
1697 if ((bio->bi_rw & ff) != ff)
1698 break;
1699 bytes += bio->bi_size;
1700 }
1701
1702 /* this could lead to infinite loop */
1703 BUG_ON(blk_rq_bytes(rq) && !bytes);
1704 return bytes;
1705}
1706EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
1707
1665static void blk_account_io_completion(struct request *req, unsigned int bytes) 1708static void blk_account_io_completion(struct request *req, unsigned int bytes)
1666{ 1709{
1667 if (blk_do_io_stat(req)) { 1710 if (blk_do_io_stat(req)) {
@@ -1695,7 +1738,7 @@ static void blk_account_io_done(struct request *req)
1695 part_stat_inc(cpu, part, ios[rw]); 1738 part_stat_inc(cpu, part, ios[rw]);
1696 part_stat_add(cpu, part, ticks[rw], duration); 1739 part_stat_add(cpu, part, ticks[rw], duration);
1697 part_round_stats(cpu, part); 1740 part_round_stats(cpu, part);
1698 part_dec_in_flight(part); 1741 part_dec_in_flight(part, rw);
1699 1742
1700 part_stat_unlock(); 1743 part_stat_unlock();
1701 } 1744 }
@@ -1815,8 +1858,15 @@ void blk_dequeue_request(struct request *rq)
1815 * and to it is freed is accounted as io that is in progress at 1858 * and to it is freed is accounted as io that is in progress at
1816 * the driver side. 1859 * the driver side.
1817 */ 1860 */
1818 if (blk_account_rq(rq)) 1861 if (blk_account_rq(rq)) {
1819 q->in_flight[rq_is_sync(rq)]++; 1862 q->in_flight[rq_is_sync(rq)]++;
1863 /*
1864 * Mark this device as supporting hardware queuing, if
1865 * we have more IOs in flight than 4.
1866 */
1867 if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
1868 set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
1869 }
1820} 1870}
1821 1871
1822/** 1872/**
@@ -2008,6 +2058,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2008 if (blk_fs_request(req) || blk_discard_rq(req)) 2058 if (blk_fs_request(req) || blk_discard_rq(req))
2009 req->__sector += total_bytes >> 9; 2059 req->__sector += total_bytes >> 9;
2010 2060
2061 /* mixed attributes always follow the first bio */
2062 if (req->cmd_flags & REQ_MIXED_MERGE) {
2063 req->cmd_flags &= ~REQ_FAILFAST_MASK;
2064 req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
2065 }
2066
2011 /* 2067 /*
2012 * If total number of sectors is less than the first segment 2068 * If total number of sectors is less than the first segment
2013 * size, something has gone terribly wrong. 2069 * size, something has gone terribly wrong.
@@ -2145,7 +2201,7 @@ bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2145{ 2201{
2146 return blk_end_bidi_request(rq, error, nr_bytes, 0); 2202 return blk_end_bidi_request(rq, error, nr_bytes, 0);
2147} 2203}
2148EXPORT_SYMBOL_GPL(blk_end_request); 2204EXPORT_SYMBOL(blk_end_request);
2149 2205
2150/** 2206/**
 2151 * blk_end_request_all - Helper function for drivers to finish the request. 2207 * blk_end_request_all - Helper function for drivers to finish the request.
@@ -2166,7 +2222,7 @@ void blk_end_request_all(struct request *rq, int error)
2166 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); 2222 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2167 BUG_ON(pending); 2223 BUG_ON(pending);
2168} 2224}
2169EXPORT_SYMBOL_GPL(blk_end_request_all); 2225EXPORT_SYMBOL(blk_end_request_all);
2170 2226
2171/** 2227/**
2172 * blk_end_request_cur - Helper function to finish the current request chunk. 2228 * blk_end_request_cur - Helper function to finish the current request chunk.
@@ -2184,7 +2240,26 @@ bool blk_end_request_cur(struct request *rq, int error)
2184{ 2240{
2185 return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); 2241 return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2186} 2242}
2187EXPORT_SYMBOL_GPL(blk_end_request_cur); 2243EXPORT_SYMBOL(blk_end_request_cur);
2244
2245/**
2246 * blk_end_request_err - Finish a request till the next failure boundary.
2247 * @rq: the request to finish till the next failure boundary for
2248 * @error: must be negative errno
2249 *
2250 * Description:
2251 * Complete @rq till the next failure boundary.
2252 *
2253 * Return:
2254 * %false - we are done with this request
2255 * %true - still buffers pending for this request
2256 */
2257bool blk_end_request_err(struct request *rq, int error)
2258{
2259 WARN_ON(error >= 0);
2260 return blk_end_request(rq, error, blk_rq_err_bytes(rq));
2261}
2262EXPORT_SYMBOL_GPL(blk_end_request_err);
2188 2263
2189/** 2264/**
2190 * __blk_end_request - Helper function for drivers to complete the request. 2265 * __blk_end_request - Helper function for drivers to complete the request.
@@ -2203,7 +2278,7 @@ bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2203{ 2278{
2204 return __blk_end_bidi_request(rq, error, nr_bytes, 0); 2279 return __blk_end_bidi_request(rq, error, nr_bytes, 0);
2205} 2280}
2206EXPORT_SYMBOL_GPL(__blk_end_request); 2281EXPORT_SYMBOL(__blk_end_request);
2207 2282
2208/** 2283/**
 2209 * __blk_end_request_all - Helper function for drivers to finish the request. 2284 * __blk_end_request_all - Helper function for drivers to finish the request.
@@ -2224,7 +2299,7 @@ void __blk_end_request_all(struct request *rq, int error)
2224 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); 2299 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2225 BUG_ON(pending); 2300 BUG_ON(pending);
2226} 2301}
2227EXPORT_SYMBOL_GPL(__blk_end_request_all); 2302EXPORT_SYMBOL(__blk_end_request_all);
2228 2303
2229/** 2304/**
2230 * __blk_end_request_cur - Helper function to finish the current request chunk. 2305 * __blk_end_request_cur - Helper function to finish the current request chunk.
@@ -2243,14 +2318,33 @@ bool __blk_end_request_cur(struct request *rq, int error)
2243{ 2318{
2244 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); 2319 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2245} 2320}
2246EXPORT_SYMBOL_GPL(__blk_end_request_cur); 2321EXPORT_SYMBOL(__blk_end_request_cur);
2322
2323/**
2324 * __blk_end_request_err - Finish a request till the next failure boundary.
2325 * @rq: the request to finish till the next failure boundary for
2326 * @error: must be negative errno
2327 *
2328 * Description:
2329 * Complete @rq till the next failure boundary. Must be called
2330 * with queue lock held.
2331 *
2332 * Return:
2333 * %false - we are done with this request
2334 * %true - still buffers pending for this request
2335 */
2336bool __blk_end_request_err(struct request *rq, int error)
2337{
2338 WARN_ON(error >= 0);
2339 return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
2340}
2341EXPORT_SYMBOL_GPL(__blk_end_request_err);
2247 2342
2248void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 2343void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2249 struct bio *bio) 2344 struct bio *bio)
2250{ 2345{
2251 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and 2346 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2252 we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */ 2347 rq->cmd_flags |= bio->bi_rw & REQ_RW;
2253 rq->cmd_flags |= (bio->bi_rw & 3);
2254 2348
2255 if (bio_has_data(bio)) { 2349 if (bio_has_data(bio)) {
2256 rq->nr_phys_segments = bio_phys_segments(q, bio); 2350 rq->nr_phys_segments = bio_phys_segments(q, bio);
@@ -2365,7 +2459,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2365 __bio_clone(bio, bio_src); 2459 __bio_clone(bio, bio_src);
2366 2460
2367 if (bio_integrity(bio_src) && 2461 if (bio_integrity(bio_src) &&
2368 bio_integrity_clone(bio, bio_src, gfp_mask)) 2462 bio_integrity_clone(bio, bio_src, gfp_mask, bs))
2369 goto free_and_out; 2463 goto free_and_out;
2370 2464
2371 if (bio_ctr && bio_ctr(bio, bio_src, data)) 2465 if (bio_ctr && bio_ctr(bio, bio_src, data))
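blk_rq_err_bytes() and the new blk_end_request_err()/__blk_end_request_err() helpers let a driver fail only the leading part of a mixed-merged request, up to the point where the failfast attributes change. The following is a rough sketch of a completion path using them; example_end_request and the retry policy around it are illustrative assumptions, not code from an in-tree driver:

#include <linux/blkdev.h>

static void example_end_request(struct request *rq, int error)
{
	bool pending;

	if (error < 0)
		/*
		 * Complete only the bytes whose bios carry the leading
		 * bio's failfast bits; the remainder stays on the request.
		 */
		pending = blk_end_request_err(rq, error);
	else
		pending = blk_end_request(rq, 0, blk_rq_bytes(rq));

	if (pending) {
		/* bios beyond the failure boundary remain: retry or requeue them */
	}
}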
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 73e28d35568..15c630813b1 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -379,6 +379,7 @@ void blk_integrity_unregister(struct gendisk *disk)
379 379
380 kobject_uevent(&bi->kobj, KOBJ_REMOVE); 380 kobject_uevent(&bi->kobj, KOBJ_REMOVE);
381 kobject_del(&bi->kobj); 381 kobject_del(&bi->kobj);
382 kobject_put(&bi->kobj);
382 kmem_cache_free(integrity_cachep, bi); 383 kmem_cache_free(integrity_cachep, bi);
383 disk->integrity = NULL; 384 disk->integrity = NULL;
384} 385}
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
new file mode 100644
index 00000000000..ca564202ed7
--- /dev/null
+++ b/block/blk-iopoll.c
@@ -0,0 +1,227 @@
1/*
2 * Functions related to interrupt-poll handling in the block layer. This
3 * is similar to NAPI for network devices.
4 */
5#include <linux/kernel.h>
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/bio.h>
9#include <linux/blkdev.h>
10#include <linux/interrupt.h>
11#include <linux/cpu.h>
12#include <linux/blk-iopoll.h>
13#include <linux/delay.h>
14
15#include "blk.h"
16
17int blk_iopoll_enabled = 1;
18EXPORT_SYMBOL(blk_iopoll_enabled);
19
20static unsigned int blk_iopoll_budget __read_mostly = 256;
21
22static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
23
24/**
25 * blk_iopoll_sched - Schedule a run of the iopoll handler
26 * @iop: The parent iopoll structure
27 *
28 * Description:
29 * Add this blk_iopoll structure to the pending poll list and trigger the
30 * raise of the blk iopoll softirq. The driver must already have gotten a
 31 * successful return from blk_iopoll_sched_prep() before calling this.
32 **/
33void blk_iopoll_sched(struct blk_iopoll *iop)
34{
35 unsigned long flags;
36
37 local_irq_save(flags);
38 list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
39 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
40 local_irq_restore(flags);
41}
42EXPORT_SYMBOL(blk_iopoll_sched);
43
44/**
45 * __blk_iopoll_complete - Mark this @iop as un-polled again
46 * @iop: The parent iopoll structure
47 *
48 * Description:
49 * See blk_iopoll_complete(). This function must be called with interrupts
50 * disabled.
51 **/
52void __blk_iopoll_complete(struct blk_iopoll *iop)
53{
54 list_del(&iop->list);
55 smp_mb__before_clear_bit();
56 clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
57}
58EXPORT_SYMBOL(__blk_iopoll_complete);
59
60/**
61 * blk_iopoll_complete - Mark this @iop as un-polled again
62 * @iop: The parent iopoll structure
63 *
64 * Description:
65 * If a driver consumes less than the assigned budget in its run of the
66 * iopoll handler, it'll end the polled mode by calling this function. The
67 * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
68 * is called.
69 **/
70void blk_iopoll_complete(struct blk_iopoll *iopoll)
71{
72 unsigned long flags;
73
74 local_irq_save(flags);
75 __blk_iopoll_complete(iopoll);
76 local_irq_restore(flags);
77}
78EXPORT_SYMBOL(blk_iopoll_complete);
79
80static void blk_iopoll_softirq(struct softirq_action *h)
81{
82 struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
83 int rearm = 0, budget = blk_iopoll_budget;
84 unsigned long start_time = jiffies;
85
86 local_irq_disable();
87
88 while (!list_empty(list)) {
89 struct blk_iopoll *iop;
90 int work, weight;
91
92 /*
93 * If softirq window is exhausted then punt.
94 */
95 if (budget <= 0 || time_after(jiffies, start_time)) {
96 rearm = 1;
97 break;
98 }
99
100 local_irq_enable();
101
102 /* Even though interrupts have been re-enabled, this
103 * access is safe because interrupts can only add new
104 * entries to the tail of this list, and only ->poll()
105 * calls can remove this head entry from the list.
106 */
107 iop = list_entry(list->next, struct blk_iopoll, list);
108
109 weight = iop->weight;
110 work = 0;
111 if (test_bit(IOPOLL_F_SCHED, &iop->state))
112 work = iop->poll(iop, weight);
113
114 budget -= work;
115
116 local_irq_disable();
117
118 /*
 119 * Drivers must not modify the iopoll state if they
 120 * consume their assigned weight (or more; some drivers can't
 121 * easily just stop processing, they have to complete an
 122 * entire mask of commands). In such cases this code
123 * still "owns" the iopoll instance and therefore can
124 * move the instance around on the list at-will.
125 */
126 if (work >= weight) {
127 if (blk_iopoll_disable_pending(iop))
128 __blk_iopoll_complete(iop);
129 else
130 list_move_tail(&iop->list, list);
131 }
132 }
133
134 if (rearm)
135 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
136
137 local_irq_enable();
138}
139
140/**
141 * blk_iopoll_disable - Disable iopoll on this @iop
142 * @iop: The parent iopoll structure
143 *
144 * Description:
145 * Disable io polling and wait for any pending callbacks to have completed.
146 **/
147void blk_iopoll_disable(struct blk_iopoll *iop)
148{
149 set_bit(IOPOLL_F_DISABLE, &iop->state);
150 while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
151 msleep(1);
152 clear_bit(IOPOLL_F_DISABLE, &iop->state);
153}
154EXPORT_SYMBOL(blk_iopoll_disable);
155
156/**
157 * blk_iopoll_enable - Enable iopoll on this @iop
158 * @iop: The parent iopoll structure
159 *
160 * Description:
161 * Enable iopoll on this @iop. Note that the handler run will not be
162 * scheduled, it will only mark it as active.
163 **/
164void blk_iopoll_enable(struct blk_iopoll *iop)
165{
166 BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
167 smp_mb__before_clear_bit();
168 clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
169}
170EXPORT_SYMBOL(blk_iopoll_enable);
171
172/**
173 * blk_iopoll_init - Initialize this @iop
174 * @iop: The parent iopoll structure
175 * @weight: The default weight (or command completion budget)
176 * @poll_fn: The handler to invoke
177 *
178 * Description:
179 * Initialize this blk_iopoll structure. Before being actively used, the
180 * driver must call blk_iopoll_enable().
181 **/
182void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
183{
184 memset(iop, 0, sizeof(*iop));
185 INIT_LIST_HEAD(&iop->list);
186 iop->weight = weight;
187 iop->poll = poll_fn;
188 set_bit(IOPOLL_F_SCHED, &iop->state);
189}
190EXPORT_SYMBOL(blk_iopoll_init);
191
192static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
193 unsigned long action, void *hcpu)
194{
195 /*
196 * If a CPU goes away, splice its entries to the current CPU
197 * and trigger a run of the softirq
198 */
199 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
200 int cpu = (unsigned long) hcpu;
201
202 local_irq_disable();
203 list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
204 &__get_cpu_var(blk_cpu_iopoll));
205 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
206 local_irq_enable();
207 }
208
209 return NOTIFY_OK;
210}
211
212static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = {
213 .notifier_call = blk_iopoll_cpu_notify,
214};
215
216static __init int blk_iopoll_setup(void)
217{
218 int i;
219
220 for_each_possible_cpu(i)
221 INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
222
223 open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
224 register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
225 return 0;
226}
227subsys_initcall(blk_iopoll_setup);
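blk-iopoll follows the NAPI model: the interrupt handler masks its interrupt and schedules polling, and the softirq-run poll callback drains completions within the given budget, re-enabling interrupts once it runs out of work. A compressed sketch of the intended driver usage follows; every mydev_* name is a hypothetical placeholder and the weight of 32 is an arbitrary choice:

#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/blk-iopoll.h>

struct mydev {
	struct blk_iopoll iopoll;
	/* ... device state ... */
};

/* hypothetical device helpers */
static int mydev_process_completions(struct mydev *dev, int budget);
static void mydev_mask_irq(struct mydev *dev);
static void mydev_unmask_irq(struct mydev *dev);

/* poll callback: complete up to 'budget' commands, return how many were done */
static int mydev_iopoll(struct blk_iopoll *iop, int budget)
{
	struct mydev *dev = container_of(iop, struct mydev, iopoll);
	int done = mydev_process_completions(dev, budget);

	if (done < budget) {
		/* out of work: leave polled mode and unmask the interrupt */
		blk_iopoll_complete(iop);
		mydev_unmask_irq(dev);
	}
	return done;
}

static irqreturn_t mydev_isr(int irq, void *data)
{
	struct mydev *dev = data;

	if (blk_iopoll_enabled && blk_iopoll_sched_prep(&dev->iopoll)) {
		mydev_mask_irq(dev);
		blk_iopoll_sched(&dev->iopoll);
	} else {
		/* polling unavailable: handle completions directly in the IRQ */
		mydev_process_completions(dev, -1);
	}
	return IRQ_HANDLED;
}

static void mydev_setup_iopoll(struct mydev *dev)
{
	blk_iopoll_init(&dev->iopoll, 32, mydev_iopoll);
	blk_iopoll_enable(&dev->iopoll);
}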
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 39ce64432ba..99cb5cf1f44 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
311 return 1; 311 return 1;
312} 312}
313 313
314/**
315 * blk_rq_set_mixed_merge - mark a request as mixed merge
316 * @rq: request to mark as mixed merge
317 *
318 * Description:
319 * @rq is about to be mixed merged. Make sure the attributes
320 * which can be mixed are set in each bio and mark @rq as mixed
321 * merged.
322 */
323void blk_rq_set_mixed_merge(struct request *rq)
324{
325 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
326 struct bio *bio;
327
328 if (rq->cmd_flags & REQ_MIXED_MERGE)
329 return;
330
331 /*
332 * @rq will no longer represent mixable attributes for all the
333 * contained bios. It will just track those of the first one.
 334 * Distribute the attributes to each bio.
335 */
336 for (bio = rq->bio; bio; bio = bio->bi_next) {
337 WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
338 (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
339 bio->bi_rw |= ff;
340 }
341 rq->cmd_flags |= REQ_MIXED_MERGE;
342}
343
314static void blk_account_io_merge(struct request *req) 344static void blk_account_io_merge(struct request *req)
315{ 345{
316 if (blk_do_io_stat(req)) { 346 if (blk_do_io_stat(req)) {
@@ -321,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
321 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 351 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
322 352
323 part_round_stats(cpu, part); 353 part_round_stats(cpu, part);
324 part_dec_in_flight(part); 354 part_dec_in_flight(part, rq_data_dir(req));
325 355
326 part_stat_unlock(); 356 part_stat_unlock();
327 } 357 }
@@ -360,6 +390,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
360 return 0; 390 return 0;
361 391
362 /* 392 /*
 393 * If failfast settings disagree or either of the two is already
394 * a mixed merge, mark both as mixed before proceeding. This
395 * makes sure that all involved bios have mixable attributes
396 * set properly.
397 */
398 if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
399 (req->cmd_flags & REQ_FAILFAST_MASK) !=
400 (next->cmd_flags & REQ_FAILFAST_MASK)) {
401 blk_rq_set_mixed_merge(req);
402 blk_rq_set_mixed_merge(next);
403 }
404
405 /*
363 * At this point we have either done a back merge 406 * At this point we have either done a back merge
364 * or front merge. We need the smaller start_time of 407 * or front merge. We need the smaller start_time of
365 * the merged requests to be the current request 408 * the merged requests to be the current request
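Stated as code, the invariant that blk_rq_set_mixed_merge() establishes (and that blk_rq_err_bytes() relies on) is that, once REQ_MIXED_MERGE is set, every bio in the request carries at least the failfast bits recorded in rq->cmd_flags. The checker below is purely illustrative and not part of the patch:

#include <linux/blkdev.h>
#include <linux/bio.h>

static bool example_mixed_merge_invariant(struct request *rq)
{
	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
	struct bio *bio;

	if (!(rq->cmd_flags & REQ_MIXED_MERGE))
		return true;	/* nothing to check */

	for (bio = rq->bio; bio; bio = bio->bi_next)
		if ((bio->bi_rw & ff) != ff)
			return false;	/* a bio is missing the request's bits */

	return true;
}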
diff --git a/block/blk-settings.c b/block/blk-settings.c
index bd582a7f531..83413ff8373 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -7,6 +7,7 @@
7#include <linux/bio.h> 7#include <linux/bio.h>
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
10#include <linux/gcd.h>
10 11
11#include "blk.h" 12#include "blk.h"
12 13
@@ -165,6 +166,13 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
165 blk_set_default_limits(&q->limits); 166 blk_set_default_limits(&q->limits);
166 167
167 /* 168 /*
169 * If the caller didn't supply a lock, fall back to our embedded
170 * per-queue locks
171 */
172 if (!q->queue_lock)
173 q->queue_lock = &q->__queue_lock;
174
175 /*
168 * by default assume old behaviour and bounce for any highmem page 176 * by default assume old behaviour and bounce for any highmem page
169 */ 177 */
170 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 178 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
@@ -377,8 +385,8 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
377EXPORT_SYMBOL(blk_queue_alignment_offset); 385EXPORT_SYMBOL(blk_queue_alignment_offset);
378 386
379/** 387/**
380 * blk_queue_io_min - set minimum request size for the queue 388 * blk_limits_io_min - set minimum request size for a device
381 * @q: the request queue for the device 389 * @limits: the queue limits
382 * @min: smallest I/O size in bytes 390 * @min: smallest I/O size in bytes
383 * 391 *
384 * Description: 392 * Description:
@@ -387,30 +395,73 @@ EXPORT_SYMBOL(blk_queue_alignment_offset);
387 * smallest I/O the device can perform without incurring a performance 395 * smallest I/O the device can perform without incurring a performance
388 * penalty. 396 * penalty.
389 */ 397 */
390void blk_queue_io_min(struct request_queue *q, unsigned int min) 398void blk_limits_io_min(struct queue_limits *limits, unsigned int min)
391{ 399{
392 q->limits.io_min = min; 400 limits->io_min = min;
393 401
394 if (q->limits.io_min < q->limits.logical_block_size) 402 if (limits->io_min < limits->logical_block_size)
395 q->limits.io_min = q->limits.logical_block_size; 403 limits->io_min = limits->logical_block_size;
396 404
397 if (q->limits.io_min < q->limits.physical_block_size) 405 if (limits->io_min < limits->physical_block_size)
398 q->limits.io_min = q->limits.physical_block_size; 406 limits->io_min = limits->physical_block_size;
407}
408EXPORT_SYMBOL(blk_limits_io_min);
409
410/**
411 * blk_queue_io_min - set minimum request size for the queue
412 * @q: the request queue for the device
413 * @min: smallest I/O size in bytes
414 *
415 * Description:
416 * Storage devices may report a granularity or preferred minimum I/O
417 * size which is the smallest request the device can perform without
418 * incurring a performance penalty. For disk drives this is often the
419 * physical block size. For RAID arrays it is often the stripe chunk
420 * size. A properly aligned multiple of minimum_io_size is the
421 * preferred request size for workloads where a high number of I/O
422 * operations is desired.
423 */
424void blk_queue_io_min(struct request_queue *q, unsigned int min)
425{
426 blk_limits_io_min(&q->limits, min);
399} 427}
400EXPORT_SYMBOL(blk_queue_io_min); 428EXPORT_SYMBOL(blk_queue_io_min);
401 429
402/** 430/**
431 * blk_limits_io_opt - set optimal request size for a device
432 * @limits: the queue limits
 433 * @opt: optimal I/O size in bytes
434 *
435 * Description:
436 * Storage devices may report an optimal I/O size, which is the
437 * device's preferred unit for sustained I/O. This is rarely reported
438 * for disk drives. For RAID arrays it is usually the stripe width or
439 * the internal track size. A properly aligned multiple of
440 * optimal_io_size is the preferred request size for workloads where
441 * sustained throughput is desired.
442 */
443void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
444{
445 limits->io_opt = opt;
446}
447EXPORT_SYMBOL(blk_limits_io_opt);
448
449/**
403 * blk_queue_io_opt - set optimal request size for the queue 450 * blk_queue_io_opt - set optimal request size for the queue
404 * @q: the request queue for the device 451 * @q: the request queue for the device
405 * @opt: optimal request size in bytes 452 * @opt: optimal request size in bytes
406 * 453 *
407 * Description: 454 * Description:
408 * Drivers can call this function to set the preferred I/O request 455 * Storage devices may report an optimal I/O size, which is the
409 * size for devices that report such a value. 456 * device's preferred unit for sustained I/O. This is rarely reported
457 * for disk drives. For RAID arrays it is usually the stripe width or
458 * the internal track size. A properly aligned multiple of
459 * optimal_io_size is the preferred request size for workloads where
460 * sustained throughput is desired.
410 */ 461 */
411void blk_queue_io_opt(struct request_queue *q, unsigned int opt) 462void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
412{ 463{
413 q->limits.io_opt = opt; 464 blk_limits_io_opt(&q->limits, opt);
414} 465}
415EXPORT_SYMBOL(blk_queue_io_opt); 466EXPORT_SYMBOL(blk_queue_io_opt);
416 467
@@ -426,27 +477,7 @@ EXPORT_SYMBOL(blk_queue_io_opt);
426 **/ 477 **/
427void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 478void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
428{ 479{
429 /* zero is "infinity" */ 480 blk_stack_limits(&t->limits, &b->limits, 0);
430 t->limits.max_sectors = min_not_zero(queue_max_sectors(t),
431 queue_max_sectors(b));
432
433 t->limits.max_hw_sectors = min_not_zero(queue_max_hw_sectors(t),
434 queue_max_hw_sectors(b));
435
436 t->limits.seg_boundary_mask = min_not_zero(queue_segment_boundary(t),
437 queue_segment_boundary(b));
438
439 t->limits.max_phys_segments = min_not_zero(queue_max_phys_segments(t),
440 queue_max_phys_segments(b));
441
442 t->limits.max_hw_segments = min_not_zero(queue_max_hw_segments(t),
443 queue_max_hw_segments(b));
444
445 t->limits.max_segment_size = min_not_zero(queue_max_segment_size(t),
446 queue_max_segment_size(b));
447
448 t->limits.logical_block_size = max(queue_logical_block_size(t),
449 queue_logical_block_size(b));
450 481
451 if (!t->queue_lock) 482 if (!t->queue_lock)
452 WARN_ON_ONCE(1); 483 WARN_ON_ONCE(1);
@@ -516,6 +547,16 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
516 return -1; 547 return -1;
517 } 548 }
518 549
550 /* Find lcm() of optimal I/O size */
551 if (t->io_opt && b->io_opt)
552 t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt);
553 else if (b->io_opt)
554 t->io_opt = b->io_opt;
555
556 /* Verify that optimal I/O size is a multiple of io_min */
557 if (t->io_min && t->io_opt % t->io_min)
558 return -1;
559
519 return 0; 560 return 0;
520} 561}
521EXPORT_SYMBOL(blk_stack_limits); 562EXPORT_SYMBOL(blk_stack_limits);
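blk_limits_io_min() and blk_limits_io_opt() let stacking drivers describe a struct queue_limits before the request queue exists, and blk_stack_limits() now combines io_opt values by least common multiple (hence the new linux/gcd.h include). A sketch of how an MD/DM-style target might fill these in; chunk_bytes and stripe_bytes are hypothetical parameters of the example, not fields from the patch:

#include <linux/blkdev.h>

static void example_set_raid_limits(struct queue_limits *limits,
				    unsigned int chunk_bytes,
				    unsigned int stripe_bytes)
{
	/* smallest I/O that avoids a read-modify-write cycle: one chunk */
	blk_limits_io_min(limits, chunk_bytes);

	/* preferred unit for sustained throughput: one full stripe */
	blk_limits_io_opt(limits, stripe_bytes);
}

When two such devices are stacked, the combined io_opt becomes lcm(a, b) = a * b / gcd(a, b); for example, stacking a 192 KiB stripe over a 256 KiB stripe yields a 768 KiB optimal I/O size, and blk_stack_limits() then verifies that the result is still a multiple of io_min.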
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b1cd04087d6..b78c9c3e267 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -16,9 +16,9 @@ struct queue_sysfs_entry {
16}; 16};
17 17
18static ssize_t 18static ssize_t
19queue_var_show(unsigned int var, char *page) 19queue_var_show(unsigned long var, char *page)
20{ 20{
21 return sprintf(page, "%d\n", var); 21 return sprintf(page, "%lu\n", var);
22} 22}
23 23
24static ssize_t 24static ssize_t
@@ -40,7 +40,12 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
40{ 40{
41 struct request_list *rl = &q->rq; 41 struct request_list *rl = &q->rq;
42 unsigned long nr; 42 unsigned long nr;
43 int ret = queue_var_store(&nr, page, count); 43 int ret;
44
45 if (!q->request_fn)
46 return -EINVAL;
47
48 ret = queue_var_store(&nr, page, count);
44 if (nr < BLKDEV_MIN_RQ) 49 if (nr < BLKDEV_MIN_RQ)
45 nr = BLKDEV_MIN_RQ; 50 nr = BLKDEV_MIN_RQ;
46 51
@@ -77,7 +82,8 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
77 82
78static ssize_t queue_ra_show(struct request_queue *q, char *page) 83static ssize_t queue_ra_show(struct request_queue *q, char *page)
79{ 84{
80 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 85 unsigned long ra_kb = q->backing_dev_info.ra_pages <<
86 (PAGE_CACHE_SHIFT - 10);
81 87
82 return queue_var_show(ra_kb, (page)); 88 return queue_var_show(ra_kb, (page));
83} 89}
@@ -132,7 +138,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
132 return -EINVAL; 138 return -EINVAL;
133 139
134 spin_lock_irq(q->queue_lock); 140 spin_lock_irq(q->queue_lock);
135 blk_queue_max_sectors(q, max_sectors_kb << 1); 141 q->limits.max_sectors = max_sectors_kb << 1;
136 spin_unlock_irq(q->queue_lock); 142 spin_unlock_irq(q->queue_lock);
137 143
138 return ret; 144 return ret;
@@ -189,9 +195,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
189 195
190static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) 196static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
191{ 197{
192 unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); 198 bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
193 199
194 return queue_var_show(set != 0, page); 200 return queue_var_show(set, page);
195} 201}
196 202
197static ssize_t 203static ssize_t
diff --git a/block/blk.h b/block/blk.h
index 3fae6add543..5ee3d7e72fe 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
104int attempt_back_merge(struct request_queue *q, struct request *rq); 104int attempt_back_merge(struct request_queue *q, struct request *rq);
105int attempt_front_merge(struct request_queue *q, struct request *rq); 105int attempt_front_merge(struct request_queue *q, struct request *rq);
106void blk_recalc_rq_segments(struct request *rq); 106void blk_recalc_rq_segments(struct request *rq);
107void blk_rq_set_mixed_merge(struct request *rq);
107 108
108void blk_queue_congestion_threshold(struct request_queue *q); 109void blk_queue_congestion_threshold(struct request_queue *q);
109 110
diff --git a/block/bsg.c b/block/bsg.c
index e7d47525424..0676301f16d 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -186,7 +186,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
186 return -EFAULT; 186 return -EFAULT;
187 187
188 if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { 188 if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
189 if (blk_verify_command(&q->cmd_filter, rq->cmd, has_write_perm)) 189 if (blk_verify_command(rq->cmd, has_write_perm))
190 return -EPERM; 190 return -EPERM;
191 } else if (!capable(CAP_SYS_RAWIO)) 191 } else if (!capable(CAP_SYS_RAWIO))
192 return -EPERM; 192 return -EPERM;
@@ -1062,7 +1062,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue);
1062 1062
1063static struct cdev bsg_cdev; 1063static struct cdev bsg_cdev;
1064 1064
1065static char *bsg_nodename(struct device *dev) 1065static char *bsg_devnode(struct device *dev, mode_t *mode)
1066{ 1066{
1067 return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); 1067 return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
1068} 1068}
@@ -1087,7 +1087,7 @@ static int __init bsg_init(void)
1087 ret = PTR_ERR(bsg_class); 1087 ret = PTR_ERR(bsg_class);
1088 goto destroy_kmemcache; 1088 goto destroy_kmemcache;
1089 } 1089 }
1090 bsg_class->nodename = bsg_nodename; 1090 bsg_class->devnode = bsg_devnode;
1091 1091
1092 ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); 1092 ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
1093 if (ret) 1093 if (ret)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 833ec18eaa6..1ca813b16e7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125;
48static struct kmem_cache *cfq_pool; 48static struct kmem_cache *cfq_pool;
49static struct kmem_cache *cfq_ioc_pool; 49static struct kmem_cache *cfq_ioc_pool;
50 50
51static DEFINE_PER_CPU(unsigned long, ioc_count); 51static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
52static struct completion *ioc_gone; 52static struct completion *ioc_gone;
53static DEFINE_SPINLOCK(ioc_gone_lock); 53static DEFINE_SPINLOCK(ioc_gone_lock);
54 54
@@ -71,6 +71,51 @@ struct cfq_rb_root {
71#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } 71#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, }
72 72
73/* 73/*
74 * Per process-grouping structure
75 */
76struct cfq_queue {
77 /* reference count */
78 atomic_t ref;
79 /* various state flags, see below */
80 unsigned int flags;
81 /* parent cfq_data */
82 struct cfq_data *cfqd;
83 /* service_tree member */
84 struct rb_node rb_node;
85 /* service_tree key */
86 unsigned long rb_key;
87 /* prio tree member */
88 struct rb_node p_node;
89 /* prio tree root we belong to, if any */
90 struct rb_root *p_root;
91 /* sorted list of pending requests */
92 struct rb_root sort_list;
93 /* if fifo isn't expired, next request to serve */
94 struct request *next_rq;
95 /* requests queued in sort_list */
96 int queued[2];
97 /* currently allocated requests */
98 int allocated[2];
99 /* fifo list of requests in sort_list */
100 struct list_head fifo;
101
102 unsigned long slice_end;
103 long slice_resid;
104 unsigned int slice_dispatch;
105
106 /* pending metadata requests */
107 int meta_pending;
108 /* number of requests that are on the dispatch list or inside driver */
109 int dispatched;
110
111 /* io prio of this group */
112 unsigned short ioprio, org_ioprio;
113 unsigned short ioprio_class, org_ioprio_class;
114
115 pid_t pid;
116};
117
118/*
74 * Per block device queue structure 119 * Per block device queue structure
75 */ 120 */
76struct cfq_data { 121struct cfq_data {
@@ -89,13 +134,8 @@ struct cfq_data {
89 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 134 struct rb_root prio_trees[CFQ_PRIO_LISTS];
90 135
91 unsigned int busy_queues; 136 unsigned int busy_queues;
92 /*
93 * Used to track any pending rt requests so we can pre-empt current
94 * non-RT cfqq in service when this value is non-zero.
95 */
96 unsigned int busy_rt_queues;
97 137
98 int rq_in_driver; 138 int rq_in_driver[2];
99 int sync_flight; 139 int sync_flight;
100 140
101 /* 141 /*
@@ -135,58 +175,17 @@ struct cfq_data {
135 unsigned int cfq_slice_idle; 175 unsigned int cfq_slice_idle;
136 176
137 struct list_head cic_list; 177 struct list_head cic_list;
138};
139
140/*
141 * Per process-grouping structure
142 */
143struct cfq_queue {
144 /* reference count */
145 atomic_t ref;
146 /* various state flags, see below */
147 unsigned int flags;
148 /* parent cfq_data */
149 struct cfq_data *cfqd;
150 /* service_tree member */
151 struct rb_node rb_node;
152 /* service_tree key */
153 unsigned long rb_key;
154 /* prio tree member */
155 struct rb_node p_node;
156 /* prio tree root we belong to, if any */
157 struct rb_root *p_root;
158 /* sorted list of pending requests */
159 struct rb_root sort_list;
160 /* if fifo isn't expired, next request to serve */
161 struct request *next_rq;
162 /* requests queued in sort_list */
163 int queued[2];
164 /* currently allocated requests */
165 int allocated[2];
166 /* fifo list of requests in sort_list */
167 struct list_head fifo;
168
169 unsigned long slice_end;
170 long slice_resid;
171 unsigned int slice_dispatch;
172 178
173 /* pending metadata requests */ 179 /*
174 int meta_pending; 180 * Fallback dummy cfqq for extreme OOM conditions
175 /* number of requests that are on the dispatch list or inside driver */ 181 */
176 int dispatched; 182 struct cfq_queue oom_cfqq;
177
178 /* io prio of this group */
179 unsigned short ioprio, org_ioprio;
180 unsigned short ioprio_class, org_ioprio_class;
181
182 pid_t pid;
183}; 183};
184 184
185enum cfqq_state_flags { 185enum cfqq_state_flags {
186 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 186 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
187 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 187 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
188 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 188 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
189 CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
190 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 189 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
191 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 190 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
192 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 191 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
@@ -213,7 +212,6 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
213CFQ_CFQQ_FNS(on_rr); 212CFQ_CFQQ_FNS(on_rr);
214CFQ_CFQQ_FNS(wait_request); 213CFQ_CFQQ_FNS(wait_request);
215CFQ_CFQQ_FNS(must_dispatch); 214CFQ_CFQQ_FNS(must_dispatch);
216CFQ_CFQQ_FNS(must_alloc);
217CFQ_CFQQ_FNS(must_alloc_slice); 215CFQ_CFQQ_FNS(must_alloc_slice);
218CFQ_CFQQ_FNS(fifo_expire); 216CFQ_CFQQ_FNS(fifo_expire);
219CFQ_CFQQ_FNS(idle_window); 217CFQ_CFQQ_FNS(idle_window);
@@ -234,6 +232,11 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
234static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 232static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
235 struct io_context *); 233 struct io_context *);
236 234
235static inline int rq_in_driver(struct cfq_data *cfqd)
236{
237 return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
238}
239
237static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 240static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
238 int is_sync) 241 int is_sync)
239{ 242{
@@ -252,7 +255,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
252 */ 255 */
253static inline int cfq_bio_sync(struct bio *bio) 256static inline int cfq_bio_sync(struct bio *bio)
254{ 257{
255 if (bio_data_dir(bio) == READ || bio_sync(bio)) 258 if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
256 return 1; 259 return 1;
257 260
258 return 0; 261 return 0;
@@ -643,8 +646,6 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
643 BUG_ON(cfq_cfqq_on_rr(cfqq)); 646 BUG_ON(cfq_cfqq_on_rr(cfqq));
644 cfq_mark_cfqq_on_rr(cfqq); 647 cfq_mark_cfqq_on_rr(cfqq);
645 cfqd->busy_queues++; 648 cfqd->busy_queues++;
646 if (cfq_class_rt(cfqq))
647 cfqd->busy_rt_queues++;
648 649
649 cfq_resort_rr_list(cfqd, cfqq); 650 cfq_resort_rr_list(cfqd, cfqq);
650} 651}
@@ -668,8 +669,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
668 669
669 BUG_ON(!cfqd->busy_queues); 670 BUG_ON(!cfqd->busy_queues);
670 cfqd->busy_queues--; 671 cfqd->busy_queues--;
671 if (cfq_class_rt(cfqq))
672 cfqd->busy_rt_queues--;
673} 672}
674 673
675/* 674/*
@@ -755,9 +754,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
755{ 754{
756 struct cfq_data *cfqd = q->elevator->elevator_data; 755 struct cfq_data *cfqd = q->elevator->elevator_data;
757 756
758 cfqd->rq_in_driver++; 757 cfqd->rq_in_driver[rq_is_sync(rq)]++;
759 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 758 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
760 cfqd->rq_in_driver); 759 rq_in_driver(cfqd));
761 760
762 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 761 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
763} 762}
@@ -765,11 +764,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
765static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 764static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
766{ 765{
767 struct cfq_data *cfqd = q->elevator->elevator_data; 766 struct cfq_data *cfqd = q->elevator->elevator_data;
767 const int sync = rq_is_sync(rq);
768 768
769 WARN_ON(!cfqd->rq_in_driver); 769 WARN_ON(!cfqd->rq_in_driver[sync]);
770 cfqd->rq_in_driver--; 770 cfqd->rq_in_driver[sync]--;
771 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 771 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
772 cfqd->rq_in_driver); 772 rq_in_driver(cfqd));
773} 773}
774 774
775static void cfq_remove_request(struct request *rq) 775static void cfq_remove_request(struct request *rq)
@@ -1075,7 +1075,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1075 /* 1075 /*
1076 * still requests with the driver, don't idle 1076 * still requests with the driver, don't idle
1077 */ 1077 */
1078 if (cfqd->rq_in_driver) 1078 if (rq_in_driver(cfqd))
1079 return; 1079 return;
1080 1080
1081 /* 1081 /*
@@ -1110,6 +1110,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1110 1110
1111 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1111 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1112 1112
1113 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1113 cfq_remove_request(rq); 1114 cfq_remove_request(rq);
1114 cfqq->dispatched++; 1115 cfqq->dispatched++;
1115 elv_dispatch_sort(q, rq); 1116 elv_dispatch_sort(q, rq);
@@ -1174,20 +1175,6 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1174 goto expire; 1175 goto expire;
1175 1176
1176 /* 1177 /*
1177 * If we have a RT cfqq waiting, then we pre-empt the current non-rt
1178 * cfqq.
1179 */
1180 if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
1181 /*
1182 * We simulate this as cfqq timed out so that it gets to bank
1183 * the remaining of its time slice.
1184 */
1185 cfq_log_cfqq(cfqd, cfqq, "preempt");
1186 cfq_slice_expired(cfqd, 1);
1187 goto new_queue;
1188 }
1189
1190 /*
1191 * The active queue has requests and isn't expired, allow it to 1178 * The active queue has requests and isn't expired, allow it to
1192 * dispatch. 1179 * dispatch.
1193 */ 1180 */
@@ -1307,6 +1294,12 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
1307 return 0; 1294 return 0;
1308 1295
1309 /* 1296 /*
1297 * Drain async requests before we start sync IO
1298 */
1299 if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
1300 return 0;
1301
1302 /*
1310 * If this is an async queue and we have sync IO in flight, let it wait 1303 * If this is an async queue and we have sync IO in flight, let it wait
1311 */ 1304 */
1312 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) 1305 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
@@ -1357,7 +1350,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
1357 cfq_slice_expired(cfqd, 0); 1350 cfq_slice_expired(cfqd, 0);
1358 } 1351 }
1359 1352
1360 cfq_log(cfqd, "dispatched a request"); 1353 cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
1361 return 1; 1354 return 1;
1362} 1355}
1363 1356
@@ -1422,7 +1415,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
1422 cic = container_of(head, struct cfq_io_context, rcu_head); 1415 cic = container_of(head, struct cfq_io_context, rcu_head);
1423 1416
1424 kmem_cache_free(cfq_ioc_pool, cic); 1417 kmem_cache_free(cfq_ioc_pool, cic);
1425 elv_ioc_count_dec(ioc_count); 1418 elv_ioc_count_dec(cfq_ioc_count);
1426 1419
1427 if (ioc_gone) { 1420 if (ioc_gone) {
1428 /* 1421 /*
@@ -1431,7 +1424,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
1431 * complete ioc_gone and set it back to NULL 1424 * complete ioc_gone and set it back to NULL
1432 */ 1425 */
1433 spin_lock(&ioc_gone_lock); 1426 spin_lock(&ioc_gone_lock);
1434 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 1427 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
1435 complete(ioc_gone); 1428 complete(ioc_gone);
1436 ioc_gone = NULL; 1429 ioc_gone = NULL;
1437 } 1430 }
@@ -1557,7 +1550,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1557 INIT_HLIST_NODE(&cic->cic_list); 1550 INIT_HLIST_NODE(&cic->cic_list);
1558 cic->dtor = cfq_free_io_context; 1551 cic->dtor = cfq_free_io_context;
1559 cic->exit = cfq_exit_io_context; 1552 cic->exit = cfq_exit_io_context;
1560 elv_ioc_count_inc(ioc_count); 1553 elv_ioc_count_inc(cfq_ioc_count);
1561 } 1554 }
1562 1555
1563 return cic; 1556 return cic;
@@ -1641,6 +1634,26 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
1641 ioc->ioprio_changed = 0; 1634 ioc->ioprio_changed = 0;
1642} 1635}
1643 1636
1637static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1638 pid_t pid, int is_sync)
1639{
1640 RB_CLEAR_NODE(&cfqq->rb_node);
1641 RB_CLEAR_NODE(&cfqq->p_node);
1642 INIT_LIST_HEAD(&cfqq->fifo);
1643
1644 atomic_set(&cfqq->ref, 0);
1645 cfqq->cfqd = cfqd;
1646
1647 cfq_mark_cfqq_prio_changed(cfqq);
1648
1649 if (is_sync) {
1650 if (!cfq_class_idle(cfqq))
1651 cfq_mark_cfqq_idle_window(cfqq);
1652 cfq_mark_cfqq_sync(cfqq);
1653 }
1654 cfqq->pid = pid;
1655}
1656
1644static struct cfq_queue * 1657static struct cfq_queue *
1645cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, 1658cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync,
1646 struct io_context *ioc, gfp_t gfp_mask) 1659 struct io_context *ioc, gfp_t gfp_mask)
@@ -1653,56 +1666,40 @@ retry:
1653 /* cic always exists here */ 1666 /* cic always exists here */
1654 cfqq = cic_to_cfqq(cic, is_sync); 1667 cfqq = cic_to_cfqq(cic, is_sync);
1655 1668
1656 if (!cfqq) { 1669 /*
1670 * Always try a new alloc if we fell back to the OOM cfqq
1671 * originally, since it should just be a temporary situation.
1672 */
1673 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
1674 cfqq = NULL;
1657 if (new_cfqq) { 1675 if (new_cfqq) {
1658 cfqq = new_cfqq; 1676 cfqq = new_cfqq;
1659 new_cfqq = NULL; 1677 new_cfqq = NULL;
1660 } else if (gfp_mask & __GFP_WAIT) { 1678 } else if (gfp_mask & __GFP_WAIT) {
1661 /*
1662 * Inform the allocator of the fact that we will
1663 * just repeat this allocation if it fails, to allow
1664 * the allocator to do whatever it needs to attempt to
1665 * free memory.
1666 */
1667 spin_unlock_irq(cfqd->queue->queue_lock); 1679 spin_unlock_irq(cfqd->queue->queue_lock);
1668 new_cfqq = kmem_cache_alloc_node(cfq_pool, 1680 new_cfqq = kmem_cache_alloc_node(cfq_pool,
1669 gfp_mask | __GFP_NOFAIL | __GFP_ZERO, 1681 gfp_mask | __GFP_ZERO,
1670 cfqd->queue->node); 1682 cfqd->queue->node);
1671 spin_lock_irq(cfqd->queue->queue_lock); 1683 spin_lock_irq(cfqd->queue->queue_lock);
1672 goto retry; 1684 if (new_cfqq)
1685 goto retry;
1673 } else { 1686 } else {
1674 cfqq = kmem_cache_alloc_node(cfq_pool, 1687 cfqq = kmem_cache_alloc_node(cfq_pool,
1675 gfp_mask | __GFP_ZERO, 1688 gfp_mask | __GFP_ZERO,
1676 cfqd->queue->node); 1689 cfqd->queue->node);
1677 if (!cfqq)
1678 goto out;
1679 } 1690 }
1680 1691
1681 RB_CLEAR_NODE(&cfqq->rb_node); 1692 if (cfqq) {
1682 RB_CLEAR_NODE(&cfqq->p_node); 1693 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
1683 INIT_LIST_HEAD(&cfqq->fifo); 1694 cfq_init_prio_data(cfqq, ioc);
1684 1695 cfq_log_cfqq(cfqd, cfqq, "alloced");
1685 atomic_set(&cfqq->ref, 0); 1696 } else
1686 cfqq->cfqd = cfqd; 1697 cfqq = &cfqd->oom_cfqq;
1687
1688 cfq_mark_cfqq_prio_changed(cfqq);
1689
1690 cfq_init_prio_data(cfqq, ioc);
1691
1692 if (is_sync) {
1693 if (!cfq_class_idle(cfqq))
1694 cfq_mark_cfqq_idle_window(cfqq);
1695 cfq_mark_cfqq_sync(cfqq);
1696 }
1697 cfqq->pid = current->pid;
1698 cfq_log_cfqq(cfqd, cfqq, "alloced");
1699 } 1698 }
1700 1699
1701 if (new_cfqq) 1700 if (new_cfqq)
1702 kmem_cache_free(cfq_pool, new_cfqq); 1701 kmem_cache_free(cfq_pool, new_cfqq);
1703 1702
1704out:
1705 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1706 return cfqq; 1703 return cfqq;
1707} 1704}
1708 1705
@@ -1735,11 +1732,8 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc,
1735 cfqq = *async_cfqq; 1732 cfqq = *async_cfqq;
1736 } 1733 }
1737 1734
1738 if (!cfqq) { 1735 if (!cfqq)
1739 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 1736 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
1740 if (!cfqq)
1741 return NULL;
1742 }
1743 1737
1744 /* 1738 /*
1745 * pin the queue now that it's allocated, scheduler exit will prune it 1739 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -2124,11 +2118,11 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2124 */ 2118 */
2125static void cfq_update_hw_tag(struct cfq_data *cfqd) 2119static void cfq_update_hw_tag(struct cfq_data *cfqd)
2126{ 2120{
2127 if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak) 2121 if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
2128 cfqd->rq_in_driver_peak = cfqd->rq_in_driver; 2122 cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
2129 2123
2130 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 2124 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
2131 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) 2125 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
2132 return; 2126 return;
2133 2127
2134 if (cfqd->hw_tag_samples++ < 50) 2128 if (cfqd->hw_tag_samples++ < 50)
@@ -2155,9 +2149,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2155 2149
2156 cfq_update_hw_tag(cfqd); 2150 cfq_update_hw_tag(cfqd);
2157 2151
2158 WARN_ON(!cfqd->rq_in_driver); 2152 WARN_ON(!cfqd->rq_in_driver[sync]);
2159 WARN_ON(!cfqq->dispatched); 2153 WARN_ON(!cfqq->dispatched);
2160 cfqd->rq_in_driver--; 2154 cfqd->rq_in_driver[sync]--;
2161 cfqq->dispatched--; 2155 cfqq->dispatched--;
2162 2156
2163 if (cfq_cfqq_sync(cfqq)) 2157 if (cfq_cfqq_sync(cfqq))
@@ -2191,7 +2185,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2191 cfq_arm_slice_timer(cfqd); 2185 cfq_arm_slice_timer(cfqd);
2192 } 2186 }
2193 2187
2194 if (!cfqd->rq_in_driver) 2188 if (!rq_in_driver(cfqd))
2195 cfq_schedule_dispatch(cfqd); 2189 cfq_schedule_dispatch(cfqd);
2196} 2190}
2197 2191
@@ -2223,8 +2217,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
2223 2217
2224static inline int __cfq_may_queue(struct cfq_queue *cfqq) 2218static inline int __cfq_may_queue(struct cfq_queue *cfqq)
2225{ 2219{
2226 if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && 2220 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
2227 !cfq_cfqq_must_alloc_slice(cfqq)) {
2228 cfq_mark_cfqq_must_alloc_slice(cfqq); 2221 cfq_mark_cfqq_must_alloc_slice(cfqq);
2229 return ELV_MQUEUE_MUST; 2222 return ELV_MQUEUE_MUST;
2230 } 2223 }
@@ -2305,17 +2298,12 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
2305 goto queue_fail; 2298 goto queue_fail;
2306 2299
2307 cfqq = cic_to_cfqq(cic, is_sync); 2300 cfqq = cic_to_cfqq(cic, is_sync);
2308 if (!cfqq) { 2301 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2309 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 2302 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
2310
2311 if (!cfqq)
2312 goto queue_fail;
2313
2314 cic_set_cfqq(cic, cfqq, is_sync); 2303 cic_set_cfqq(cic, cfqq, is_sync);
2315 } 2304 }
2316 2305
2317 cfqq->allocated[rw]++; 2306 cfqq->allocated[rw]++;
2318 cfq_clear_cfqq_must_alloc(cfqq);
2319 atomic_inc(&cfqq->ref); 2307 atomic_inc(&cfqq->ref);
2320 2308
2321 spin_unlock_irqrestore(q->queue_lock, flags); 2309 spin_unlock_irqrestore(q->queue_lock, flags);
@@ -2465,6 +2453,14 @@ static void *cfq_init_queue(struct request_queue *q)
2465 for (i = 0; i < CFQ_PRIO_LISTS; i++) 2453 for (i = 0; i < CFQ_PRIO_LISTS; i++)
2466 cfqd->prio_trees[i] = RB_ROOT; 2454 cfqd->prio_trees[i] = RB_ROOT;
2467 2455
2456 /*
2457 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
2458 * Grab a permanent reference to it, so that the normal code flow
2459 * will not attempt to free it.
2460 */
2461 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
2462 atomic_inc(&cfqd->oom_cfqq.ref);
2463
2468 INIT_LIST_HEAD(&cfqd->cic_list); 2464 INIT_LIST_HEAD(&cfqd->cic_list);
2469 2465
2470 cfqd->queue = q; 2466 cfqd->queue = q;
@@ -2658,7 +2654,7 @@ static void __exit cfq_exit(void)
2658 * this also protects us from entering cfq_slab_kill() with 2654 * this also protects us from entering cfq_slab_kill() with
2659 * pending RCU callbacks 2655 * pending RCU callbacks
2660 */ 2656 */
2661 if (elv_ioc_count_read(ioc_count)) 2657 if (elv_ioc_count_read(cfq_ioc_count))
2662 wait_for_completion(&all_gone); 2658 wait_for_completion(&all_gone);
2663 cfq_slab_kill(); 2659 cfq_slab_kill();
2664} 2660}
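
The cfq-iosched changes above split rq_in_driver into per-sync/async counters and add a permanently pinned oom_cfqq, so cfq_find_alloc_queue() never has to hand back NULL when an allocation fails. A minimal userspace sketch of that fallback-object pattern follows; it is an illustration only, not kernel code, and every name in it (queue, get_queue, oom_queue) is invented for the example.

/*
 * Illustration only: a statically allocated, never-freed fallback object
 * returned instead of NULL on allocation failure, mirroring the oom_cfqq
 * idea above with plain malloc().
 */
#include <stdio.h>
#include <stdlib.h>

struct queue {
	int refcount;
	const char *name;
};

/* Permanent fallback: initialised once, reference pinned, never freed. */
static struct queue oom_queue = { .refcount = 1, .name = "oom" };

static struct queue *get_queue(int simulate_oom)
{
	struct queue *q = simulate_oom ? NULL : malloc(sizeof(*q));

	if (!q)
		return &oom_queue;	/* callers never see NULL */

	q->refcount = 1;
	q->name = "regular";
	return q;
}

static void put_queue(struct queue *q)
{
	/* The fallback keeps its pinned reference and is never freed. */
	if (q == &oom_queue)
		return;
	if (--q->refcount == 0)
		free(q);
}

int main(void)
{
	struct queue *a = get_queue(0);
	struct queue *b = get_queue(1);

	printf("%s %s\n", a->name, b->name);	/* prints "regular oom" */
	put_queue(a);
	put_queue(b);
	return 0;
}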
diff --git a/block/cmd-filter.c b/block/cmd-filter.c
deleted file mode 100644
index 572bbc2f900..00000000000
--- a/block/cmd-filter.c
+++ /dev/null
@@ -1,233 +0,0 @@
1/*
2 * Copyright 2004 Peter M. Jones <pjones@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 *
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public Licens
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 *
18 */
19
20#include <linux/list.h>
21#include <linux/genhd.h>
22#include <linux/spinlock.h>
23#include <linux/capability.h>
24#include <linux/bitops.h>
25#include <linux/blkdev.h>
26
27#include <scsi/scsi.h>
28#include <linux/cdrom.h>
29
30int blk_verify_command(struct blk_cmd_filter *filter,
31 unsigned char *cmd, fmode_t has_write_perm)
32{
33 /* root can do any command. */
34 if (capable(CAP_SYS_RAWIO))
35 return 0;
36
37 /* if there's no filter set, assume we're filtering everything out */
38 if (!filter)
39 return -EPERM;
40
41 /* Anybody who can open the device can do a read-safe command */
42 if (test_bit(cmd[0], filter->read_ok))
43 return 0;
44
45 /* Write-safe commands require a writable open */
46 if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
47 return 0;
48
49 return -EPERM;
50}
51EXPORT_SYMBOL(blk_verify_command);
52
53#if 0
54/* and now, the sysfs stuff */
55static ssize_t rcf_cmds_show(struct blk_cmd_filter *filter, char *page,
56 int rw)
57{
58 char *npage = page;
59 unsigned long *okbits;
60 int i;
61
62 if (rw == READ)
63 okbits = filter->read_ok;
64 else
65 okbits = filter->write_ok;
66
67 for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) {
68 if (test_bit(i, okbits)) {
69 npage += sprintf(npage, "0x%02x", i);
70 if (i < BLK_SCSI_MAX_CMDS - 1)
71 sprintf(npage++, " ");
72 }
73 }
74
75 if (npage != page)
76 npage += sprintf(npage, "\n");
77
78 return npage - page;
79}
80
81static ssize_t rcf_readcmds_show(struct blk_cmd_filter *filter, char *page)
82{
83 return rcf_cmds_show(filter, page, READ);
84}
85
86static ssize_t rcf_writecmds_show(struct blk_cmd_filter *filter,
87 char *page)
88{
89 return rcf_cmds_show(filter, page, WRITE);
90}
91
92static ssize_t rcf_cmds_store(struct blk_cmd_filter *filter,
93 const char *page, size_t count, int rw)
94{
95 unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits;
96 int cmd, set;
97 char *p, *status;
98
99 if (rw == READ) {
100 memcpy(&okbits, filter->read_ok, sizeof(okbits));
101 target_okbits = filter->read_ok;
102 } else {
103 memcpy(&okbits, filter->write_ok, sizeof(okbits));
104 target_okbits = filter->write_ok;
105 }
106
107 while ((p = strsep((char **)&page, " ")) != NULL) {
108 set = 1;
109
110 if (p[0] == '+') {
111 p++;
112 } else if (p[0] == '-') {
113 set = 0;
114 p++;
115 }
116
117 cmd = simple_strtol(p, &status, 16);
118
119 /* either of these cases means invalid input, so do nothing. */
120 if ((status == p) || cmd >= BLK_SCSI_MAX_CMDS)
121 return -EINVAL;
122
123 if (set)
124 __set_bit(cmd, okbits);
125 else
126 __clear_bit(cmd, okbits);
127 }
128
129 memcpy(target_okbits, okbits, sizeof(okbits));
130 return count;
131}
132
133static ssize_t rcf_readcmds_store(struct blk_cmd_filter *filter,
134 const char *page, size_t count)
135{
136 return rcf_cmds_store(filter, page, count, READ);
137}
138
139static ssize_t rcf_writecmds_store(struct blk_cmd_filter *filter,
140 const char *page, size_t count)
141{
142 return rcf_cmds_store(filter, page, count, WRITE);
143}
144
145struct rcf_sysfs_entry {
146 struct attribute attr;
147 ssize_t (*show)(struct blk_cmd_filter *, char *);
148 ssize_t (*store)(struct blk_cmd_filter *, const char *, size_t);
149};
150
151static struct rcf_sysfs_entry rcf_readcmds_entry = {
152 .attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR },
153 .show = rcf_readcmds_show,
154 .store = rcf_readcmds_store,
155};
156
157static struct rcf_sysfs_entry rcf_writecmds_entry = {
158 .attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR },
159 .show = rcf_writecmds_show,
160 .store = rcf_writecmds_store,
161};
162
163static struct attribute *default_attrs[] = {
164 &rcf_readcmds_entry.attr,
165 &rcf_writecmds_entry.attr,
166 NULL,
167};
168
169#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr)
170
171static ssize_t
172rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
173{
174 struct rcf_sysfs_entry *entry = to_rcf(attr);
175 struct blk_cmd_filter *filter;
176
177 filter = container_of(kobj, struct blk_cmd_filter, kobj);
178 if (entry->show)
179 return entry->show(filter, page);
180
181 return 0;
182}
183
184static ssize_t
185rcf_attr_store(struct kobject *kobj, struct attribute *attr,
186 const char *page, size_t length)
187{
188 struct rcf_sysfs_entry *entry = to_rcf(attr);
189 struct blk_cmd_filter *filter;
190
191 if (!capable(CAP_SYS_RAWIO))
192 return -EPERM;
193
194 if (!entry->store)
195 return -EINVAL;
196
197 filter = container_of(kobj, struct blk_cmd_filter, kobj);
198 return entry->store(filter, page, length);
199}
200
201static struct sysfs_ops rcf_sysfs_ops = {
202 .show = rcf_attr_show,
203 .store = rcf_attr_store,
204};
205
206static struct kobj_type rcf_ktype = {
207 .sysfs_ops = &rcf_sysfs_ops,
208 .default_attrs = default_attrs,
209};
210
211int blk_register_filter(struct gendisk *disk)
212{
213 int ret;
214 struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
215
216 ret = kobject_init_and_add(&filter->kobj, &rcf_ktype,
217 &disk_to_dev(disk)->kobj,
218 "%s", "cmd_filter");
219 if (ret < 0)
220 return ret;
221
222 return 0;
223}
224EXPORT_SYMBOL(blk_register_filter);
225
226void blk_unregister_filter(struct gendisk *disk)
227{
228 struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
229
230 kobject_put(&filter->kobj);
231}
232EXPORT_SYMBOL(blk_unregister_filter);
233#endif
diff --git a/block/elevator.c b/block/elevator.c
index ca861927ba4..1975b619c86 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
79 /* 79 /*
80 * Don't merge file system requests and discard requests 80 * Don't merge file system requests and discard requests
81 */ 81 */
82 if (bio_discard(bio) != bio_discard(rq->bio)) 82 if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
83 bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
83 return 0; 84 return 0;
84 85
85 /* 86 /*
diff --git a/block/genhd.c b/block/genhd.c
index f4c64c2b303..517e4332cb3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
869static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); 869static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
870static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); 870static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
871static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 871static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
872static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
872#ifdef CONFIG_FAIL_MAKE_REQUEST 873#ifdef CONFIG_FAIL_MAKE_REQUEST
873static struct device_attribute dev_attr_fail = 874static struct device_attribute dev_attr_fail =
874 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); 875 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = {
888 &dev_attr_alignment_offset.attr, 889 &dev_attr_alignment_offset.attr,
889 &dev_attr_capability.attr, 890 &dev_attr_capability.attr,
890 &dev_attr_stat.attr, 891 &dev_attr_stat.attr,
892 &dev_attr_inflight.attr,
891#ifdef CONFIG_FAIL_MAKE_REQUEST 893#ifdef CONFIG_FAIL_MAKE_REQUEST
892 &dev_attr_fail.attr, 894 &dev_attr_fail.attr,
893#endif 895#endif
@@ -901,7 +903,7 @@ static struct attribute_group disk_attr_group = {
901 .attrs = disk_attrs, 903 .attrs = disk_attrs,
902}; 904};
903 905
904static struct attribute_group *disk_attr_groups[] = { 906static const struct attribute_group *disk_attr_groups[] = {
905 &disk_attr_group, 907 &disk_attr_group,
906 NULL 908 NULL
907}; 909};
@@ -996,12 +998,12 @@ struct class block_class = {
996 .name = "block", 998 .name = "block",
997}; 999};
998 1000
999static char *block_nodename(struct device *dev) 1001static char *block_devnode(struct device *dev, mode_t *mode)
1000{ 1002{
1001 struct gendisk *disk = dev_to_disk(dev); 1003 struct gendisk *disk = dev_to_disk(dev);
1002 1004
1003 if (disk->nodename) 1005 if (disk->devnode)
1004 return disk->nodename(disk); 1006 return disk->devnode(disk, mode);
1005 return NULL; 1007 return NULL;
1006} 1008}
1007 1009
@@ -1009,7 +1011,7 @@ static struct device_type disk_type = {
1009 .name = "disk", 1011 .name = "disk",
1010 .groups = disk_attr_groups, 1012 .groups = disk_attr_groups,
1011 .release = disk_release, 1013 .release = disk_release,
1012 .nodename = block_nodename, 1014 .devnode = block_devnode,
1013}; 1015};
1014 1016
1015#ifdef CONFIG_PROC_FS 1017#ifdef CONFIG_PROC_FS
@@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1053 part_stat_read(hd, merges[1]), 1055 part_stat_read(hd, merges[1]),
1054 (unsigned long long)part_stat_read(hd, sectors[1]), 1056 (unsigned long long)part_stat_read(hd, sectors[1]),
1055 jiffies_to_msecs(part_stat_read(hd, ticks[1])), 1057 jiffies_to_msecs(part_stat_read(hd, ticks[1])),
1056 hd->in_flight, 1058 part_in_flight(hd),
1057 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1059 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1058 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 1060 jiffies_to_msecs(part_stat_read(hd, time_in_queue))
1059 ); 1061 );
@@ -1215,6 +1217,16 @@ void put_disk(struct gendisk *disk)
1215 1217
1216EXPORT_SYMBOL(put_disk); 1218EXPORT_SYMBOL(put_disk);
1217 1219
1220static void set_disk_ro_uevent(struct gendisk *gd, int ro)
1221{
1222 char event[] = "DISK_RO=1";
1223 char *envp[] = { event, NULL };
1224
1225 if (!ro)
1226 event[8] = '0';
1227 kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1228}
1229
1218void set_device_ro(struct block_device *bdev, int flag) 1230void set_device_ro(struct block_device *bdev, int flag)
1219{ 1231{
1220 bdev->bd_part->policy = flag; 1232 bdev->bd_part->policy = flag;
@@ -1227,8 +1239,12 @@ void set_disk_ro(struct gendisk *disk, int flag)
1227 struct disk_part_iter piter; 1239 struct disk_part_iter piter;
1228 struct hd_struct *part; 1240 struct hd_struct *part;
1229 1241
1230 disk_part_iter_init(&piter, disk, 1242 if (disk->part0.policy != flag) {
1231 DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); 1243 set_disk_ro_uevent(disk, flag);
1244 disk->part0.policy = flag;
1245 }
1246
1247 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
1232 while ((part = disk_part_iter_next(&piter))) 1248 while ((part = disk_part_iter_next(&piter)))
1233 part->policy = flag; 1249 part->policy = flag;
1234 disk_part_iter_exit(&piter); 1250 disk_part_iter_exit(&piter);
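
The genhd.c hunks above export a new per-disk "inflight" attribute and switch diskstats over to part_in_flight(). A minimal sketch of reading that attribute from userspace; it assumes a disk named sda, and the exact value format can differ between kernel versions, so the file contents are printed as-is.

/* Minimal sketch: dump the new per-disk inflight counter from sysfs. */
#include <stdio.h>

int main(void)
{
	char buf[128];
	FILE *f = fopen("/sys/block/sda/inflight", "r");

	if (!f) {
		perror("open inflight");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("in-flight requests: %s", buf);
	fclose(f);
	return 0;
}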
diff --git a/block/ioctl.c b/block/ioctl.c
index 500e4c73cc5..d3e6b5827a3 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev)
112 return res; 112 return res;
113} 113}
114 114
115static void blk_ioc_discard_endio(struct bio *bio, int err)
116{
117 if (err) {
118 if (err == -EOPNOTSUPP)
119 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
120 clear_bit(BIO_UPTODATE, &bio->bi_flags);
121 }
122 complete(bio->bi_private);
123}
124
125static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, 115static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
126 uint64_t len) 116 uint64_t len)
127{ 117{
128 struct request_queue *q = bdev_get_queue(bdev);
129 int ret = 0;
130
131 if (start & 511) 118 if (start & 511)
132 return -EINVAL; 119 return -EINVAL;
133 if (len & 511) 120 if (len & 511)
@@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
137 124
138 if (start + len > (bdev->bd_inode->i_size >> 9)) 125 if (start + len > (bdev->bd_inode->i_size >> 9))
139 return -EINVAL; 126 return -EINVAL;
140 127 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
141 if (!q->prepare_discard_fn) 128 DISCARD_FL_WAIT);
142 return -EOPNOTSUPP;
143
144 while (len && !ret) {
145 DECLARE_COMPLETION_ONSTACK(wait);
146 struct bio *bio;
147
148 bio = bio_alloc(GFP_KERNEL, 0);
149
150 bio->bi_end_io = blk_ioc_discard_endio;
151 bio->bi_bdev = bdev;
152 bio->bi_private = &wait;
153 bio->bi_sector = start;
154
155 if (len > queue_max_hw_sectors(q)) {
156 bio->bi_size = queue_max_hw_sectors(q) << 9;
157 len -= queue_max_hw_sectors(q);
158 start += queue_max_hw_sectors(q);
159 } else {
160 bio->bi_size = len << 9;
161 len = 0;
162 }
163 submit_bio(DISCARD_NOBARRIER, bio);
164
165 wait_for_completion(&wait);
166
167 if (bio_flagged(bio, BIO_EOPNOTSUPP))
168 ret = -EOPNOTSUPP;
169 else if (!bio_flagged(bio, BIO_UPTODATE))
170 ret = -EIO;
171 bio_put(bio);
172 }
173 return ret;
174} 129}
175 130
176static int put_ushort(unsigned long arg, unsigned short val) 131static int put_ushort(unsigned long arg, unsigned short val)
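
With the open-coded bio loop removed above, BLKDISCARD is now serviced by blkdev_issue_discard() with DISCARD_FL_WAIT. A minimal sketch of the userspace side of that ioctl follows; the device path is a placeholder, the call is destructive (it discards data), and per the checks above the byte offset and length must be 512-aligned.

/*
 * Minimal sketch: issue BLKDISCARD from userspace. DESTRUCTIVE on the
 * target device; /dev/your-test-device is a placeholder path.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* BLKDISCARD */

int main(void)
{
	uint64_t range[2] = { 0, 1024 * 1024 };	/* start, length in bytes */
	int fd = open("/dev/your-test-device", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKDISCARD, range) < 0)
		perror("BLKDISCARD");	/* EOPNOTSUPP if discard unsupported */
	close(fd);
	return 0;
}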
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 5f8e798ede4..e5b10017a50 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -32,6 +32,11 @@
32#include <scsi/scsi_ioctl.h> 32#include <scsi/scsi_ioctl.h>
33#include <scsi/scsi_cmnd.h> 33#include <scsi/scsi_cmnd.h>
34 34
35struct blk_cmd_filter {
36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
38} blk_default_cmd_filter;
39
35/* Command group 3 is reserved and should never be used. */ 40/* Command group 3 is reserved and should never be used. */
36const unsigned char scsi_command_size_tbl[8] = 41const unsigned char scsi_command_size_tbl[8] =
37{ 42{
@@ -105,7 +110,7 @@ static int sg_emulated_host(struct request_queue *q, int __user *p)
105 return put_user(1, p); 110 return put_user(1, p);
106} 111}
107 112
108void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) 113static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
109{ 114{
110 /* Basic read-only commands */ 115 /* Basic read-only commands */
111 __set_bit(TEST_UNIT_READY, filter->read_ok); 116 __set_bit(TEST_UNIT_READY, filter->read_ok);
@@ -187,14 +192,37 @@ void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
187 __set_bit(GPCMD_SET_STREAMING, filter->write_ok); 192 __set_bit(GPCMD_SET_STREAMING, filter->write_ok);
188 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); 193 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
189} 194}
190EXPORT_SYMBOL_GPL(blk_set_cmd_filter_defaults); 195
196int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
197{
198 struct blk_cmd_filter *filter = &blk_default_cmd_filter;
199
200 /* root can do any command. */
201 if (capable(CAP_SYS_RAWIO))
202 return 0;
203
204 /* if there's no filter set, assume we're filtering everything out */
205 if (!filter)
206 return -EPERM;
207
208 /* Anybody who can open the device can do a read-safe command */
209 if (test_bit(cmd[0], filter->read_ok))
210 return 0;
211
212 /* Write-safe commands require a writable open */
213 if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
214 return 0;
215
216 return -EPERM;
217}
218EXPORT_SYMBOL(blk_verify_command);
191 219
192static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, 220static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
193 struct sg_io_hdr *hdr, fmode_t mode) 221 struct sg_io_hdr *hdr, fmode_t mode)
194{ 222{
195 if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) 223 if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
196 return -EFAULT; 224 return -EFAULT;
197 if (blk_verify_command(&q->cmd_filter, rq->cmd, mode & FMODE_WRITE)) 225 if (blk_verify_command(rq->cmd, mode & FMODE_WRITE))
198 return -EPERM; 226 return -EPERM;
199 227
200 /* 228 /*
@@ -427,7 +455,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
427 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) 455 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
428 goto error; 456 goto error;
429 457
430 err = blk_verify_command(&q->cmd_filter, rq->cmd, mode & FMODE_WRITE); 458 err = blk_verify_command(rq->cmd, mode & FMODE_WRITE);
431 if (err) 459 if (err)
432 goto error; 460 goto error;
433 461
@@ -645,5 +673,11 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
645 blk_put_queue(q); 673 blk_put_queue(q);
646 return err; 674 return err;
647} 675}
648
649EXPORT_SYMBOL(scsi_cmd_ioctl); 676EXPORT_SYMBOL(scsi_cmd_ioctl);
677
678int __init blk_scsi_ioctl_init(void)
679{
680 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
681 return 0;
682}
683fs_initcall(blk_scsi_ioctl_init);
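
The scsi_ioctl.c changes replace the per-queue command filter with a single default table consulted by blk_verify_command(). A minimal sketch of a command that table permits for any opener: TEST UNIT READY (opcode 0x00, in the read_ok set) sent via SG_IO. The device path is a placeholder; an opcode outside the table would return EPERM from the same path for a non-CAP_SYS_RAWIO caller.

/* Minimal sketch: send TEST UNIT READY through SG_IO on a block device. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/sg.h>

int main(void)
{
	unsigned char cdb[6] = { 0x00, 0, 0, 0, 0, 0 };	/* TEST UNIT READY */
	unsigned char sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/sda", O_RDONLY);	/* placeholder device */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_direction = SG_DXFER_NONE;	/* no data transfer */
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.timeout = 5000;			/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");	/* EPERM if the command filter rejects it */
	else
		printf("scsi status=0x%x\n", hdr.status);
	close(fd);
	return 0;
}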