Diffstat (limited to 'block')
-rw-r--r--  block/Makefile             |   4
-rw-r--r--  block/as-iosched.c         |  14
-rw-r--r--  block/blk-barrier.c        |  72
-rw-r--r--  block/blk-core.c           | 605
-rw-r--r--  block/blk-exec.c           |   6
-rw-r--r--  block/blk-integrity.c      |  33
-rw-r--r--  block/blk-map.c            |  68
-rw-r--r--  block/blk-merge.c          | 129
-rw-r--r--  block/blk-settings.c       |  43
-rw-r--r--  block/blk-softirq.c        | 175
-rw-r--r--  block/blk-sysfs.c          |  35
-rw-r--r--  block/blk-tag.c            |  22
-rw-r--r--  block/blk-timeout.c        | 238
-rw-r--r--  block/blk.h                |  48
-rw-r--r--  block/blktrace.c           |  32
-rw-r--r--  block/bsg.c                |   6
-rw-r--r--  block/cfq-iosched.c        |  57
-rw-r--r--  block/cmd-filter.c         |   9
-rw-r--r--  block/compat_ioctl.c       |   1
-rw-r--r--  block/deadline-iosched.c   |  40
-rw-r--r--  block/elevator.c           |  40
-rw-r--r--  block/genhd.c              | 965
-rw-r--r--  block/ioctl.c              | 124
-rw-r--r--  block/scsi_ioctl.c         |   8
24 files changed, 1894 insertions, 880 deletions
diff --git a/block/Makefile b/block/Makefile
index 208000b0750d..bfe73049f939 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -4,8 +4,8 @@
4 4
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 cmd-filter.o 8 ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 11obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
index cf4eb0eefbbf..71f0abb219ee 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -462,7 +462,7 @@ static void as_antic_stop(struct as_data *ad)
462 del_timer(&ad->antic_timer); 462 del_timer(&ad->antic_timer);
463 ad->antic_status = ANTIC_FINISHED; 463 ad->antic_status = ANTIC_FINISHED;
464 /* see as_work_handler */ 464 /* see as_work_handler */
465 kblockd_schedule_work(&ad->antic_work); 465 kblockd_schedule_work(ad->q, &ad->antic_work);
466 } 466 }
467} 467}
468 468
@@ -483,7 +483,7 @@ static void as_antic_timeout(unsigned long data)
483 aic = ad->io_context->aic; 483 aic = ad->io_context->aic;
484 484
485 ad->antic_status = ANTIC_FINISHED; 485 ad->antic_status = ANTIC_FINISHED;
486 kblockd_schedule_work(&ad->antic_work); 486 kblockd_schedule_work(q, &ad->antic_work);
487 487
488 if (aic->ttime_samples == 0) { 488 if (aic->ttime_samples == 0) {
489 /* process anticipated on has exited or timed out*/ 489 /* process anticipated on has exited or timed out*/
@@ -745,6 +745,14 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
745 */ 745 */
746static int as_can_anticipate(struct as_data *ad, struct request *rq) 746static int as_can_anticipate(struct as_data *ad, struct request *rq)
747{ 747{
748#if 0 /* disable for now, we need to check tag level as well */
749 /*
750 * SSD device without seek penalty, disable idling
751 */
752 if (blk_queue_nonrot(ad->q)) axman
753 return 0;
754#endif
755
748 if (!ad->io_context) 756 if (!ad->io_context)
749 /* 757 /*
750 * Last request submitted was a write 758 * Last request submitted was a write
@@ -844,7 +852,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
844 if (ad->changed_batch && ad->nr_dispatched == 1) { 852 if (ad->changed_batch && ad->nr_dispatched == 1) {
845 ad->current_batch_expires = jiffies + 853 ad->current_batch_expires = jiffies +
846 ad->batch_expire[ad->batch_data_dir]; 854 ad->batch_expire[ad->batch_data_dir];
847 kblockd_schedule_work(&ad->antic_work); 855 kblockd_schedule_work(q, &ad->antic_work);
848 ad->changed_batch = 0; 856 ad->changed_batch = 0;
849 857
850 if (ad->batch_data_dir == REQ_SYNC) 858 if (ad->batch_data_dir == REQ_SYNC)
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index a09ead19f9c5..5c99ff8d2db8 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -293,7 +293,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
293 bio->bi_end_io = bio_end_empty_barrier; 293 bio->bi_end_io = bio_end_empty_barrier;
294 bio->bi_private = &wait; 294 bio->bi_private = &wait;
295 bio->bi_bdev = bdev; 295 bio->bi_bdev = bdev;
296 submit_bio(1 << BIO_RW_BARRIER, bio); 296 submit_bio(WRITE_BARRIER, bio);
297 297
298 wait_for_completion(&wait); 298 wait_for_completion(&wait);
299 299
@@ -315,3 +315,73 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
315 return ret; 315 return ret;
316} 316}
317EXPORT_SYMBOL(blkdev_issue_flush); 317EXPORT_SYMBOL(blkdev_issue_flush);
318
319static void blkdev_discard_end_io(struct bio *bio, int err)
320{
321 if (err) {
322 if (err == -EOPNOTSUPP)
323 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
324 clear_bit(BIO_UPTODATE, &bio->bi_flags);
325 }
326
327 bio_put(bio);
328}
329
330/**
331 * blkdev_issue_discard - queue a discard
332 * @bdev: blockdev to issue discard for
333 * @sector: start sector
334 * @nr_sects: number of sectors to discard
335 * @gfp_mask: memory allocation flags (for bio_alloc)
336 *
337 * Description:
338 * Issue a discard request for the sectors in question. Does not wait.
339 */
340int blkdev_issue_discard(struct block_device *bdev,
341 sector_t sector, sector_t nr_sects, gfp_t gfp_mask)
342{
343 struct request_queue *q;
344 struct bio *bio;
345 int ret = 0;
346
347 if (bdev->bd_disk == NULL)
348 return -ENXIO;
349
350 q = bdev_get_queue(bdev);
351 if (!q)
352 return -ENXIO;
353
354 if (!q->prepare_discard_fn)
355 return -EOPNOTSUPP;
356
357 while (nr_sects && !ret) {
358 bio = bio_alloc(gfp_mask, 0);
359 if (!bio)
360 return -ENOMEM;
361
362 bio->bi_end_io = blkdev_discard_end_io;
363 bio->bi_bdev = bdev;
364
365 bio->bi_sector = sector;
366
367 if (nr_sects > q->max_hw_sectors) {
368 bio->bi_size = q->max_hw_sectors << 9;
369 nr_sects -= q->max_hw_sectors;
370 sector += q->max_hw_sectors;
371 } else {
372 bio->bi_size = nr_sects << 9;
373 nr_sects = 0;
374 }
375 bio_get(bio);
376 submit_bio(DISCARD_BARRIER, bio);
377
378 /* Check if it failed immediately */
379 if (bio_flagged(bio, BIO_EOPNOTSUPP))
380 ret = -EOPNOTSUPP;
381 else if (!bio_flagged(bio, BIO_UPTODATE))
382 ret = -EIO;
383 bio_put(bio);
384 }
385 return ret;
386}
387EXPORT_SYMBOL(blkdev_issue_discard);
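blkdev_issue_discard(), added above, is the call a filesystem makes when a freed range no longer holds useful data. A minimal sketch of a caller follows; the helper name is made up, and -EOPNOTSUPP just means the queue has no prepare_discard_fn, so it is usually not treated as a failure.

        /* Sketch: discard a freed extent (hypothetical helper name). */
        static void fs_discard_extent(struct block_device *bdev,
                                      sector_t start, sector_t nr_sects)
        {
                int ret = blkdev_issue_discard(bdev, start, nr_sects,
                                               GFP_KERNEL);

                if (ret && ret != -EOPNOTSUPP)
                        printk(KERN_WARNING "discard %llu+%llu failed: %d\n",
                               (unsigned long long)start,
                               (unsigned long long)nr_sects, ret);
        }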
diff --git a/block/blk-core.c b/block/blk-core.c
index 2cba5ef97b2b..2d053b584410 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -26,8 +26,6 @@
26#include <linux/swap.h> 26#include <linux/swap.h>
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/task_io_accounting_ops.h> 28#include <linux/task_io_accounting_ops.h>
29#include <linux/interrupt.h>
30#include <linux/cpu.h>
31#include <linux/blktrace_api.h> 29#include <linux/blktrace_api.h>
32#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
33 31
@@ -50,27 +48,26 @@ struct kmem_cache *blk_requestq_cachep;
50 */ 48 */
51static struct workqueue_struct *kblockd_workqueue; 49static struct workqueue_struct *kblockd_workqueue;
52 50
53static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
54
55static void drive_stat_acct(struct request *rq, int new_io) 51static void drive_stat_acct(struct request *rq, int new_io)
56{ 52{
57 struct hd_struct *part; 53 struct hd_struct *part;
58 int rw = rq_data_dir(rq); 54 int rw = rq_data_dir(rq);
55 int cpu;
59 56
60 if (!blk_fs_request(rq) || !rq->rq_disk) 57 if (!blk_fs_request(rq) || !rq->rq_disk)
61 return; 58 return;
62 59
63 part = get_part(rq->rq_disk, rq->sector); 60 cpu = part_stat_lock();
61 part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
62
64 if (!new_io) 63 if (!new_io)
65 __all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector); 64 part_stat_inc(cpu, part, merges[rw]);
66 else { 65 else {
67 disk_round_stats(rq->rq_disk); 66 part_round_stats(cpu, part);
68 rq->rq_disk->in_flight++; 67 part_inc_in_flight(part);
69 if (part) {
70 part_round_stats(part);
71 part->in_flight++;
72 }
73 } 68 }
69
70 part_stat_unlock();
74} 71}
75 72
76void blk_queue_congestion_threshold(struct request_queue *q) 73void blk_queue_congestion_threshold(struct request_queue *q)
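drive_stat_acct() above shows the new per-CPU, per-partition accounting API that replaces get_part()/__all_stat_inc(): the partition is looked up by sector under part_stat_lock()/part_stat_unlock(), which also covers the RCU-protected partition lookup, and counters are bumped with the part_stat_* helpers. A condensed sketch of the pattern, with a made-up function name:

        /* Sketch of the accounting pattern used above (made-up name). */
        static void account_one_io(struct gendisk *disk, sector_t sector, int rw)
        {
                struct hd_struct *part;
                int cpu;

                cpu = part_stat_lock();         /* per-cpu + RCU protection */
                part = disk_map_sector_rcu(disk, sector);
                part_stat_inc(cpu, part, ios[rw]);
                part_round_stats(cpu, part);
                part_stat_unlock();
        }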
@@ -113,7 +110,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
113 memset(rq, 0, sizeof(*rq)); 110 memset(rq, 0, sizeof(*rq));
114 111
115 INIT_LIST_HEAD(&rq->queuelist); 112 INIT_LIST_HEAD(&rq->queuelist);
116 INIT_LIST_HEAD(&rq->donelist); 113 INIT_LIST_HEAD(&rq->timeout_list);
114 rq->cpu = -1;
117 rq->q = q; 115 rq->q = q;
118 rq->sector = rq->hard_sector = (sector_t) -1; 116 rq->sector = rq->hard_sector = (sector_t) -1;
119 INIT_HLIST_NODE(&rq->hash); 117 INIT_HLIST_NODE(&rq->hash);
@@ -308,7 +306,7 @@ void blk_unplug_timeout(unsigned long data)
308 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, 306 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
309 q->rq.count[READ] + q->rq.count[WRITE]); 307 q->rq.count[READ] + q->rq.count[WRITE]);
310 308
311 kblockd_schedule_work(&q->unplug_work); 309 kblockd_schedule_work(q, &q->unplug_work);
312} 310}
313 311
314void blk_unplug(struct request_queue *q) 312void blk_unplug(struct request_queue *q)
@@ -325,6 +323,21 @@ void blk_unplug(struct request_queue *q)
325} 323}
326EXPORT_SYMBOL(blk_unplug); 324EXPORT_SYMBOL(blk_unplug);
327 325
326static void blk_invoke_request_fn(struct request_queue *q)
327{
328 /*
329 * one level of recursion is ok and is much faster than kicking
330 * the unplug handling
331 */
332 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
333 q->request_fn(q);
334 queue_flag_clear(QUEUE_FLAG_REENTER, q);
335 } else {
336 queue_flag_set(QUEUE_FLAG_PLUGGED, q);
337 kblockd_schedule_work(q, &q->unplug_work);
338 }
339}
340
328/** 341/**
329 * blk_start_queue - restart a previously stopped queue 342 * blk_start_queue - restart a previously stopped queue
330 * @q: The &struct request_queue in question 343 * @q: The &struct request_queue in question
@@ -339,18 +352,7 @@ void blk_start_queue(struct request_queue *q)
339 WARN_ON(!irqs_disabled()); 352 WARN_ON(!irqs_disabled());
340 353
341 queue_flag_clear(QUEUE_FLAG_STOPPED, q); 354 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
342 355 blk_invoke_request_fn(q);
343 /*
344 * one level of recursion is ok and is much faster than kicking
345 * the unplug handling
346 */
347 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
348 q->request_fn(q);
349 queue_flag_clear(QUEUE_FLAG_REENTER, q);
350 } else {
351 blk_plug_device(q);
352 kblockd_schedule_work(&q->unplug_work);
353 }
354} 356}
355EXPORT_SYMBOL(blk_start_queue); 357EXPORT_SYMBOL(blk_start_queue);
356 358
@@ -408,15 +410,8 @@ void __blk_run_queue(struct request_queue *q)
408 * Only recurse once to avoid overrunning the stack, let the unplug 410 * Only recurse once to avoid overrunning the stack, let the unplug
409 * handling reinvoke the handler shortly if we already got there. 411 * handling reinvoke the handler shortly if we already got there.
410 */ 412 */
411 if (!elv_queue_empty(q)) { 413 if (!elv_queue_empty(q))
412 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { 414 blk_invoke_request_fn(q);
413 q->request_fn(q);
414 queue_flag_clear(QUEUE_FLAG_REENTER, q);
415 } else {
416 blk_plug_device(q);
417 kblockd_schedule_work(&q->unplug_work);
418 }
419 }
420} 415}
421EXPORT_SYMBOL(__blk_run_queue); 416EXPORT_SYMBOL(__blk_run_queue);
422 417
@@ -441,6 +436,14 @@ void blk_put_queue(struct request_queue *q)
441 436
442void blk_cleanup_queue(struct request_queue *q) 437void blk_cleanup_queue(struct request_queue *q)
443{ 438{
439 /*
440 * We know we have process context here, so we can be a little
441 * cautious and ensure that pending block actions on this device
442 * are done before moving on. Going into this function, we should
443 * not have processes doing IO to this device.
444 */
445 blk_sync_queue(q);
446
444 mutex_lock(&q->sysfs_lock); 447 mutex_lock(&q->sysfs_lock);
445 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 448 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
446 mutex_unlock(&q->sysfs_lock); 449 mutex_unlock(&q->sysfs_lock);
@@ -496,6 +499,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
496 } 499 }
497 500
498 init_timer(&q->unplug_timer); 501 init_timer(&q->unplug_timer);
502 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
503 INIT_LIST_HEAD(&q->timeout_list);
499 504
500 kobject_init(&q->kobj, &blk_queue_ktype); 505 kobject_init(&q->kobj, &blk_queue_ktype);
501 506
@@ -531,7 +536,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
531 * request queue; this lock will be taken also from interrupt context, so irq 536 * request queue; this lock will be taken also from interrupt context, so irq
532 * disabling is needed for it. 537 * disabling is needed for it.
533 * 538 *
534 * Function returns a pointer to the initialized request queue, or NULL if 539 * Function returns a pointer to the initialized request queue, or %NULL if
535 * it didn't succeed. 540 * it didn't succeed.
536 * 541 *
537 * Note: 542 * Note:
@@ -569,7 +574,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
569 q->request_fn = rfn; 574 q->request_fn = rfn;
570 q->prep_rq_fn = NULL; 575 q->prep_rq_fn = NULL;
571 q->unplug_fn = generic_unplug_device; 576 q->unplug_fn = generic_unplug_device;
572 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 577 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER |
578 1 << QUEUE_FLAG_STACKABLE);
573 q->queue_lock = lock; 579 q->queue_lock = lock;
574 580
575 blk_queue_segment_boundary(q, 0xffffffff); 581 blk_queue_segment_boundary(q, 0xffffffff);
@@ -624,10 +630,6 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
624 630
625 blk_rq_init(q, rq); 631 blk_rq_init(q, rq);
626 632
627 /*
628 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
629 * see bio.h and blkdev.h
630 */
631 rq->cmd_flags = rw | REQ_ALLOCED; 633 rq->cmd_flags = rw | REQ_ALLOCED;
632 634
633 if (priv) { 635 if (priv) {
@@ -888,9 +890,11 @@ EXPORT_SYMBOL(blk_get_request);
888 */ 890 */
889void blk_start_queueing(struct request_queue *q) 891void blk_start_queueing(struct request_queue *q)
890{ 892{
891 if (!blk_queue_plugged(q)) 893 if (!blk_queue_plugged(q)) {
894 if (unlikely(blk_queue_stopped(q)))
895 return;
892 q->request_fn(q); 896 q->request_fn(q);
893 else 897 } else
894 __generic_unplug_device(q); 898 __generic_unplug_device(q);
895} 899}
896EXPORT_SYMBOL(blk_start_queueing); 900EXPORT_SYMBOL(blk_start_queueing);
@@ -907,6 +911,8 @@ EXPORT_SYMBOL(blk_start_queueing);
907 */ 911 */
908void blk_requeue_request(struct request_queue *q, struct request *rq) 912void blk_requeue_request(struct request_queue *q, struct request *rq)
909{ 913{
914 blk_delete_timer(rq);
915 blk_clear_rq_complete(rq);
910 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 916 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
911 917
912 if (blk_rq_tagged(rq)) 918 if (blk_rq_tagged(rq))
@@ -917,7 +923,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
917EXPORT_SYMBOL(blk_requeue_request); 923EXPORT_SYMBOL(blk_requeue_request);
918 924
919/** 925/**
920 * blk_insert_request - insert a special request in to a request queue 926 * blk_insert_request - insert a special request into a request queue
921 * @q: request queue where request should be inserted 927 * @q: request queue where request should be inserted
922 * @rq: request to be inserted 928 * @rq: request to be inserted
923 * @at_head: insert request at head or tail of queue 929 * @at_head: insert request at head or tail of queue
@@ -927,8 +933,8 @@ EXPORT_SYMBOL(blk_requeue_request);
927 * Many block devices need to execute commands asynchronously, so they don't 933 * Many block devices need to execute commands asynchronously, so they don't
928 * block the whole kernel from preemption during request execution. This is 934 * block the whole kernel from preemption during request execution. This is
929 * accomplished normally by inserting aritficial requests tagged as 935 * accomplished normally by inserting aritficial requests tagged as
930 * REQ_SPECIAL in to the corresponding request queue, and letting them be 936 * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
931 * scheduled for actual execution by the request queue. 937 * be scheduled for actual execution by the request queue.
932 * 938 *
933 * We have the option of inserting the head or the tail of the queue. 939 * We have the option of inserting the head or the tail of the queue.
934 * Typically we use the tail for new ioctls and so forth. We use the head 940 * Typically we use the tail for new ioctls and so forth. We use the head
@@ -982,8 +988,22 @@ static inline void add_request(struct request_queue *q, struct request *req)
982 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 988 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
983} 989}
984 990
985/* 991static void part_round_stats_single(int cpu, struct hd_struct *part,
986 * disk_round_stats() - Round off the performance stats on a struct 992 unsigned long now)
993{
994 if (now == part->stamp)
995 return;
996
997 if (part->in_flight) {
998 __part_stat_add(cpu, part, time_in_queue,
999 part->in_flight * (now - part->stamp));
1000 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1001 }
1002 part->stamp = now;
1003}
1004
1005/**
1006 * part_round_stats() - Round off the performance stats on a struct
987 * disk_stats. 1007 * disk_stats.
988 * 1008 *
989 * The average IO queue length and utilisation statistics are maintained 1009 * The average IO queue length and utilisation statistics are maintained
@@ -997,36 +1017,15 @@ static inline void add_request(struct request_queue *q, struct request *req)
997 * /proc/diskstats. This accounts immediately for all queue usage up to 1017 * /proc/diskstats. This accounts immediately for all queue usage up to
998 * the current jiffies and restarts the counters again. 1018 * the current jiffies and restarts the counters again.
999 */ 1019 */
1000void disk_round_stats(struct gendisk *disk) 1020void part_round_stats(int cpu, struct hd_struct *part)
1001{ 1021{
1002 unsigned long now = jiffies; 1022 unsigned long now = jiffies;
1003 1023
1004 if (now == disk->stamp) 1024 if (part->partno)
1005 return; 1025 part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
1006 1026 part_round_stats_single(cpu, part, now);
1007 if (disk->in_flight) {
1008 __disk_stat_add(disk, time_in_queue,
1009 disk->in_flight * (now - disk->stamp));
1010 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
1011 }
1012 disk->stamp = now;
1013}
1014EXPORT_SYMBOL_GPL(disk_round_stats);
1015
1016void part_round_stats(struct hd_struct *part)
1017{
1018 unsigned long now = jiffies;
1019
1020 if (now == part->stamp)
1021 return;
1022
1023 if (part->in_flight) {
1024 __part_stat_add(part, time_in_queue,
1025 part->in_flight * (now - part->stamp));
1026 __part_stat_add(part, io_ticks, (now - part->stamp));
1027 }
1028 part->stamp = now;
1029} 1027}
1028EXPORT_SYMBOL_GPL(part_round_stats);
1030 1029
1031/* 1030/*
1032 * queue lock must be held 1031 * queue lock must be held
@@ -1070,6 +1069,7 @@ EXPORT_SYMBOL(blk_put_request);
1070 1069
1071void init_request_from_bio(struct request *req, struct bio *bio) 1070void init_request_from_bio(struct request *req, struct bio *bio)
1072{ 1071{
1072 req->cpu = bio->bi_comp_cpu;
1073 req->cmd_type = REQ_TYPE_FS; 1073 req->cmd_type = REQ_TYPE_FS;
1074 1074
1075 /* 1075 /*
@@ -1081,7 +1081,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1081 /* 1081 /*
1082 * REQ_BARRIER implies no merging, but lets make it explicit 1082 * REQ_BARRIER implies no merging, but lets make it explicit
1083 */ 1083 */
1084 if (unlikely(bio_barrier(bio))) 1084 if (unlikely(bio_discard(bio))) {
1085 req->cmd_flags |= REQ_DISCARD;
1086 if (bio_barrier(bio))
1087 req->cmd_flags |= REQ_SOFTBARRIER;
1088 req->q->prepare_discard_fn(req->q, req);
1089 } else if (unlikely(bio_barrier(bio)))
1085 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 1090 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
1086 1091
1087 if (bio_sync(bio)) 1092 if (bio_sync(bio))
@@ -1099,7 +1104,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1099static int __make_request(struct request_queue *q, struct bio *bio) 1104static int __make_request(struct request_queue *q, struct bio *bio)
1100{ 1105{
1101 struct request *req; 1106 struct request *req;
1102 int el_ret, nr_sectors, barrier, err; 1107 int el_ret, nr_sectors, barrier, discard, err;
1103 const unsigned short prio = bio_prio(bio); 1108 const unsigned short prio = bio_prio(bio);
1104 const int sync = bio_sync(bio); 1109 const int sync = bio_sync(bio);
1105 int rw_flags; 1110 int rw_flags;
@@ -1114,7 +1119,14 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1114 blk_queue_bounce(q, &bio); 1119 blk_queue_bounce(q, &bio);
1115 1120
1116 barrier = bio_barrier(bio); 1121 barrier = bio_barrier(bio);
1117 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { 1122 if (unlikely(barrier) && bio_has_data(bio) &&
1123 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1124 err = -EOPNOTSUPP;
1125 goto end_io;
1126 }
1127
1128 discard = bio_discard(bio);
1129 if (unlikely(discard) && !q->prepare_discard_fn) {
1118 err = -EOPNOTSUPP; 1130 err = -EOPNOTSUPP;
1119 goto end_io; 1131 goto end_io;
1120 } 1132 }
@@ -1138,6 +1150,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1138 req->biotail = bio; 1150 req->biotail = bio;
1139 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 1151 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1140 req->ioprio = ioprio_best(req->ioprio, prio); 1152 req->ioprio = ioprio_best(req->ioprio, prio);
1153 if (!blk_rq_cpu_valid(req))
1154 req->cpu = bio->bi_comp_cpu;
1141 drive_stat_acct(req, 0); 1155 drive_stat_acct(req, 0);
1142 if (!attempt_back_merge(q, req)) 1156 if (!attempt_back_merge(q, req))
1143 elv_merged_request(q, req, el_ret); 1157 elv_merged_request(q, req, el_ret);
@@ -1165,6 +1179,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1165 req->sector = req->hard_sector = bio->bi_sector; 1179 req->sector = req->hard_sector = bio->bi_sector;
1166 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 1180 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1167 req->ioprio = ioprio_best(req->ioprio, prio); 1181 req->ioprio = ioprio_best(req->ioprio, prio);
1182 if (!blk_rq_cpu_valid(req))
1183 req->cpu = bio->bi_comp_cpu;
1168 drive_stat_acct(req, 0); 1184 drive_stat_acct(req, 0);
1169 if (!attempt_front_merge(q, req)) 1185 if (!attempt_front_merge(q, req))
1170 elv_merged_request(q, req, el_ret); 1186 elv_merged_request(q, req, el_ret);
@@ -1200,13 +1216,15 @@ get_rq:
1200 init_request_from_bio(req, bio); 1216 init_request_from_bio(req, bio);
1201 1217
1202 spin_lock_irq(q->queue_lock); 1218 spin_lock_irq(q->queue_lock);
1219 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1220 bio_flagged(bio, BIO_CPU_AFFINE))
1221 req->cpu = blk_cpu_to_group(smp_processor_id());
1203 if (elv_queue_empty(q)) 1222 if (elv_queue_empty(q))
1204 blk_plug_device(q); 1223 blk_plug_device(q);
1205 add_request(q, req); 1224 add_request(q, req);
1206out: 1225out:
1207 if (sync) 1226 if (sync)
1208 __generic_unplug_device(q); 1227 __generic_unplug_device(q);
1209
1210 spin_unlock_irq(q->queue_lock); 1228 spin_unlock_irq(q->queue_lock);
1211 return 0; 1229 return 0;
1212 1230
@@ -1260,8 +1278,9 @@ __setup("fail_make_request=", setup_fail_make_request);
1260 1278
1261static int should_fail_request(struct bio *bio) 1279static int should_fail_request(struct bio *bio)
1262{ 1280{
1263 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || 1281 struct hd_struct *part = bio->bi_bdev->bd_part;
1264 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) 1282
1283 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
1265 return should_fail(&fail_make_request, bio->bi_size); 1284 return should_fail(&fail_make_request, bio->bi_size);
1266 1285
1267 return 0; 1286 return 0;
@@ -1314,7 +1333,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1314} 1333}
1315 1334
1316/** 1335/**
1317 * generic_make_request: hand a buffer to its device driver for I/O 1336 * generic_make_request - hand a buffer to its device driver for I/O
1318 * @bio: The bio describing the location in memory and on the device. 1337 * @bio: The bio describing the location in memory and on the device.
1319 * 1338 *
1320 * generic_make_request() is used to make I/O requests of block 1339 * generic_make_request() is used to make I/O requests of block
@@ -1409,7 +1428,8 @@ end_io:
1409 1428
1410 if (bio_check_eod(bio, nr_sectors)) 1429 if (bio_check_eod(bio, nr_sectors))
1411 goto end_io; 1430 goto end_io;
1412 if (bio_empty_barrier(bio) && !q->prepare_flush_fn) { 1431 if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
1432 (bio_discard(bio) && !q->prepare_discard_fn)) {
1413 err = -EOPNOTSUPP; 1433 err = -EOPNOTSUPP;
1414 goto end_io; 1434 goto end_io;
1415 } 1435 }
@@ -1471,13 +1491,13 @@ void generic_make_request(struct bio *bio)
1471EXPORT_SYMBOL(generic_make_request); 1491EXPORT_SYMBOL(generic_make_request);
1472 1492
1473/** 1493/**
1474 * submit_bio: submit a bio to the block device layer for I/O 1494 * submit_bio - submit a bio to the block device layer for I/O
1475 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 1495 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1476 * @bio: The &struct bio which describes the I/O 1496 * @bio: The &struct bio which describes the I/O
1477 * 1497 *
1478 * submit_bio() is very similar in purpose to generic_make_request(), and 1498 * submit_bio() is very similar in purpose to generic_make_request(), and
1479 * uses that function to do most of the work. Both are fairly rough 1499 * uses that function to do most of the work. Both are fairly rough
1480 * interfaces, @bio must be presetup and ready for I/O. 1500 * interfaces; @bio must be presetup and ready for I/O.
1481 * 1501 *
1482 */ 1502 */
1483void submit_bio(int rw, struct bio *bio) 1503void submit_bio(int rw, struct bio *bio)
@@ -1490,11 +1510,7 @@ void submit_bio(int rw, struct bio *bio)
1490 * If it's a regular read/write or a barrier with data attached, 1510 * If it's a regular read/write or a barrier with data attached,
1491 * go through the normal accounting stuff before submission. 1511 * go through the normal accounting stuff before submission.
1492 */ 1512 */
1493 if (!bio_empty_barrier(bio)) { 1513 if (bio_has_data(bio)) {
1494
1495 BIO_BUG_ON(!bio->bi_size);
1496 BIO_BUG_ON(!bio->bi_io_vec);
1497
1498 if (rw & WRITE) { 1514 if (rw & WRITE) {
1499 count_vm_events(PGPGOUT, count); 1515 count_vm_events(PGPGOUT, count);
1500 } else { 1516 } else {
@@ -1517,9 +1533,90 @@ void submit_bio(int rw, struct bio *bio)
1517EXPORT_SYMBOL(submit_bio); 1533EXPORT_SYMBOL(submit_bio);
1518 1534
1519/** 1535/**
1536 * blk_rq_check_limits - Helper function to check a request for the queue limit
1537 * @q: the queue
1538 * @rq: the request being checked
1539 *
1540 * Description:
1541 * @rq may have been made based on weaker limitations of upper-level queues
1542 * in request stacking drivers, and it may violate the limitation of @q.
1543 * Since the block layer and the underlying device driver trust @rq
1544 * after it is inserted to @q, it should be checked against @q before
1545 * the insertion using this generic function.
1546 *
1547 * This function should also be useful for request stacking drivers
1548 * in some cases below, so export this fuction.
1549 * Request stacking drivers like request-based dm may change the queue
1550 * limits while requests are in the queue (e.g. dm's table swapping).
1551 * Such request stacking drivers should check those requests agaist
1552 * the new queue limits again when they dispatch those requests,
1553 * although such checkings are also done against the old queue limits
1554 * when submitting requests.
1555 */
1556int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1557{
1558 if (rq->nr_sectors > q->max_sectors ||
1559 rq->data_len > q->max_hw_sectors << 9) {
1560 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1561 return -EIO;
1562 }
1563
1564 /*
1565 * queue's settings related to segment counting like q->bounce_pfn
1566 * may differ from that of other stacking queues.
1567 * Recalculate it to check the request correctly on this queue's
1568 * limitation.
1569 */
1570 blk_recalc_rq_segments(rq);
1571 if (rq->nr_phys_segments > q->max_phys_segments ||
1572 rq->nr_phys_segments > q->max_hw_segments) {
1573 printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1574 return -EIO;
1575 }
1576
1577 return 0;
1578}
1579EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1580
1581/**
1582 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1583 * @q: the queue to submit the request
1584 * @rq: the request being queued
1585 */
1586int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1587{
1588 unsigned long flags;
1589
1590 if (blk_rq_check_limits(q, rq))
1591 return -EIO;
1592
1593#ifdef CONFIG_FAIL_MAKE_REQUEST
1594 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
1595 should_fail(&fail_make_request, blk_rq_bytes(rq)))
1596 return -EIO;
1597#endif
1598
1599 spin_lock_irqsave(q->queue_lock, flags);
1600
1601 /*
1602 * Submitting request must be dequeued before calling this function
1603 * because it will be linked to another request_queue
1604 */
1605 BUG_ON(blk_queued_rq(rq));
1606
1607 drive_stat_acct(rq, 1);
1608 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1609
1610 spin_unlock_irqrestore(q->queue_lock, flags);
1611
1612 return 0;
1613}
1614EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
1615
1616/**
1520 * __end_that_request_first - end I/O on a request 1617 * __end_that_request_first - end I/O on a request
1521 * @req: the request being processed 1618 * @req: the request being processed
1522 * @error: 0 for success, < 0 for error 1619 * @error: %0 for success, < %0 for error
1523 * @nr_bytes: number of bytes to complete 1620 * @nr_bytes: number of bytes to complete
1524 * 1621 *
1525 * Description: 1622 * Description:
@@ -1527,8 +1624,8 @@ EXPORT_SYMBOL(submit_bio);
1527 * for the next range of segments (if any) in the cluster. 1624 * for the next range of segments (if any) in the cluster.
1528 * 1625 *
1529 * Return: 1626 * Return:
1530 * 0 - we are done with this request, call end_that_request_last() 1627 * %0 - we are done with this request, call end_that_request_last()
1531 * 1 - still buffers pending for this request 1628 * %1 - still buffers pending for this request
1532 **/ 1629 **/
1533static int __end_that_request_first(struct request *req, int error, 1630static int __end_that_request_first(struct request *req, int error,
1534 int nr_bytes) 1631 int nr_bytes)
@@ -1539,7 +1636,7 @@ static int __end_that_request_first(struct request *req, int error,
1539 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); 1636 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
1540 1637
1541 /* 1638 /*
1542 * for a REQ_BLOCK_PC request, we want to carry any eventual 1639 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
1543 * sense key with us all the way through 1640 * sense key with us all the way through
1544 */ 1641 */
1545 if (!blk_pc_request(req)) 1642 if (!blk_pc_request(req))
@@ -1552,11 +1649,14 @@ static int __end_that_request_first(struct request *req, int error,
1552 } 1649 }
1553 1650
1554 if (blk_fs_request(req) && req->rq_disk) { 1651 if (blk_fs_request(req) && req->rq_disk) {
1555 struct hd_struct *part = get_part(req->rq_disk, req->sector);
1556 const int rw = rq_data_dir(req); 1652 const int rw = rq_data_dir(req);
1653 struct hd_struct *part;
1654 int cpu;
1557 1655
1558 all_stat_add(req->rq_disk, part, sectors[rw], 1656 cpu = part_stat_lock();
1559 nr_bytes >> 9, req->sector); 1657 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1658 part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9);
1659 part_stat_unlock();
1560 } 1660 }
1561 1661
1562 total_bytes = bio_nbytes = 0; 1662 total_bytes = bio_nbytes = 0;
@@ -1641,88 +1741,14 @@ static int __end_that_request_first(struct request *req, int error,
1641} 1741}
1642 1742
1643/* 1743/*
1644 * splice the completion data to a local structure and hand off to
1645 * process_completion_queue() to complete the requests
1646 */
1647static void blk_done_softirq(struct softirq_action *h)
1648{
1649 struct list_head *cpu_list, local_list;
1650
1651 local_irq_disable();
1652 cpu_list = &__get_cpu_var(blk_cpu_done);
1653 list_replace_init(cpu_list, &local_list);
1654 local_irq_enable();
1655
1656 while (!list_empty(&local_list)) {
1657 struct request *rq;
1658
1659 rq = list_entry(local_list.next, struct request, donelist);
1660 list_del_init(&rq->donelist);
1661 rq->q->softirq_done_fn(rq);
1662 }
1663}
1664
1665static int __cpuinit blk_cpu_notify(struct notifier_block *self,
1666 unsigned long action, void *hcpu)
1667{
1668 /*
1669 * If a CPU goes away, splice its entries to the current CPU
1670 * and trigger a run of the softirq
1671 */
1672 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1673 int cpu = (unsigned long) hcpu;
1674
1675 local_irq_disable();
1676 list_splice_init(&per_cpu(blk_cpu_done, cpu),
1677 &__get_cpu_var(blk_cpu_done));
1678 raise_softirq_irqoff(BLOCK_SOFTIRQ);
1679 local_irq_enable();
1680 }
1681
1682 return NOTIFY_OK;
1683}
1684
1685
1686static struct notifier_block blk_cpu_notifier __cpuinitdata = {
1687 .notifier_call = blk_cpu_notify,
1688};
1689
1690/**
1691 * blk_complete_request - end I/O on a request
1692 * @req: the request being processed
1693 *
1694 * Description:
1695 * Ends all I/O on a request. It does not handle partial completions,
1696 * unless the driver actually implements this in its completion callback
1697 * through requeueing. The actual completion happens out-of-order,
1698 * through a softirq handler. The user must have registered a completion
1699 * callback through blk_queue_softirq_done().
1700 **/
1701
1702void blk_complete_request(struct request *req)
1703{
1704 struct list_head *cpu_list;
1705 unsigned long flags;
1706
1707 BUG_ON(!req->q->softirq_done_fn);
1708
1709 local_irq_save(flags);
1710
1711 cpu_list = &__get_cpu_var(blk_cpu_done);
1712 list_add_tail(&req->donelist, cpu_list);
1713 raise_softirq_irqoff(BLOCK_SOFTIRQ);
1714
1715 local_irq_restore(flags);
1716}
1717EXPORT_SYMBOL(blk_complete_request);
1718
1719/*
1720 * queue lock must be held 1744 * queue lock must be held
1721 */ 1745 */
1722static void end_that_request_last(struct request *req, int error) 1746static void end_that_request_last(struct request *req, int error)
1723{ 1747{
1724 struct gendisk *disk = req->rq_disk; 1748 struct gendisk *disk = req->rq_disk;
1725 1749
1750 blk_delete_timer(req);
1751
1726 if (blk_rq_tagged(req)) 1752 if (blk_rq_tagged(req))
1727 blk_queue_end_tag(req->q, req); 1753 blk_queue_end_tag(req->q, req);
1728 1754
@@ -1740,16 +1766,18 @@ static void end_that_request_last(struct request *req, int error)
1740 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 1766 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
1741 unsigned long duration = jiffies - req->start_time; 1767 unsigned long duration = jiffies - req->start_time;
1742 const int rw = rq_data_dir(req); 1768 const int rw = rq_data_dir(req);
1743 struct hd_struct *part = get_part(disk, req->sector); 1769 struct hd_struct *part;
1744 1770 int cpu;
1745 __all_stat_inc(disk, part, ios[rw], req->sector); 1771
1746 __all_stat_add(disk, part, ticks[rw], duration, req->sector); 1772 cpu = part_stat_lock();
1747 disk_round_stats(disk); 1773 part = disk_map_sector_rcu(disk, req->sector);
1748 disk->in_flight--; 1774
1749 if (part) { 1775 part_stat_inc(cpu, part, ios[rw]);
1750 part_round_stats(part); 1776 part_stat_add(cpu, part, ticks[rw], duration);
1751 part->in_flight--; 1777 part_round_stats(cpu, part);
1752 } 1778 part_dec_in_flight(part);
1779
1780 part_stat_unlock();
1753 } 1781 }
1754 1782
1755 if (req->end_io) 1783 if (req->end_io)
@@ -1762,17 +1790,6 @@ static void end_that_request_last(struct request *req, int error)
1762 } 1790 }
1763} 1791}
1764 1792
1765static inline void __end_request(struct request *rq, int uptodate,
1766 unsigned int nr_bytes)
1767{
1768 int error = 0;
1769
1770 if (uptodate <= 0)
1771 error = uptodate ? uptodate : -EIO;
1772
1773 __blk_end_request(rq, error, nr_bytes);
1774}
1775
1776/** 1793/**
1777 * blk_rq_bytes - Returns bytes left to complete in the entire request 1794 * blk_rq_bytes - Returns bytes left to complete in the entire request
1778 * @rq: the request being processed 1795 * @rq: the request being processed
@@ -1803,74 +1820,57 @@ unsigned int blk_rq_cur_bytes(struct request *rq)
1803EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); 1820EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
1804 1821
1805/** 1822/**
1806 * end_queued_request - end all I/O on a queued request
1807 * @rq: the request being processed
1808 * @uptodate: error value or 0/1 uptodate flag
1809 *
1810 * Description:
1811 * Ends all I/O on a request, and removes it from the block layer queues.
1812 * Not suitable for normal IO completion, unless the driver still has
1813 * the request attached to the block layer.
1814 *
1815 **/
1816void end_queued_request(struct request *rq, int uptodate)
1817{
1818 __end_request(rq, uptodate, blk_rq_bytes(rq));
1819}
1820EXPORT_SYMBOL(end_queued_request);
1821
1822/**
1823 * end_dequeued_request - end all I/O on a dequeued request
1824 * @rq: the request being processed
1825 * @uptodate: error value or 0/1 uptodate flag
1826 *
1827 * Description:
1828 * Ends all I/O on a request. The request must already have been
1829 * dequeued using blkdev_dequeue_request(), as is normally the case
1830 * for most drivers.
1831 *
1832 **/
1833void end_dequeued_request(struct request *rq, int uptodate)
1834{
1835 __end_request(rq, uptodate, blk_rq_bytes(rq));
1836}
1837EXPORT_SYMBOL(end_dequeued_request);
1838
1839
1840/**
1841 * end_request - end I/O on the current segment of the request 1823 * end_request - end I/O on the current segment of the request
1842 * @req: the request being processed 1824 * @req: the request being processed
1843 * @uptodate: error value or 0/1 uptodate flag 1825 * @uptodate: error value or %0/%1 uptodate flag
1844 * 1826 *
1845 * Description: 1827 * Description:
1846 * Ends I/O on the current segment of a request. If that is the only 1828 * Ends I/O on the current segment of a request. If that is the only
1847 * remaining segment, the request is also completed and freed. 1829 * remaining segment, the request is also completed and freed.
1848 * 1830 *
1849 * This is a remnant of how older block drivers handled IO completions. 1831 * This is a remnant of how older block drivers handled I/O completions.
1850 * Modern drivers typically end IO on the full request in one go, unless 1832 * Modern drivers typically end I/O on the full request in one go, unless
1851 * they have a residual value to account for. For that case this function 1833 * they have a residual value to account for. For that case this function
1852 * isn't really useful, unless the residual just happens to be the 1834 * isn't really useful, unless the residual just happens to be the
1853 * full current segment. In other words, don't use this function in new 1835 * full current segment. In other words, don't use this function in new
1854 * code. Either use end_request_completely(), or the 1836 * code. Use blk_end_request() or __blk_end_request() to end a request.
1855 * end_that_request_chunk() (along with end_that_request_last()) for
1856 * partial completions.
1857 *
1858 **/ 1837 **/
1859void end_request(struct request *req, int uptodate) 1838void end_request(struct request *req, int uptodate)
1860{ 1839{
1861 __end_request(req, uptodate, req->hard_cur_sectors << 9); 1840 int error = 0;
1841
1842 if (uptodate <= 0)
1843 error = uptodate ? uptodate : -EIO;
1844
1845 __blk_end_request(req, error, req->hard_cur_sectors << 9);
1862} 1846}
1863EXPORT_SYMBOL(end_request); 1847EXPORT_SYMBOL(end_request);
1864 1848
1849static int end_that_request_data(struct request *rq, int error,
1850 unsigned int nr_bytes, unsigned int bidi_bytes)
1851{
1852 if (rq->bio) {
1853 if (__end_that_request_first(rq, error, nr_bytes))
1854 return 1;
1855
1856 /* Bidi request must be completed as a whole */
1857 if (blk_bidi_rq(rq) &&
1858 __end_that_request_first(rq->next_rq, error, bidi_bytes))
1859 return 1;
1860 }
1861
1862 return 0;
1863}
1864
1865/** 1865/**
1866 * blk_end_io - Generic end_io function to complete a request. 1866 * blk_end_io - Generic end_io function to complete a request.
1867 * @rq: the request being processed 1867 * @rq: the request being processed
1868 * @error: 0 for success, < 0 for error 1868 * @error: %0 for success, < %0 for error
1869 * @nr_bytes: number of bytes to complete @rq 1869 * @nr_bytes: number of bytes to complete @rq
1870 * @bidi_bytes: number of bytes to complete @rq->next_rq 1870 * @bidi_bytes: number of bytes to complete @rq->next_rq
1871 * @drv_callback: function called between completion of bios in the request 1871 * @drv_callback: function called between completion of bios in the request
1872 * and completion of the request. 1872 * and completion of the request.
1873 * If the callback returns non 0, this helper returns without 1873 * If the callback returns non %0, this helper returns without
1874 * completion of the request. 1874 * completion of the request.
1875 * 1875 *
1876 * Description: 1876 * Description:
@@ -1878,8 +1878,8 @@ EXPORT_SYMBOL(end_request);
1878 * If @rq has leftover, sets it up for the next range of segments. 1878 * If @rq has leftover, sets it up for the next range of segments.
1879 * 1879 *
1880 * Return: 1880 * Return:
1881 * 0 - we are done with this request 1881 * %0 - we are done with this request
1882 * 1 - this request is not freed yet, it still has pending buffers. 1882 * %1 - this request is not freed yet, it still has pending buffers.
1883 **/ 1883 **/
1884static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, 1884static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
1885 unsigned int bidi_bytes, 1885 unsigned int bidi_bytes,
@@ -1888,15 +1888,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
1888 struct request_queue *q = rq->q; 1888 struct request_queue *q = rq->q;
1889 unsigned long flags = 0UL; 1889 unsigned long flags = 0UL;
1890 1890
1891 if (blk_fs_request(rq) || blk_pc_request(rq)) { 1891 if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
1892 if (__end_that_request_first(rq, error, nr_bytes)) 1892 return 1;
1893 return 1;
1894
1895 /* Bidi request must be completed as a whole */
1896 if (blk_bidi_rq(rq) &&
1897 __end_that_request_first(rq->next_rq, error, bidi_bytes))
1898 return 1;
1899 }
1900 1893
1901 /* Special feature for tricky drivers */ 1894 /* Special feature for tricky drivers */
1902 if (drv_callback && drv_callback(rq)) 1895 if (drv_callback && drv_callback(rq))
@@ -1914,7 +1907,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
1914/** 1907/**
1915 * blk_end_request - Helper function for drivers to complete the request. 1908 * blk_end_request - Helper function for drivers to complete the request.
1916 * @rq: the request being processed 1909 * @rq: the request being processed
1917 * @error: 0 for success, < 0 for error 1910 * @error: %0 for success, < %0 for error
1918 * @nr_bytes: number of bytes to complete 1911 * @nr_bytes: number of bytes to complete
1919 * 1912 *
1920 * Description: 1913 * Description:
@@ -1922,8 +1915,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
1922 * If @rq has leftover, sets it up for the next range of segments. 1915 * If @rq has leftover, sets it up for the next range of segments.
1923 * 1916 *
1924 * Return: 1917 * Return:
1925 * 0 - we are done with this request 1918 * %0 - we are done with this request
1926 * 1 - still buffers pending for this request 1919 * %1 - still buffers pending for this request
1927 **/ 1920 **/
1928int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 1921int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
1929{ 1922{
@@ -1934,22 +1927,20 @@ EXPORT_SYMBOL_GPL(blk_end_request);
1934/** 1927/**
1935 * __blk_end_request - Helper function for drivers to complete the request. 1928 * __blk_end_request - Helper function for drivers to complete the request.
1936 * @rq: the request being processed 1929 * @rq: the request being processed
1937 * @error: 0 for success, < 0 for error 1930 * @error: %0 for success, < %0 for error
1938 * @nr_bytes: number of bytes to complete 1931 * @nr_bytes: number of bytes to complete
1939 * 1932 *
1940 * Description: 1933 * Description:
1941 * Must be called with queue lock held unlike blk_end_request(). 1934 * Must be called with queue lock held unlike blk_end_request().
1942 * 1935 *
1943 * Return: 1936 * Return:
1944 * 0 - we are done with this request 1937 * %0 - we are done with this request
1945 * 1 - still buffers pending for this request 1938 * %1 - still buffers pending for this request
1946 **/ 1939 **/
1947int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 1940int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
1948{ 1941{
1949 if (blk_fs_request(rq) || blk_pc_request(rq)) { 1942 if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
1950 if (__end_that_request_first(rq, error, nr_bytes)) 1943 return 1;
1951 return 1;
1952 }
1953 1944
1954 add_disk_randomness(rq->rq_disk); 1945 add_disk_randomness(rq->rq_disk);
1955 1946
@@ -1962,7 +1953,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
1962/** 1953/**
1963 * blk_end_bidi_request - Helper function for drivers to complete bidi request. 1954 * blk_end_bidi_request - Helper function for drivers to complete bidi request.
1964 * @rq: the bidi request being processed 1955 * @rq: the bidi request being processed
1965 * @error: 0 for success, < 0 for error 1956 * @error: %0 for success, < %0 for error
1966 * @nr_bytes: number of bytes to complete @rq 1957 * @nr_bytes: number of bytes to complete @rq
1967 * @bidi_bytes: number of bytes to complete @rq->next_rq 1958 * @bidi_bytes: number of bytes to complete @rq->next_rq
1968 * 1959 *
@@ -1970,8 +1961,8 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
1970 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. 1961 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
1971 * 1962 *
1972 * Return: 1963 * Return:
1973 * 0 - we are done with this request 1964 * %0 - we are done with this request
1974 * 1 - still buffers pending for this request 1965 * %1 - still buffers pending for this request
1975 **/ 1966 **/
1976int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, 1967int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
1977 unsigned int bidi_bytes) 1968 unsigned int bidi_bytes)
@@ -1981,13 +1972,43 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
1981EXPORT_SYMBOL_GPL(blk_end_bidi_request); 1972EXPORT_SYMBOL_GPL(blk_end_bidi_request);
1982 1973
1983/** 1974/**
1975 * blk_update_request - Special helper function for request stacking drivers
1976 * @rq: the request being processed
1977 * @error: %0 for success, < %0 for error
1978 * @nr_bytes: number of bytes to complete @rq
1979 *
1980 * Description:
1981 * Ends I/O on a number of bytes attached to @rq, but doesn't complete
1982 * the request structure even if @rq doesn't have leftover.
1983 * If @rq has leftover, sets it up for the next range of segments.
1984 *
1985 * This special helper function is only for request stacking drivers
1986 * (e.g. request-based dm) so that they can handle partial completion.
1987 * Actual device drivers should use blk_end_request instead.
1988 */
1989void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
1990{
1991 if (!end_that_request_data(rq, error, nr_bytes, 0)) {
1992 /*
1993 * These members are not updated in end_that_request_data()
1994 * when all bios are completed.
1995 * Update them so that the request stacking driver can find
1996 * how many bytes remain in the request later.
1997 */
1998 rq->nr_sectors = rq->hard_nr_sectors = 0;
1999 rq->current_nr_sectors = rq->hard_cur_sectors = 0;
2000 }
2001}
2002EXPORT_SYMBOL_GPL(blk_update_request);
2003
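blk_update_request(), documented above, lets a request stacking driver propagate a partial completion of a clone onto the original request without freeing it. A sketch with assumed names; how the clone is linked back to the original (end_io_data here) is purely illustrative:

        /* Sketch: mirror a clone's partial completion onto the original. */
        static void exdrv_end_clone_bytes(struct request *clone, int error,
                                          unsigned int nr_bytes)
        {
                struct request *orig = clone->end_io_data;   /* assumed link */

                blk_update_request(orig, error, nr_bytes);
        }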
2004/**
1984 * blk_end_request_callback - Special helper function for tricky drivers 2005 * blk_end_request_callback - Special helper function for tricky drivers
1985 * @rq: the request being processed 2006 * @rq: the request being processed
1986 * @error: 0 for success, < 0 for error 2007 * @error: %0 for success, < %0 for error
1987 * @nr_bytes: number of bytes to complete 2008 * @nr_bytes: number of bytes to complete
1988 * @drv_callback: function called between completion of bios in the request 2009 * @drv_callback: function called between completion of bios in the request
1989 * and completion of the request. 2010 * and completion of the request.
1990 * If the callback returns non 0, this helper returns without 2011 * If the callback returns non %0, this helper returns without
1991 * completion of the request. 2012 * completion of the request.
1992 * 2013 *
1993 * Description: 2014 * Description:
@@ -2000,10 +2021,10 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
2000 * Don't use this interface in other places anymore. 2021 * Don't use this interface in other places anymore.
2001 * 2022 *
2002 * Return: 2023 * Return:
2003 * 0 - we are done with this request 2024 * %0 - we are done with this request
2004 * 1 - this request is not freed yet. 2025 * %1 - this request is not freed yet.
2005 * this request still has pending buffers or 2026 * this request still has pending buffers or
2006 * the driver doesn't want to finish this request yet. 2027 * the driver doesn't want to finish this request yet.
2007 **/ 2028 **/
2008int blk_end_request_callback(struct request *rq, int error, 2029int blk_end_request_callback(struct request *rq, int error,
2009 unsigned int nr_bytes, 2030 unsigned int nr_bytes,
@@ -2016,15 +2037,17 @@ EXPORT_SYMBOL_GPL(blk_end_request_callback);
2016void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 2037void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2017 struct bio *bio) 2038 struct bio *bio)
2018{ 2039{
2019 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ 2040 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
2041 we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
2020 rq->cmd_flags |= (bio->bi_rw & 3); 2042 rq->cmd_flags |= (bio->bi_rw & 3);
2021 2043
2022 rq->nr_phys_segments = bio_phys_segments(q, bio); 2044 if (bio_has_data(bio)) {
2023 rq->nr_hw_segments = bio_hw_segments(q, bio); 2045 rq->nr_phys_segments = bio_phys_segments(q, bio);
2046 rq->buffer = bio_data(bio);
2047 }
2024 rq->current_nr_sectors = bio_cur_sectors(bio); 2048 rq->current_nr_sectors = bio_cur_sectors(bio);
2025 rq->hard_cur_sectors = rq->current_nr_sectors; 2049 rq->hard_cur_sectors = rq->current_nr_sectors;
2026 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 2050 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
2027 rq->buffer = bio_data(bio);
2028 rq->data_len = bio->bi_size; 2051 rq->data_len = bio->bi_size;
2029 2052
2030 rq->bio = rq->biotail = bio; 2053 rq->bio = rq->biotail = bio;
@@ -2033,7 +2056,35 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2033 rq->rq_disk = bio->bi_bdev->bd_disk; 2056 rq->rq_disk = bio->bi_bdev->bd_disk;
2034} 2057}
2035 2058
2036int kblockd_schedule_work(struct work_struct *work) 2059/**
2060 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2061 * @q : the queue of the device being checked
2062 *
2063 * Description:
2064 * Check if underlying low-level drivers of a device are busy.
2065 * If the drivers want to export their busy state, they must set own
2066 * exporting function using blk_queue_lld_busy() first.
2067 *
2068 * Basically, this function is used only by request stacking drivers
2069 * to stop dispatching requests to underlying devices when underlying
2070 * devices are busy. This behavior helps more I/O merging on the queue
2071 * of the request stacking driver and prevents I/O throughput regression
2072 * on burst I/O load.
2073 *
2074 * Return:
2075 * 0 - Not busy (The request stacking driver should dispatch request)
2076 * 1 - Busy (The request stacking driver should stop dispatching request)
2077 */
2078int blk_lld_busy(struct request_queue *q)
2079{
2080 if (q->lld_busy_fn)
2081 return q->lld_busy_fn(q);
2082
2083 return 0;
2084}
2085EXPORT_SYMBOL_GPL(blk_lld_busy);
2086
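blk_lld_busy() only forwards to a queue's lld_busy_fn, so both halves are simple: the low-level driver exports a busy test (registered with blk_queue_lld_busy(), per the comment above), and the stacking driver polls it before dispatching. A sketch; the driver structure, helper names, and registration point are assumptions:

        struct exdrv_host {                     /* hypothetical driver state */
                int outstanding;
                int can_queue;
        };

        /* Low-level driver side: exported via blk_queue_lld_busy(q, ...). */
        static int exdrv_lld_busy(struct request_queue *q)
        {
                struct exdrv_host *host = q->queuedata;

                return host->outstanding >= host->can_queue;
        }

        /* Stacking driver side, inside its request_fn: */
        static void exdrv_stacked_request_fn(struct request_queue *top_q)
        {
                struct request_queue *bottom_q = exdrv_bottom_queue(top_q);

                if (blk_lld_busy(bottom_q))
                        return;         /* hold back, keep merging on top_q */

                /* ... clone and dispatch requests to bottom_q ... */
        }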
2087int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2037{ 2088{
2038 return queue_work(kblockd_workqueue, work); 2089 return queue_work(kblockd_workqueue, work);
2039} 2090}
@@ -2047,8 +2098,6 @@ EXPORT_SYMBOL(kblockd_flush_work);
2047 2098
2048int __init blk_dev_init(void) 2099int __init blk_dev_init(void)
2049{ 2100{
2050 int i;
2051
2052 kblockd_workqueue = create_workqueue("kblockd"); 2101 kblockd_workqueue = create_workqueue("kblockd");
2053 if (!kblockd_workqueue) 2102 if (!kblockd_workqueue)
2054 panic("Failed to create kblockd\n"); 2103 panic("Failed to create kblockd\n");
@@ -2059,12 +2108,6 @@ int __init blk_dev_init(void)
2059 blk_requestq_cachep = kmem_cache_create("blkdev_queue", 2108 blk_requestq_cachep = kmem_cache_create("blkdev_queue",
2060 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 2109 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
2061 2110
2062 for_each_possible_cpu(i)
2063 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
2064
2065 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
2066 register_hotcpu_notifier(&blk_cpu_notifier);
2067
2068 return 0; 2111 return 0;
2069} 2112}
2070 2113
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9bceff7674f2..6af716d1e54e 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -16,7 +16,7 @@
16/** 16/**
17 * blk_end_sync_rq - executes a completion event on a request 17 * blk_end_sync_rq - executes a completion event on a request
18 * @rq: request to complete 18 * @rq: request to complete
19 * @error: end io status of the request 19 * @error: end I/O status of the request
20 */ 20 */
21static void blk_end_sync_rq(struct request *rq, int error) 21static void blk_end_sync_rq(struct request *rq, int error)
22{ 22{
@@ -41,7 +41,7 @@ static void blk_end_sync_rq(struct request *rq, int error)
41 * @done: I/O completion handler 41 * @done: I/O completion handler
42 * 42 *
43 * Description: 43 * Description:
44 * Insert a fully prepared request at the back of the io scheduler queue 44 * Insert a fully prepared request at the back of the I/O scheduler queue
45 * for execution. Don't wait for completion. 45 * for execution. Don't wait for completion.
46 */ 46 */
47void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 47void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
@@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
72 * @at_head: insert request at head or tail of queue 72 * @at_head: insert request at head or tail of queue
73 * 73 *
74 * Description: 74 * Description:
75 * Insert a fully prepared request at the back of the io scheduler queue 75 * Insert a fully prepared request at the back of the I/O scheduler queue
76 * for execution and wait for completion. 76 * for execution and wait for completion.
77 */ 77 */
78int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, 78int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
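blk_execute_rq() inserts a fully prepared request at the back of the I/O scheduler queue and waits for it, as the updated comment says. A sketch of a driver issuing an internal command this way; the command setup itself is driver-specific and omitted:

        /* Sketch: issue one prepared request and wait (names assumed). */
        static int exdrv_send_internal_cmd(struct request_queue *q,
                                           struct gendisk *disk)
        {
                struct request *rq;
                int err;

                rq = blk_get_request(q, READ, GFP_KERNEL);
                if (!rq)
                        return -ENOMEM;

                rq->cmd_type = REQ_TYPE_SPECIAL;
                /* ... driver-specific setup of rq goes here ... */

                err = blk_execute_rq(q, disk, rq, 0);   /* 0 = insert at tail */
                blk_put_request(rq);
                return err;
        }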
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 3f1a8478cc38..61a8e2f8fdd0 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -108,51 +108,51 @@ new_segment:
108EXPORT_SYMBOL(blk_rq_map_integrity_sg); 108EXPORT_SYMBOL(blk_rq_map_integrity_sg);
109 109
110/** 110/**
111 * blk_integrity_compare - Compare integrity profile of two block devices 111 * blk_integrity_compare - Compare integrity profile of two disks
112 * @b1: Device to compare 112 * @gd1: Disk to compare
113 * @b2: Device to compare 113 * @gd2: Disk to compare
114 * 114 *
115 * Description: Meta-devices like DM and MD need to verify that all 115 * Description: Meta-devices like DM and MD need to verify that all
116 * sub-devices use the same integrity format before advertising to 116 * sub-devices use the same integrity format before advertising to
117 * upper layers that they can send/receive integrity metadata. This 117 * upper layers that they can send/receive integrity metadata. This
118 * function can be used to check whether two block devices have 118 * function can be used to check whether two gendisk devices have
119 * compatible integrity formats. 119 * compatible integrity formats.
120 */ 120 */
121int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2) 121int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
122{ 122{
123 struct blk_integrity *b1 = bd1->bd_disk->integrity; 123 struct blk_integrity *b1 = gd1->integrity;
124 struct blk_integrity *b2 = bd2->bd_disk->integrity; 124 struct blk_integrity *b2 = gd2->integrity;
125 125
126 BUG_ON(bd1->bd_disk == NULL); 126 if (!b1 && !b2)
127 BUG_ON(bd2->bd_disk == NULL); 127 return 0;
128 128
129 if (!b1 || !b2) 129 if (!b1 || !b2)
130 return 0; 130 return -1;
131 131
132 if (b1->sector_size != b2->sector_size) { 132 if (b1->sector_size != b2->sector_size) {
133 printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, 133 printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__,
134 bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, 134 gd1->disk_name, gd2->disk_name,
135 b1->sector_size, b2->sector_size); 135 b1->sector_size, b2->sector_size);
136 return -1; 136 return -1;
137 } 137 }
138 138
139 if (b1->tuple_size != b2->tuple_size) { 139 if (b1->tuple_size != b2->tuple_size) {
140 printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, 140 printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
141 bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, 141 gd1->disk_name, gd2->disk_name,
142 b1->tuple_size, b2->tuple_size); 142 b1->tuple_size, b2->tuple_size);
143 return -1; 143 return -1;
144 } 144 }
145 145
146 if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { 146 if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
147 printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, 147 printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
148 bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, 148 gd1->disk_name, gd2->disk_name,
149 b1->tag_size, b2->tag_size); 149 b1->tag_size, b2->tag_size);
150 return -1; 150 return -1;
151 } 151 }
152 152
153 if (strcmp(b1->name, b2->name)) { 153 if (strcmp(b1->name, b2->name)) {
154 printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, 154 printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
155 bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, 155 gd1->disk_name, gd2->disk_name,
156 b1->name, b2->name); 156 b1->name, b2->name);
157 return -1; 157 return -1;
158 } 158 }
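With the gendisk-based signature above, a stacking driver's sanity check might look roughly like this. A sketch under the 0/-1 return convention shown in the hunk; the function name is illustrative.

#include <linux/blkdev.h>

static int check_integrity_match(struct gendisk *gd1, struct gendisk *gd2)
{
        /* 0 means matching profiles (or none on either disk),
         * -1 means they differ or only one disk has one */
        if (blk_integrity_compare(gd1, gd2) < 0) {
                printk(KERN_WARNING "%s/%s: integrity profiles differ, "
                       "not advertising integrity support\n",
                       gd1->disk_name, gd2->disk_name);
                return -EINVAL;
        }
        return 0;
}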
@@ -331,7 +331,8 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
331 return -1; 331 return -1;
332 332
333 if (kobject_init_and_add(&bi->kobj, &integrity_ktype, 333 if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
334 &disk->dev.kobj, "%s", "integrity")) { 334 &disk_to_dev(disk)->kobj,
335 "%s", "integrity")) {
335 kmem_cache_free(integrity_cachep, bi); 336 kmem_cache_free(integrity_cachep, bi);
336 return -1; 337 return -1;
337 } 338 }
@@ -375,7 +376,7 @@ void blk_integrity_unregister(struct gendisk *disk)
375 376
376 kobject_uevent(&bi->kobj, KOBJ_REMOVE); 377 kobject_uevent(&bi->kobj, KOBJ_REMOVE);
377 kobject_del(&bi->kobj); 378 kobject_del(&bi->kobj);
378 kobject_put(&disk->dev.kobj);
379 kmem_cache_free(integrity_cachep, bi); 379 kmem_cache_free(integrity_cachep, bi);
380 disk->integrity = NULL;
380} 381}
381EXPORT_SYMBOL(blk_integrity_unregister); 382EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-map.c b/block/blk-map.c
index af37e4ae62f5..4849fa36161e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -41,10 +41,10 @@ static int __blk_rq_unmap_user(struct bio *bio)
41} 41}
42 42
43static int __blk_rq_map_user(struct request_queue *q, struct request *rq, 43static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
44 void __user *ubuf, unsigned int len) 44 struct rq_map_data *map_data, void __user *ubuf,
45 unsigned int len, int null_mapped, gfp_t gfp_mask)
45{ 46{
46 unsigned long uaddr; 47 unsigned long uaddr;
47 unsigned int alignment;
48 struct bio *bio, *orig_bio; 48 struct bio *bio, *orig_bio;
49 int reading, ret; 49 int reading, ret;
50 50
@@ -55,15 +55,17 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
55 * direct dma. else, set up kernel bounce buffers 55 * direct dma. else, set up kernel bounce buffers
56 */ 56 */
57 uaddr = (unsigned long) ubuf; 57 uaddr = (unsigned long) ubuf;
58 alignment = queue_dma_alignment(q) | q->dma_pad_mask; 58 if (blk_rq_aligned(q, ubuf, len) && !map_data)
59 if (!(uaddr & alignment) && !(len & alignment)) 59 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
60 bio = bio_map_user(q, NULL, uaddr, len, reading);
61 else 60 else
62 bio = bio_copy_user(q, uaddr, len, reading); 61 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
63 62
64 if (IS_ERR(bio)) 63 if (IS_ERR(bio))
65 return PTR_ERR(bio); 64 return PTR_ERR(bio);
66 65
66 if (null_mapped)
67 bio->bi_flags |= (1 << BIO_NULL_MAPPED);
68
67 orig_bio = bio; 69 orig_bio = bio;
68 blk_queue_bounce(q, &bio); 70 blk_queue_bounce(q, &bio);
69 71
@@ -85,17 +87,19 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
85} 87}
86 88
87/** 89/**
88 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 90 * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
89 * @q: request queue where request should be inserted 91 * @q: request queue where request should be inserted
90 * @rq: request structure to fill 92 * @rq: request structure to fill
93 * @map_data: pointer to the rq_map_data holding pages (if necessary)
91 * @ubuf: the user buffer 94 * @ubuf: the user buffer
92 * @len: length of user data 95 * @len: length of user data
96 * @gfp_mask: memory allocation flags
93 * 97 *
94 * Description: 98 * Description:
95 * Data will be mapped directly for zero copy io, if possible. Otherwise 99 * Data will be mapped directly for zero copy I/O, if possible. Otherwise
96 * a kernel bounce buffer is used. 100 * a kernel bounce buffer is used.
97 * 101 *
98 * A matching blk_rq_unmap_user() must be issued at the end of io, while 102 * A matching blk_rq_unmap_user() must be issued at the end of I/O, while
99 * still in process context. 103 * still in process context.
100 * 104 *
101 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 105 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -105,16 +109,22 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
105 * unmapping. 109 * unmapping.
106 */ 110 */
107int blk_rq_map_user(struct request_queue *q, struct request *rq, 111int blk_rq_map_user(struct request_queue *q, struct request *rq,
108 void __user *ubuf, unsigned long len) 112 struct rq_map_data *map_data, void __user *ubuf,
113 unsigned long len, gfp_t gfp_mask)
109{ 114{
110 unsigned long bytes_read = 0; 115 unsigned long bytes_read = 0;
111 struct bio *bio = NULL; 116 struct bio *bio = NULL;
112 int ret; 117 int ret, null_mapped = 0;
113 118
114 if (len > (q->max_hw_sectors << 9)) 119 if (len > (q->max_hw_sectors << 9))
115 return -EINVAL; 120 return -EINVAL;
116 if (!len || !ubuf) 121 if (!len)
117 return -EINVAL; 122 return -EINVAL;
123 if (!ubuf) {
124 if (!map_data || rq_data_dir(rq) != READ)
125 return -EINVAL;
126 null_mapped = 1;
127 }
118 128
119 while (bytes_read != len) { 129 while (bytes_read != len) {
120 unsigned long map_len, end, start; 130 unsigned long map_len, end, start;
@@ -132,7 +142,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
132 if (end - start > BIO_MAX_PAGES) 142 if (end - start > BIO_MAX_PAGES)
133 map_len -= PAGE_SIZE; 143 map_len -= PAGE_SIZE;
134 144
135 ret = __blk_rq_map_user(q, rq, ubuf, map_len); 145 ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
146 null_mapped, gfp_mask);
136 if (ret < 0) 147 if (ret < 0)
137 goto unmap_rq; 148 goto unmap_rq;
138 if (!bio) 149 if (!bio)
@@ -154,18 +165,20 @@ unmap_rq:
154EXPORT_SYMBOL(blk_rq_map_user); 165EXPORT_SYMBOL(blk_rq_map_user);
155 166
156/** 167/**
157 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 168 * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
158 * @q: request queue where request should be inserted 169 * @q: request queue where request should be inserted
159 * @rq: request to map data to 170 * @rq: request to map data to
171 * @map_data: pointer to the rq_map_data holding pages (if necessary)
160 * @iov: pointer to the iovec 172 * @iov: pointer to the iovec
161 * @iov_count: number of elements in the iovec 173 * @iov_count: number of elements in the iovec
162 * @len: I/O byte count 174 * @len: I/O byte count
175 * @gfp_mask: memory allocation flags
163 * 176 *
164 * Description: 177 * Description:
165 * Data will be mapped directly for zero copy io, if possible. Otherwise 178 * Data will be mapped directly for zero copy I/O, if possible. Otherwise
166 * a kernel bounce buffer is used. 179 * a kernel bounce buffer is used.
167 * 180 *
168 * A matching blk_rq_unmap_user() must be issued at the end of io, while 181 * A matching blk_rq_unmap_user() must be issued at the end of I/O, while
169 * still in process context. 182 * still in process context.
170 * 183 *
171 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 184 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -175,7 +188,8 @@ EXPORT_SYMBOL(blk_rq_map_user);
175 * unmapping. 188 * unmapping.
176 */ 189 */
177int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, 190int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
178 struct sg_iovec *iov, int iov_count, unsigned int len) 191 struct rq_map_data *map_data, struct sg_iovec *iov,
192 int iov_count, unsigned int len, gfp_t gfp_mask)
179{ 193{
180 struct bio *bio; 194 struct bio *bio;
181 int i, read = rq_data_dir(rq) == READ; 195 int i, read = rq_data_dir(rq) == READ;
@@ -193,10 +207,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
193 } 207 }
194 } 208 }
195 209
196 if (unaligned || (q->dma_pad_mask & len)) 210 if (unaligned || (q->dma_pad_mask & len) || map_data)
197 bio = bio_copy_user_iov(q, iov, iov_count, read); 211 bio = bio_copy_user_iov(q, map_data, iov, iov_count, read,
212 gfp_mask);
198 else 213 else
199 bio = bio_map_user_iov(q, NULL, iov, iov_count, read); 214 bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
200 215
201 if (IS_ERR(bio)) 216 if (IS_ERR(bio))
202 return PTR_ERR(bio); 217 return PTR_ERR(bio);
@@ -216,6 +231,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
216 rq->buffer = rq->data = NULL; 231 rq->buffer = rq->data = NULL;
217 return 0; 232 return 0;
218} 233}
234EXPORT_SYMBOL(blk_rq_map_user_iov);
219 235
220/** 236/**
221 * blk_rq_unmap_user - unmap a request with user data 237 * blk_rq_unmap_user - unmap a request with user data
@@ -224,7 +240,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
224 * Description: 240 * Description:
225 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must 241 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must
226 * supply the original rq->bio from the blk_rq_map_user() return, since 242 * supply the original rq->bio from the blk_rq_map_user() return, since
227 * the io completion may have changed rq->bio. 243 * the I/O completion may have changed rq->bio.
228 */ 244 */
229int blk_rq_unmap_user(struct bio *bio) 245int blk_rq_unmap_user(struct bio *bio)
230{ 246{
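Putting the map and unmap halves together, a passthrough path would typically use the new signature like this. A sketch mirroring the bsg.c update further down; passing a NULL map_data means no preallocated pages are supplied.

#include <linux/blkdev.h>

static int map_run_unmap(struct request_queue *q, struct request *rq,
                         void __user *ubuf, unsigned long len)
{
        struct bio *bio;
        int ret;

        ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL);
        if (ret)
                return ret;

        /* keep the original bio: completion may change rq->bio */
        bio = rq->bio;

        /* ... execute the request and wait for it to finish ... */

        return blk_rq_unmap_user(bio);
}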
@@ -250,7 +266,7 @@ int blk_rq_unmap_user(struct bio *bio)
250EXPORT_SYMBOL(blk_rq_unmap_user); 266EXPORT_SYMBOL(blk_rq_unmap_user);
251 267
252/** 268/**
253 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 269 * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
254 * @q: request queue where request should be inserted 270 * @q: request queue where request should be inserted
255 * @rq: request to fill 271 * @rq: request to fill
256 * @kbuf: the kernel buffer 272 * @kbuf: the kernel buffer
@@ -264,8 +280,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
264int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, 280int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
265 unsigned int len, gfp_t gfp_mask) 281 unsigned int len, gfp_t gfp_mask)
266{ 282{
267 unsigned long kaddr;
268 unsigned int alignment;
269 int reading = rq_data_dir(rq) == READ; 283 int reading = rq_data_dir(rq) == READ;
270 int do_copy = 0; 284 int do_copy = 0;
271 struct bio *bio; 285 struct bio *bio;
@@ -275,11 +289,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
275 if (!len || !kbuf) 289 if (!len || !kbuf)
276 return -EINVAL; 290 return -EINVAL;
277 291
278 kaddr = (unsigned long)kbuf; 292 do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf);
279 alignment = queue_dma_alignment(q) | q->dma_pad_mask;
280 do_copy = ((kaddr & alignment) || (len & alignment) ||
281 object_is_on_stack(kbuf));
282
283 if (do_copy) 293 if (do_copy)
284 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); 294 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
285 else 295 else
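The object_is_on_stack() check above is why callers of blk_rq_map_kern() normally hand in heap memory. A minimal sketch with illustrative names and trimmed error handling:

#include <linux/blkdev.h>
#include <linux/slab.h>

static int issue_with_kernel_buffer(struct request_queue *q,
                                    struct request *rq, unsigned int len)
{
        void *buf = kmalloc(len, GFP_KERNEL);
        int ret;

        if (!buf)
                return -ENOMEM;

        /* a kmalloc()ed, properly aligned buffer maps without a bounce copy;
         * a stack buffer would trip object_is_on_stack() and be copied */
        ret = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
        if (!ret) {
                /* ... execute the request, consume buf ... */
        }

        kfree(buf);
        return ret;
}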
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5efc9e7a68b7..908d3e11ac52 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -11,7 +11,7 @@
11 11
12void blk_recalc_rq_sectors(struct request *rq, int nsect) 12void blk_recalc_rq_sectors(struct request *rq, int nsect)
13{ 13{
14 if (blk_fs_request(rq)) { 14 if (blk_fs_request(rq) || blk_discard_rq(rq)) {
15 rq->hard_sector += nsect; 15 rq->hard_sector += nsect;
16 rq->hard_nr_sectors -= nsect; 16 rq->hard_nr_sectors -= nsect;
17 17
@@ -41,12 +41,9 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
41void blk_recalc_rq_segments(struct request *rq) 41void blk_recalc_rq_segments(struct request *rq)
42{ 42{
43 int nr_phys_segs; 43 int nr_phys_segs;
44 int nr_hw_segs;
45 unsigned int phys_size; 44 unsigned int phys_size;
46 unsigned int hw_size;
47 struct bio_vec *bv, *bvprv = NULL; 45 struct bio_vec *bv, *bvprv = NULL;
48 int seg_size; 46 int seg_size;
49 int hw_seg_size;
50 int cluster; 47 int cluster;
51 struct req_iterator iter; 48 struct req_iterator iter;
52 int high, highprv = 1; 49 int high, highprv = 1;
@@ -56,8 +53,8 @@ void blk_recalc_rq_segments(struct request *rq)
56 return; 53 return;
57 54
58 cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); 55 cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
59 hw_seg_size = seg_size = 0; 56 seg_size = 0;
60 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 57 phys_size = nr_phys_segs = 0;
61 rq_for_each_segment(bv, rq, iter) { 58 rq_for_each_segment(bv, rq, iter) {
62 /* 59 /*
63 * the trick here is making sure that a high page is never 60 * the trick here is making sure that a high page is never
@@ -66,7 +63,7 @@ void blk_recalc_rq_segments(struct request *rq)
66 */ 63 */
67 high = page_to_pfn(bv->bv_page) > q->bounce_pfn; 64 high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
68 if (high || highprv) 65 if (high || highprv)
69 goto new_hw_segment; 66 goto new_segment;
70 if (cluster) { 67 if (cluster) {
71 if (seg_size + bv->bv_len > q->max_segment_size) 68 if (seg_size + bv->bv_len > q->max_segment_size)
72 goto new_segment; 69 goto new_segment;
@@ -74,40 +71,19 @@ void blk_recalc_rq_segments(struct request *rq)
74 goto new_segment; 71 goto new_segment;
75 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 72 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
76 goto new_segment; 73 goto new_segment;
77 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
78 goto new_hw_segment;
79 74
80 seg_size += bv->bv_len; 75 seg_size += bv->bv_len;
81 hw_seg_size += bv->bv_len;
82 bvprv = bv; 76 bvprv = bv;
83 continue; 77 continue;
84 } 78 }
85new_segment: 79new_segment:
86 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
87 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
88 hw_seg_size += bv->bv_len;
89 else {
90new_hw_segment:
91 if (nr_hw_segs == 1 &&
92 hw_seg_size > rq->bio->bi_hw_front_size)
93 rq->bio->bi_hw_front_size = hw_seg_size;
94 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
95 nr_hw_segs++;
96 }
97
98 nr_phys_segs++; 80 nr_phys_segs++;
99 bvprv = bv; 81 bvprv = bv;
100 seg_size = bv->bv_len; 82 seg_size = bv->bv_len;
101 highprv = high; 83 highprv = high;
102 } 84 }
103 85
104 if (nr_hw_segs == 1 &&
105 hw_seg_size > rq->bio->bi_hw_front_size)
106 rq->bio->bi_hw_front_size = hw_seg_size;
107 if (hw_seg_size > rq->biotail->bi_hw_back_size)
108 rq->biotail->bi_hw_back_size = hw_seg_size;
109 rq->nr_phys_segments = nr_phys_segs; 86 rq->nr_phys_segments = nr_phys_segs;
110 rq->nr_hw_segments = nr_hw_segs;
111} 87}
112 88
113void blk_recount_segments(struct request_queue *q, struct bio *bio) 89void blk_recount_segments(struct request_queue *q, struct bio *bio)
@@ -120,7 +96,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
120 blk_recalc_rq_segments(&rq); 96 blk_recalc_rq_segments(&rq);
121 bio->bi_next = nxt; 97 bio->bi_next = nxt;
122 bio->bi_phys_segments = rq.nr_phys_segments; 98 bio->bi_phys_segments = rq.nr_phys_segments;
123 bio->bi_hw_segments = rq.nr_hw_segments;
124 bio->bi_flags |= (1 << BIO_SEG_VALID); 99 bio->bi_flags |= (1 << BIO_SEG_VALID);
125} 100}
126EXPORT_SYMBOL(blk_recount_segments); 101EXPORT_SYMBOL(blk_recount_segments);
@@ -131,13 +106,17 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
131 if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) 106 if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
132 return 0; 107 return 0;
133 108
134 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
135 return 0;
136 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 109 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
137 return 0; 110 return 0;
138 111
112 if (!bio_has_data(bio))
113 return 1;
114
115 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
116 return 0;
117
139 /* 118 /*
140 * bio and nxt are contigous in memory, check if the queue allows 119 * bio and nxt are contiguous in memory; check if the queue allows
141 * these two to be merged into one 120 * these two to be merged into one
142 */ 121 */
143 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 122 if (BIO_SEG_BOUNDARY(q, bio, nxt))
@@ -146,22 +125,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
146 return 0; 125 return 0;
147} 126}
148 127
149static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
150 struct bio *nxt)
151{
152 if (!bio_flagged(bio, BIO_SEG_VALID))
153 blk_recount_segments(q, bio);
154 if (!bio_flagged(nxt, BIO_SEG_VALID))
155 blk_recount_segments(q, nxt);
156 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
157 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
158 return 0;
159 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
160 return 0;
161
162 return 1;
163}
164
165/* 128/*
166 * map a request to scatterlist, return number of sg entries setup. Caller 129 * map a request to scatterlist, return number of sg entries setup. Caller
167 * must make sure sg can hold rq->nr_phys_segments entries 130 * must make sure sg can hold rq->nr_phys_segments entries
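A driver's prep or queueing path consumes this roughly as sketched below; the mydev_* name is illustrative and the sg table is sized to the queue's physical-segment limit.

#include <linux/blkdev.h>
#include <linux/scatterlist.h>

static int mydev_build_sg(struct request_queue *q, struct request *rq,
                          struct scatterlist *sgl)
{
        int nents;

        /* sgl must have room for rq->nr_phys_segments entries, i.e. at
         * most q->max_phys_segments after the merge checks above */
        sg_init_table(sgl, q->max_phys_segments);
        nents = blk_rq_map_sg(q, rq, sgl);

        /* nents entries of sgl now describe the request's data; hand
         * them to dma_map_sg() and the hardware from here */
        return nents;
}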
@@ -275,10 +238,9 @@ static inline int ll_new_hw_segment(struct request_queue *q,
275 struct request *req, 238 struct request *req,
276 struct bio *bio) 239 struct bio *bio)
277{ 240{
278 int nr_hw_segs = bio_hw_segments(q, bio);
279 int nr_phys_segs = bio_phys_segments(q, bio); 241 int nr_phys_segs = bio_phys_segments(q, bio);
280 242
281 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 243 if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
282 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 244 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
283 req->cmd_flags |= REQ_NOMERGE; 245 req->cmd_flags |= REQ_NOMERGE;
284 if (req == q->last_merge) 246 if (req == q->last_merge)
@@ -290,7 +252,6 @@ static inline int ll_new_hw_segment(struct request_queue *q,
290 * This will form the start of a new hw segment. Bump both 252 * This will form the start of a new hw segment. Bump both
291 * counters. 253 * counters.
292 */ 254 */
293 req->nr_hw_segments += nr_hw_segs;
294 req->nr_phys_segments += nr_phys_segs; 255 req->nr_phys_segments += nr_phys_segs;
295 return 1; 256 return 1;
296} 257}
@@ -299,7 +260,6 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
299 struct bio *bio) 260 struct bio *bio)
300{ 261{
301 unsigned short max_sectors; 262 unsigned short max_sectors;
302 int len;
303 263
304 if (unlikely(blk_pc_request(req))) 264 if (unlikely(blk_pc_request(req)))
305 max_sectors = q->max_hw_sectors; 265 max_sectors = q->max_hw_sectors;
@@ -316,19 +276,6 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
316 blk_recount_segments(q, req->biotail); 276 blk_recount_segments(q, req->biotail);
317 if (!bio_flagged(bio, BIO_SEG_VALID)) 277 if (!bio_flagged(bio, BIO_SEG_VALID))
318 blk_recount_segments(q, bio); 278 blk_recount_segments(q, bio);
319 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
320 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))
321 && !BIOVEC_VIRT_OVERSIZE(len)) {
322 int mergeable = ll_new_mergeable(q, req, bio);
323
324 if (mergeable) {
325 if (req->nr_hw_segments == 1)
326 req->bio->bi_hw_front_size = len;
327 if (bio->bi_hw_segments == 1)
328 bio->bi_hw_back_size = len;
329 }
330 return mergeable;
331 }
332 279
333 return ll_new_hw_segment(q, req, bio); 280 return ll_new_hw_segment(q, req, bio);
334} 281}
@@ -337,7 +284,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
337 struct bio *bio) 284 struct bio *bio)
338{ 285{
339 unsigned short max_sectors; 286 unsigned short max_sectors;
340 int len;
341 287
342 if (unlikely(blk_pc_request(req))) 288 if (unlikely(blk_pc_request(req)))
343 max_sectors = q->max_hw_sectors; 289 max_sectors = q->max_hw_sectors;
@@ -351,23 +297,10 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
351 q->last_merge = NULL; 297 q->last_merge = NULL;
352 return 0; 298 return 0;
353 } 299 }
354 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
355 if (!bio_flagged(bio, BIO_SEG_VALID)) 300 if (!bio_flagged(bio, BIO_SEG_VALID))
356 blk_recount_segments(q, bio); 301 blk_recount_segments(q, bio);
357 if (!bio_flagged(req->bio, BIO_SEG_VALID)) 302 if (!bio_flagged(req->bio, BIO_SEG_VALID))
358 blk_recount_segments(q, req->bio); 303 blk_recount_segments(q, req->bio);
359 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
360 !BIOVEC_VIRT_OVERSIZE(len)) {
361 int mergeable = ll_new_mergeable(q, req, bio);
362
363 if (mergeable) {
364 if (bio->bi_hw_segments == 1)
365 bio->bi_hw_front_size = len;
366 if (req->nr_hw_segments == 1)
367 req->biotail->bi_hw_back_size = len;
368 }
369 return mergeable;
370 }
371 304
372 return ll_new_hw_segment(q, req, bio); 305 return ll_new_hw_segment(q, req, bio);
373} 306}
@@ -376,7 +309,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
376 struct request *next) 309 struct request *next)
377{ 310{
378 int total_phys_segments; 311 int total_phys_segments;
379 int total_hw_segments;
380 312
381 /* 313 /*
382 * First check if the either of the requests are re-queued 314 * First check if the either of the requests are re-queued
@@ -398,26 +330,11 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
398 if (total_phys_segments > q->max_phys_segments) 330 if (total_phys_segments > q->max_phys_segments)
399 return 0; 331 return 0;
400 332
401 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 333 if (total_phys_segments > q->max_hw_segments)
402 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
403 int len = req->biotail->bi_hw_back_size +
404 next->bio->bi_hw_front_size;
405 /*
406 * propagate the combined length to the end of the requests
407 */
408 if (req->nr_hw_segments == 1)
409 req->bio->bi_hw_front_size = len;
410 if (next->nr_hw_segments == 1)
411 next->biotail->bi_hw_back_size = len;
412 total_hw_segments--;
413 }
414
415 if (total_hw_segments > q->max_hw_segments)
416 return 0; 334 return 0;
417 335
418 /* Merge is OK... */ 336 /* Merge is OK... */
419 req->nr_phys_segments = total_phys_segments; 337 req->nr_phys_segments = total_phys_segments;
420 req->nr_hw_segments = total_hw_segments;
421 return 1; 338 return 1;
422} 339}
423 340
@@ -470,17 +387,21 @@ static int attempt_merge(struct request_queue *q, struct request *req,
470 elv_merge_requests(q, req, next); 387 elv_merge_requests(q, req, next);
471 388
472 if (req->rq_disk) { 389 if (req->rq_disk) {
473 struct hd_struct *part 390 struct hd_struct *part;
474 = get_part(req->rq_disk, req->sector); 391 int cpu;
475 disk_round_stats(req->rq_disk); 392
476 req->rq_disk->in_flight--; 393 cpu = part_stat_lock();
477 if (part) { 394 part = disk_map_sector_rcu(req->rq_disk, req->sector);
478 part_round_stats(part); 395
479 part->in_flight--; 396 part_round_stats(cpu, part);
480 } 397 part_dec_in_flight(part);
398
399 part_stat_unlock();
481 } 400 }
482 401
483 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 402 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
403 if (blk_rq_cpu_valid(next))
404 req->cpu = next->cpu;
484 405
485 __blk_put_request(q, next); 406 __blk_put_request(q, next);
486 return 1; 407 return 1;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index dfc77012843f..b21dcdb64151 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -33,6 +33,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
33EXPORT_SYMBOL(blk_queue_prep_rq); 33EXPORT_SYMBOL(blk_queue_prep_rq);
34 34
35/** 35/**
36 * blk_queue_set_discard - set a discard_sectors function for queue
37 * @q: queue
38 * @dfn: prepare_discard function
39 *
40 * It's possible for a queue to register a discard callback which is used
41 * to transform a discard request into the appropriate type for the
42 * hardware. If none is registered, then discard requests are failed
43 * with %EOPNOTSUPP.
44 *
45 */
46void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
47{
48 q->prepare_discard_fn = dfn;
49}
50EXPORT_SYMBOL(blk_queue_set_discard);
51
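A driver that can erase ranges would wire this up roughly as follows. The callback signature (queue plus request, returning 0 on success) is an assumption based on how the hook is described here, and the mydev_* names are made up.

#include <linux/blkdev.h>

static int mydev_prepare_discard(struct request_queue *q, struct request *rq)
{
        /* rewrite rq into whatever erase/TRIM form the hardware wants */
        return 0;
}

static void mydev_setup_discard(struct request_queue *q)
{
        /* without a callback, discard requests fail with -EOPNOTSUPP */
        blk_queue_set_discard(q, mydev_prepare_discard);
}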
52/**
36 * blk_queue_merge_bvec - set a merge_bvec function for queue 53 * blk_queue_merge_bvec - set a merge_bvec function for queue
37 * @q: queue 54 * @q: queue
38 * @mbfn: merge_bvec_fn 55 * @mbfn: merge_bvec_fn
@@ -60,6 +77,24 @@ void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
60} 77}
61EXPORT_SYMBOL(blk_queue_softirq_done); 78EXPORT_SYMBOL(blk_queue_softirq_done);
62 79
80void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
81{
82 q->rq_timeout = timeout;
83}
84EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
85
86void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
87{
88 q->rq_timed_out_fn = fn;
89}
90EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
91
92void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
93{
94 q->lld_busy_fn = fn;
95}
96EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
97
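Taken together with the blk-timeout.c code added below, a low-level driver opts into generic timeouts roughly like this. A sketch: the BLK_EH_* return values come from that file, while the mydev_* names and the progress-check helper are illustrative.

#include <linux/blkdev.h>

static enum blk_eh_timer_return mydev_timed_out(struct request *rq)
{
        if (mydev_still_making_progress(rq))    /* illustrative helper */
                return BLK_EH_RESET_TIMER;      /* re-arm for another rq_timeout */

        return BLK_EH_NOT_HANDLED;              /* leave it to the LLD's own EH */
}

static void mydev_setup_timeouts(struct request_queue *q)
{
        blk_queue_rq_timeout(q, 30 * HZ);
        blk_queue_rq_timed_out(q, mydev_timed_out);
}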
63/** 98/**
64 * blk_queue_make_request - define an alternate make_request function for a device 99 * blk_queue_make_request - define an alternate make_request function for a device
65 * @q: the request queue for the device to be affected 100 * @q: the request queue for the device to be affected
@@ -127,7 +162,7 @@ EXPORT_SYMBOL(blk_queue_make_request);
127 * Different hardware can have different requirements as to what pages 162 * Different hardware can have different requirements as to what pages
128 * it can do I/O directly to. A low level driver can call 163 * it can do I/O directly to. A low level driver can call
129 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 164 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
130 * buffers for doing I/O to pages residing above @page. 165 * buffers for doing I/O to pages residing above @dma_addr.
131 **/ 166 **/
132void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) 167void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
133{ 168{
@@ -212,7 +247,7 @@ EXPORT_SYMBOL(blk_queue_max_phys_segments);
212 * Description: 247 * Description:
213 * Enables a low level driver to set an upper limit on the number of 248 * Enables a low level driver to set an upper limit on the number of
214 * hw data segments in a request. This would be the largest number of 249 * hw data segments in a request. This would be the largest number of
215 * address/length pairs the host adapter can actually give as once 250 * address/length pairs the host adapter can actually give at once
216 * to the device. 251 * to the device.
217 **/ 252 **/
218void blk_queue_max_hw_segments(struct request_queue *q, 253void blk_queue_max_hw_segments(struct request_queue *q,
@@ -393,7 +428,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary);
393 * @mask: alignment mask 428 * @mask: alignment mask
394 * 429 *
395 * description: 430 * description:
396 * set required memory and length aligment for direct dma transactions. 431 * set required memory and length alignment for direct dma transactions.
397 * this is used when building direct io requests for the queue. 432 * this is used when building direct io requests for the queue.
398 * 433 *
399 **/ 434 **/
@@ -409,7 +444,7 @@ EXPORT_SYMBOL(blk_queue_dma_alignment);
409 * @mask: alignment mask 444 * @mask: alignment mask
410 * 445 *
411 * description: 446 * description:
412 * update required memory and length aligment for direct dma transactions. 447 * update required memory and length alignment for direct dma transactions.
413 * If the requested alignment is larger than the current alignment, then 448 * If the requested alignment is larger than the current alignment, then
414 * the current queue alignment is updated to the new value, otherwise it 449 * the current queue alignment is updated to the new value, otherwise it
415 * is left alone. The design of this is to allow multiple objects 450 * is left alone. The design of this is to allow multiple objects
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
new file mode 100644
index 000000000000..e660d26ca656
--- /dev/null
+++ b/block/blk-softirq.c
@@ -0,0 +1,175 @@
1/*
2 * Functions related to softirq rq completions
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/init.h>
7#include <linux/bio.h>
8#include <linux/blkdev.h>
9#include <linux/interrupt.h>
10#include <linux/cpu.h>
11
12#include "blk.h"
13
14static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
15
16/*
17 * Softirq action handler - move entries to local list and loop over them
18 * while passing them to the queue registered handler.
19 */
20static void blk_done_softirq(struct softirq_action *h)
21{
22 struct list_head *cpu_list, local_list;
23
24 local_irq_disable();
25 cpu_list = &__get_cpu_var(blk_cpu_done);
26 list_replace_init(cpu_list, &local_list);
27 local_irq_enable();
28
29 while (!list_empty(&local_list)) {
30 struct request *rq;
31
32 rq = list_entry(local_list.next, struct request, csd.list);
33 list_del_init(&rq->csd.list);
34 rq->q->softirq_done_fn(rq);
35 }
36}
37
38#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
39static void trigger_softirq(void *data)
40{
41 struct request *rq = data;
42 unsigned long flags;
43 struct list_head *list;
44
45 local_irq_save(flags);
46 list = &__get_cpu_var(blk_cpu_done);
47 list_add_tail(&rq->csd.list, list);
48
49 if (list->next == &rq->csd.list)
50 raise_softirq_irqoff(BLOCK_SOFTIRQ);
51
52 local_irq_restore(flags);
53}
54
55/*
56 * Setup and invoke a run of 'trigger_softirq' on the given cpu.
57 */
58static int raise_blk_irq(int cpu, struct request *rq)
59{
60 if (cpu_online(cpu)) {
61 struct call_single_data *data = &rq->csd;
62
63 data->func = trigger_softirq;
64 data->info = rq;
65 data->flags = 0;
66
67 __smp_call_function_single(cpu, data);
68 return 0;
69 }
70
71 return 1;
72}
73#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
74static int raise_blk_irq(int cpu, struct request *rq)
75{
76 return 1;
77}
78#endif
79
80static int __cpuinit blk_cpu_notify(struct notifier_block *self,
81 unsigned long action, void *hcpu)
82{
83 /*
84 * If a CPU goes away, splice its entries to the current CPU
85 * and trigger a run of the softirq
86 */
87 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
88 int cpu = (unsigned long) hcpu;
89
90 local_irq_disable();
91 list_splice_init(&per_cpu(blk_cpu_done, cpu),
92 &__get_cpu_var(blk_cpu_done));
93 raise_softirq_irqoff(BLOCK_SOFTIRQ);
94 local_irq_enable();
95 }
96
97 return NOTIFY_OK;
98}
99
100static struct notifier_block __cpuinitdata blk_cpu_notifier = {
101 .notifier_call = blk_cpu_notify,
102};
103
104void __blk_complete_request(struct request *req)
105{
106 struct request_queue *q = req->q;
107 unsigned long flags;
108 int ccpu, cpu, group_cpu;
109
110 BUG_ON(!q->softirq_done_fn);
111
112 local_irq_save(flags);
113 cpu = smp_processor_id();
114 group_cpu = blk_cpu_to_group(cpu);
115
116 /*
117 * Select completion CPU
118 */
119 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
120 ccpu = req->cpu;
121 else
122 ccpu = cpu;
123
124 if (ccpu == cpu || ccpu == group_cpu) {
125 struct list_head *list;
126do_local:
127 list = &__get_cpu_var(blk_cpu_done);
128 list_add_tail(&req->csd.list, list);
129
130 /*
131 * if the list only contains our just added request,
132 * signal a raise of the softirq. If there are already
133 * entries there, someone already raised the irq but it
134 * hasn't run yet.
135 */
136 if (list->next == &req->csd.list)
137 raise_softirq_irqoff(BLOCK_SOFTIRQ);
138 } else if (raise_blk_irq(ccpu, req))
139 goto do_local;
140
141 local_irq_restore(flags);
142}
143
144/**
145 * blk_complete_request - end I/O on a request
146 * @req: the request being processed
147 *
148 * Description:
149 * Ends all I/O on a request. It does not handle partial completions,
150 * unless the driver actually implements this in its completion callback
151 * through requeueing. The actual completion happens out-of-order,
152 * through a softirq handler. The user must have registered a completion
153 * callback through blk_queue_softirq_done().
154 **/
155void blk_complete_request(struct request *req)
156{
157 if (unlikely(blk_should_fake_timeout(req->q)))
158 return;
159 if (!blk_mark_rq_complete(req))
160 __blk_complete_request(req);
161}
162EXPORT_SYMBOL(blk_complete_request);
163
164__init int blk_softirq_init(void)
165{
166 int i;
167
168 for_each_possible_cpu(i)
169 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
170
171 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
172 register_hotcpu_notifier(&blk_cpu_notifier);
173 return 0;
174}
175subsys_initcall(blk_softirq_init);
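From a driver's point of view the new file above is used in two places: the completion callback registered via blk_queue_softirq_done(), and blk_complete_request() called from the interrupt handler. A rough sketch; the mydev_* names and the request-lookup helper are illustrative.

#include <linux/blkdev.h>
#include <linux/interrupt.h>

static void mydev_softirq_done(struct request *rq)
{
        /* runs from BLOCK_SOFTIRQ, possibly on the submitting CPU when
         * rq_affinity is enabled; finish the request without the queue lock */
        blk_end_request(rq, rq->errors ? -EIO : 0, rq->hard_nr_sectors << 9);
}

static irqreturn_t mydev_irq(int irq, void *data)
{
        struct request *rq = mydev_fetch_completed(data);       /* illustrative */

        blk_complete_request(rq);       /* marks it complete, raises the softirq */
        return IRQ_HANDLED;
}

/* at init time: blk_queue_softirq_done(q, mydev_softirq_done); */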
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 304ec73ab821..21e275d7eed9 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
156 return ret; 156 return ret;
157} 157}
158 158
159static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
160{
161 unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
162
163 return queue_var_show(set != 0, page);
164}
165
166static ssize_t
167queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
168{
169 ssize_t ret = -EINVAL;
170#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
171 unsigned long val;
172
173 ret = queue_var_store(&val, page, count);
174 spin_lock_irq(q->queue_lock);
175 if (val)
176 queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
177 else
178 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
179 spin_unlock_irq(q->queue_lock);
180#endif
181 return ret;
182}
159 183
160static struct queue_sysfs_entry queue_requests_entry = { 184static struct queue_sysfs_entry queue_requests_entry = {
161 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 185 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_nomerges_entry = {
197 .store = queue_nomerges_store, 221 .store = queue_nomerges_store,
198}; 222};
199 223
224static struct queue_sysfs_entry queue_rq_affinity_entry = {
225 .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
226 .show = queue_rq_affinity_show,
227 .store = queue_rq_affinity_store,
228};
229
200static struct attribute *default_attrs[] = { 230static struct attribute *default_attrs[] = {
201 &queue_requests_entry.attr, 231 &queue_requests_entry.attr,
202 &queue_ra_entry.attr, 232 &queue_ra_entry.attr,
@@ -205,6 +235,7 @@ static struct attribute *default_attrs[] = {
205 &queue_iosched_entry.attr, 235 &queue_iosched_entry.attr,
206 &queue_hw_sector_size_entry.attr, 236 &queue_hw_sector_size_entry.attr,
207 &queue_nomerges_entry.attr, 237 &queue_nomerges_entry.attr,
238 &queue_rq_affinity_entry.attr,
208 NULL, 239 NULL,
209}; 240};
210 241
@@ -310,7 +341,7 @@ int blk_register_queue(struct gendisk *disk)
310 if (!q->request_fn) 341 if (!q->request_fn)
311 return 0; 342 return 0;
312 343
313 ret = kobject_add(&q->kobj, kobject_get(&disk->dev.kobj), 344 ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
314 "%s", "queue"); 345 "%s", "queue");
315 if (ret < 0) 346 if (ret < 0)
316 return ret; 347 return ret;
@@ -339,6 +370,6 @@ void blk_unregister_queue(struct gendisk *disk)
339 370
340 kobject_uevent(&q->kobj, KOBJ_REMOVE); 371 kobject_uevent(&q->kobj, KOBJ_REMOVE);
341 kobject_del(&q->kobj); 372 kobject_del(&q->kobj);
342 kobject_put(&disk->dev.kobj); 373 kobject_put(&disk_to_dev(disk)->kobj);
343 } 374 }
344} 375}
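The new attribute ends up under the queue/ directory that blk_register_queue() creates, so from user space it can be toggled with something as small as this; the device name is only an example.

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/block/sda/queue/rq_affinity", "w");

        if (!f) {
                perror("rq_affinity");
                return 1;
        }
        fputs("1\n", f);        /* writing "0" clears QUEUE_FLAG_SAME_COMP again */
        fclose(f);
        return 0;
}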
diff --git a/block/blk-tag.c b/block/blk-tag.c
index ed5166fbc599..c0d419e84ce7 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -29,7 +29,7 @@ EXPORT_SYMBOL(blk_queue_find_tag);
29 * __blk_free_tags - release a given set of tag maintenance info 29 * __blk_free_tags - release a given set of tag maintenance info
30 * @bqt: the tag map to free 30 * @bqt: the tag map to free
31 * 31 *
32 * Tries to free the specified @bqt@. Returns true if it was 32 * Tries to free the specified @bqt. Returns true if it was
33 * actually freed and false if there are still references using it 33 * actually freed and false if there are still references using it
34 */ 34 */
35static int __blk_free_tags(struct blk_queue_tag *bqt) 35static int __blk_free_tags(struct blk_queue_tag *bqt)
@@ -78,7 +78,7 @@ void __blk_queue_free_tags(struct request_queue *q)
78 * blk_free_tags - release a given set of tag maintenance info 78 * blk_free_tags - release a given set of tag maintenance info
79 * @bqt: the tag map to free 79 * @bqt: the tag map to free
80 * 80 *
81 * For externally managed @bqt@ frees the map. Callers of this 81 * For externally managed @bqt frees the map. Callers of this
82 * function must guarantee to have released all the queues that 82 * function must guarantee to have released all the queues that
83 * might have been using this tag map. 83 * might have been using this tag map.
84 */ 84 */
@@ -94,7 +94,7 @@ EXPORT_SYMBOL(blk_free_tags);
94 * @q: the request queue for the device 94 * @q: the request queue for the device
95 * 95 *
96 * Notes: 96 * Notes:
97 * This is used to disabled tagged queuing to a device, yet leave 97 * This is used to disable tagged queuing to a device, yet leave
98 * queue in function. 98 * queue in function.
99 **/ 99 **/
100void blk_queue_free_tags(struct request_queue *q) 100void blk_queue_free_tags(struct request_queue *q)
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
271 * @rq: the request that has completed 271 * @rq: the request that has completed
272 * 272 *
273 * Description: 273 * Description:
274 * Typically called when end_that_request_first() returns 0, meaning 274 * Typically called when end_that_request_first() returns %0, meaning
275 * all transfers have been done for a request. It's important to call 275 * all transfers have been done for a request. It's important to call
276 * this function before end_that_request_last(), as that will put the 276 * this function before end_that_request_last(), as that will put the
277 * request back on the free list thus corrupting the internal tag list. 277 * request back on the free list thus corrupting the internal tag list.
@@ -337,6 +337,7 @@ EXPORT_SYMBOL(blk_queue_end_tag);
337int blk_queue_start_tag(struct request_queue *q, struct request *rq) 337int blk_queue_start_tag(struct request_queue *q, struct request *rq)
338{ 338{
339 struct blk_queue_tag *bqt = q->queue_tags; 339 struct blk_queue_tag *bqt = q->queue_tags;
340 unsigned max_depth, offset;
340 int tag; 341 int tag;
341 342
342 if (unlikely((rq->cmd_flags & REQ_QUEUED))) { 343 if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
@@ -350,10 +351,19 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
350 /* 351 /*
351 * Protect against shared tag maps, as we may not have exclusive 352 * Protect against shared tag maps, as we may not have exclusive
352 * access to the tag map. 353 * access to the tag map.
354 *
355 * We reserve a few tags just for sync IO, since we don't want
356 * to starve sync IO on behalf of flooding async IO.
353 */ 357 */
358 max_depth = bqt->max_depth;
359 if (rq_is_sync(rq))
360 offset = 0;
361 else
362 offset = max_depth >> 2;
363
354 do { 364 do {
355 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 365 tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
356 if (tag >= bqt->max_depth) 366 if (tag >= max_depth)
357 return 1; 367 return 1;
358 368
359 } while (test_and_set_bit_lock(tag, bqt->tag_map)); 369 } while (test_and_set_bit_lock(tag, bqt->tag_map));
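The reservation is just an offset into the bitmap scan; for a typical depth it works out as below. A worked example, not code from the patch.

/* with max_depth == 32: sync requests scan from tag 0, async requests
 * from tag 8, so a quarter of the tags can only be taken by sync I/O */
static unsigned tag_search_offset(int is_sync, unsigned max_depth)
{
        return is_sync ? 0 : max_depth >> 2;
}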
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
new file mode 100644
index 000000000000..972a63f848fb
--- /dev/null
+++ b/block/blk-timeout.c
@@ -0,0 +1,238 @@
1/*
2 * Functions related to generic timeout handling of requests.
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/blkdev.h>
7#include <linux/fault-inject.h>
8
9#include "blk.h"
10
11#ifdef CONFIG_FAIL_IO_TIMEOUT
12
13static DECLARE_FAULT_ATTR(fail_io_timeout);
14
15static int __init setup_fail_io_timeout(char *str)
16{
17 return setup_fault_attr(&fail_io_timeout, str);
18}
19__setup("fail_io_timeout=", setup_fail_io_timeout);
20
21int blk_should_fake_timeout(struct request_queue *q)
22{
23 if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
24 return 0;
25
26 return should_fail(&fail_io_timeout, 1);
27}
28
29static int __init fail_io_timeout_debugfs(void)
30{
31 return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout");
32}
33
34late_initcall(fail_io_timeout_debugfs);
35
36ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
37 char *buf)
38{
39 struct gendisk *disk = dev_to_disk(dev);
40 int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);
41
42 return sprintf(buf, "%d\n", set != 0);
43}
44
45ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
46 const char *buf, size_t count)
47{
48 struct gendisk *disk = dev_to_disk(dev);
49 int val;
50
51 if (count) {
52 struct request_queue *q = disk->queue;
53 char *p = (char *) buf;
54
55 val = simple_strtoul(p, &p, 10);
56 spin_lock_irq(q->queue_lock);
57 if (val)
58 queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
59 else
60 queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
61 spin_unlock_irq(q->queue_lock);
62 }
63
64 return count;
65}
66
67#endif /* CONFIG_FAIL_IO_TIMEOUT */
68
69/*
70 * blk_delete_timer - Delete/cancel timer for a given function.
71 * @req: request that we are canceling timer for
72 *
73 */
74void blk_delete_timer(struct request *req)
75{
76 struct request_queue *q = req->q;
77
78 /*
79 * Nothing to detach
80 */
81 if (!q->rq_timed_out_fn || !req->deadline)
82 return;
83
84 list_del_init(&req->timeout_list);
85
86 if (list_empty(&q->timeout_list))
87 del_timer(&q->timeout);
88}
89
90static void blk_rq_timed_out(struct request *req)
91{
92 struct request_queue *q = req->q;
93 enum blk_eh_timer_return ret;
94
95 ret = q->rq_timed_out_fn(req);
96 switch (ret) {
97 case BLK_EH_HANDLED:
98 __blk_complete_request(req);
99 break;
100 case BLK_EH_RESET_TIMER:
101 blk_clear_rq_complete(req);
102 blk_add_timer(req);
103 break;
104 case BLK_EH_NOT_HANDLED:
105 /*
106 * LLD handles this for now but in the future
107 * we can send a request msg to abort the command
108 * and we can move more of the generic scsi eh code to
109 * the blk layer.
110 */
111 break;
112 default:
113 printk(KERN_ERR "block: bad eh return: %d\n", ret);
114 break;
115 }
116}
117
118void blk_rq_timed_out_timer(unsigned long data)
119{
120 struct request_queue *q = (struct request_queue *) data;
121 unsigned long flags, uninitialized_var(next), next_set = 0;
122 struct request *rq, *tmp;
123
124 spin_lock_irqsave(q->queue_lock, flags);
125
126 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
127 if (time_after_eq(jiffies, rq->deadline)) {
128 list_del_init(&rq->timeout_list);
129
130 /*
131 * Check if we raced with end io completion
132 */
133 if (blk_mark_rq_complete(rq))
134 continue;
135 blk_rq_timed_out(rq);
136 }
137 if (!next_set) {
138 next = rq->deadline;
139 next_set = 1;
140 } else if (time_after(next, rq->deadline))
141 next = rq->deadline;
142 }
143
144 if (next_set && !list_empty(&q->timeout_list))
145 mod_timer(&q->timeout, round_jiffies(next));
146
147 spin_unlock_irqrestore(q->queue_lock, flags);
148}
149
150/**
151 * blk_abort_request -- Request request recovery for the specified command
152 * @req: pointer to the request of interest
153 *
154 * This function requests that the block layer start recovery for the
155 * request by deleting the timer and calling the q's timeout function.
156 * LLDDs who implement their own error recovery MAY ignore the timeout
157 * event if they generated blk_abort_req. Must hold queue lock.
158 */
159void blk_abort_request(struct request *req)
160{
161 if (blk_mark_rq_complete(req))
162 return;
163 blk_delete_timer(req);
164 blk_rq_timed_out(req);
165}
166EXPORT_SYMBOL_GPL(blk_abort_request);
167
168/**
169 * blk_add_timer - Start timeout timer for a single request
170 * @req: request that is about to start running.
171 *
172 * Notes:
173 * Each request has its own timer, and as it is added to the queue, we
174 * set up the timer. When the request completes, we cancel the timer.
175 */
176void blk_add_timer(struct request *req)
177{
178 struct request_queue *q = req->q;
179 unsigned long expiry;
180
181 if (!q->rq_timed_out_fn)
182 return;
183
184 BUG_ON(!list_empty(&req->timeout_list));
185 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
186
187 if (req->timeout)
188 req->deadline = jiffies + req->timeout;
189 else {
190 req->deadline = jiffies + q->rq_timeout;
191 /*
192 * Some LLDs, like scsi, peek at the timeout to prevent
193 * a command from being retried forever.
194 */
195 req->timeout = q->rq_timeout;
196 }
197 list_add_tail(&req->timeout_list, &q->timeout_list);
198
199 /*
200 * If the timer isn't already pending or this timeout is earlier
201 * than an existing one, modify the timer. Round to next nearest
202 * second.
203 */
204 expiry = round_jiffies(req->deadline);
205
206 /*
207 * We use ->deadline == 0 to detect whether a timer was added or
208 * not, so just increase to next jiffy for that specific case
209 */
210 if (unlikely(!req->deadline))
211 req->deadline = 1;
212
213 if (!timer_pending(&q->timeout) ||
214 time_before(expiry, q->timeout.expires))
215 mod_timer(&q->timeout, expiry);
216}
217
218/**
219 * blk_abort_queue -- Abort all request on given queue
220 * @queue: pointer to queue
221 *
222 */
223void blk_abort_queue(struct request_queue *q)
224{
225 unsigned long flags;
226 struct request *rq, *tmp;
227
228 spin_lock_irqsave(q->queue_lock, flags);
229
230 elv_abort_queue(q);
231
232 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
233 blk_abort_request(rq);
234
235 spin_unlock_irqrestore(q->queue_lock, flags);
236
237}
238EXPORT_SYMBOL_GPL(blk_abort_queue);
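Locking differs between the two exported abort helpers: blk_abort_request() documents that the queue lock must be held, while blk_abort_queue() takes it itself. A sketch of both call sites; the mydev_* names are illustrative.

#include <linux/blkdev.h>

static void mydev_abort_one(struct request_queue *q, struct request *rq)
{
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        blk_abort_request(rq);          /* runs ->rq_timed_out_fn immediately */
        spin_unlock_irqrestore(q->queue_lock, flags);
}

static void mydev_link_down(struct request_queue *q)
{
        /* called unlocked: blk_abort_queue() grabs the queue lock itself */
        blk_abort_queue(q);
}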
diff --git a/block/blk.h b/block/blk.h
index c79f30e1df52..e5c579769963 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -17,6 +17,42 @@ void __blk_queue_free_tags(struct request_queue *q);
17 17
18void blk_unplug_work(struct work_struct *work); 18void blk_unplug_work(struct work_struct *work);
19void blk_unplug_timeout(unsigned long data); 19void blk_unplug_timeout(unsigned long data);
20void blk_rq_timed_out_timer(unsigned long data);
21void blk_delete_timer(struct request *);
22void blk_add_timer(struct request *);
23
24/*
25 * Internal atomic flags for request handling
26 */
27enum rq_atomic_flags {
28 REQ_ATOM_COMPLETE = 0,
29};
30
31/*
32 * EH timer and IO completion will both attempt to 'grab' the request, make
33 * sure that only one of them succeeds
34 */
35static inline int blk_mark_rq_complete(struct request *rq)
36{
37 return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
38}
39
40static inline void blk_clear_rq_complete(struct request *rq)
41{
42 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
43}
44
45#ifdef CONFIG_FAIL_IO_TIMEOUT
46int blk_should_fake_timeout(struct request_queue *);
47ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
48ssize_t part_timeout_store(struct device *, struct device_attribute *,
49 const char *, size_t);
50#else
51static inline int blk_should_fake_timeout(struct request_queue *q)
52{
53 return 0;
54}
55#endif
20 56
21struct io_context *current_io_context(gfp_t gfp_flags, int node); 57struct io_context *current_io_context(gfp_t gfp_flags, int node);
22 58
@@ -59,4 +95,16 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
59 95
60#endif /* BLK_DEV_INTEGRITY */ 96#endif /* BLK_DEV_INTEGRITY */
61 97
98static inline int blk_cpu_to_group(int cpu)
99{
100#ifdef CONFIG_SCHED_MC
101 cpumask_t mask = cpu_coregroup_map(cpu);
102 return first_cpu(mask);
103#elif defined(CONFIG_SCHED_SMT)
104 return first_cpu(per_cpu(cpu_sibling_map, cpu));
105#else
106 return cpu;
107#endif
108}
109
62#endif 110#endif
diff --git a/block/blktrace.c b/block/blktrace.c
index eb9651ccb241..85049a7e7a17 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -111,23 +111,9 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
111 */ 111 */
112static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; 112static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
113 113
114/* 114/* The ilog2() calls fall out because they're constant */
115 * Bio action bits of interest 115#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \
116 */ 116 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) )
117static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
118
119/*
120 * More could be added as needed, taking care to increment the decrementer
121 * to get correct indexing
122 */
123#define trace_barrier_bit(rw) \
124 (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
125#define trace_sync_bit(rw) \
126 (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
127#define trace_ahead_bit(rw) \
128 (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
129#define trace_meta_bit(rw) \
130 (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
131 117
132/* 118/*
133 * The worker for the various blk_add_trace*() types. Fills out a 119 * The worker for the various blk_add_trace*() types. Fills out a
@@ -147,10 +133,11 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
147 return; 133 return;
148 134
149 what |= ddir_act[rw & WRITE]; 135 what |= ddir_act[rw & WRITE];
150 what |= bio_act[trace_barrier_bit(rw)]; 136 what |= MASK_TC_BIT(rw, BARRIER);
151 what |= bio_act[trace_sync_bit(rw)]; 137 what |= MASK_TC_BIT(rw, SYNC);
152 what |= bio_act[trace_ahead_bit(rw)]; 138 what |= MASK_TC_BIT(rw, AHEAD);
153 what |= bio_act[trace_meta_bit(rw)]; 139 what |= MASK_TC_BIT(rw, META);
140 what |= MASK_TC_BIT(rw, DISCARD);
154 141
155 pid = tsk->pid; 142 pid = tsk->pid;
156 if (unlikely(act_log_check(bt, what, sector, pid))) 143 if (unlikely(act_log_check(bt, what, sector, pid)))
@@ -382,7 +369,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
382 if (!buts->buf_size || !buts->buf_nr) 369 if (!buts->buf_size || !buts->buf_nr)
383 return -EINVAL; 370 return -EINVAL;
384 371
385 strcpy(buts->name, name); 372 strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
373 buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
386 374
387 /* 375 /*
388 * some device names have larger paths - convert the slashes 376 * some device names have larger paths - convert the slashes
diff --git a/block/bsg.c b/block/bsg.c
index 0aae8d7ba99c..56cb343c76d8 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -283,7 +283,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
283 next_rq->cmd_type = rq->cmd_type; 283 next_rq->cmd_type = rq->cmd_type;
284 284
285 dxferp = (void*)(unsigned long)hdr->din_xferp; 285 dxferp = (void*)(unsigned long)hdr->din_xferp;
286 ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len); 286 ret = blk_rq_map_user(q, next_rq, NULL, dxferp,
287 hdr->din_xfer_len, GFP_KERNEL);
287 if (ret) 288 if (ret)
288 goto out; 289 goto out;
289 } 290 }
@@ -298,7 +299,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
298 dxfer_len = 0; 299 dxfer_len = 0;
299 300
300 if (dxfer_len) { 301 if (dxfer_len) {
301 ret = blk_rq_map_user(q, rq, dxferp, dxfer_len); 302 ret = blk_rq_map_user(q, rq, NULL, dxferp, dxfer_len,
303 GFP_KERNEL);
302 if (ret) 304 if (ret)
303 goto out; 305 goto out;
304 } 306 }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1e2aff812ee2..6a062eebbd15 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -39,6 +39,7 @@ static int cfq_slice_idle = HZ / 125;
39#define CFQ_MIN_TT (2) 39#define CFQ_MIN_TT (2)
40 40
41#define CFQ_SLICE_SCALE (5) 41#define CFQ_SLICE_SCALE (5)
42#define CFQ_HW_QUEUE_MIN (5)
42 43
43#define RQ_CIC(rq) \ 44#define RQ_CIC(rq) \
44 ((struct cfq_io_context *) (rq)->elevator_private) 45 ((struct cfq_io_context *) (rq)->elevator_private)
@@ -86,7 +87,14 @@ struct cfq_data {
86 87
87 int rq_in_driver; 88 int rq_in_driver;
88 int sync_flight; 89 int sync_flight;
90
91 /*
92 * queue-depth detection
93 */
94 int rq_queued;
89 int hw_tag; 95 int hw_tag;
96 int hw_tag_samples;
97 int rq_in_driver_peak;
90 98
91 /* 99 /*
92 * idle window management 100 * idle window management
@@ -244,7 +252,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
244{ 252{
245 if (cfqd->busy_queues) { 253 if (cfqd->busy_queues) {
246 cfq_log(cfqd, "schedule dispatch"); 254 cfq_log(cfqd, "schedule dispatch");
247 kblockd_schedule_work(&cfqd->unplug_work); 255 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
248 } 256 }
249} 257}
250 258
@@ -654,15 +662,6 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
654 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 662 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
655 cfqd->rq_in_driver); 663 cfqd->rq_in_driver);
656 664
657 /*
658 * If the depth is larger 1, it really could be queueing. But lets
659 * make the mark a little higher - idling could still be good for
660 * low queueing, and a low queueing number could also just indicate
661 * a SCSI mid layer like behaviour where limit+1 is often seen.
662 */
663 if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
664 cfqd->hw_tag = 1;
665
666 cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; 665 cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
667} 666}
668 667
@@ -686,6 +685,7 @@ static void cfq_remove_request(struct request *rq)
686 list_del_init(&rq->queuelist); 685 list_del_init(&rq->queuelist);
687 cfq_del_rq_rb(rq); 686 cfq_del_rq_rb(rq);
688 687
688 cfqq->cfqd->rq_queued--;
689 if (rq_is_meta(rq)) { 689 if (rq_is_meta(rq)) {
690 WARN_ON(!cfqq->meta_pending); 690 WARN_ON(!cfqq->meta_pending);
691 cfqq->meta_pending--; 691 cfqq->meta_pending--;
@@ -878,6 +878,14 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
878 struct cfq_io_context *cic; 878 struct cfq_io_context *cic;
879 unsigned long sl; 879 unsigned long sl;
880 880
881 /*
882 * SSD device without seek penalty, disable idling. But only do so
883 * for devices that support queuing, otherwise we still have a problem
884 * with sync vs async workloads.
885 */
886 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
887 return;
888
881 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 889 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
882 WARN_ON(cfq_cfqq_slice_new(cfqq)); 890 WARN_ON(cfq_cfqq_slice_new(cfqq));
883 891
@@ -1833,6 +1841,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1833{ 1841{
1834 struct cfq_io_context *cic = RQ_CIC(rq); 1842 struct cfq_io_context *cic = RQ_CIC(rq);
1835 1843
1844 cfqd->rq_queued++;
1836 if (rq_is_meta(rq)) 1845 if (rq_is_meta(rq))
1837 cfqq->meta_pending++; 1846 cfqq->meta_pending++;
1838 1847
@@ -1880,6 +1889,31 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
1880 cfq_rq_enqueued(cfqd, cfqq, rq); 1889 cfq_rq_enqueued(cfqd, cfqq, rq);
1881} 1890}
1882 1891
1892/*
1893 * Update hw_tag based on peak queue depth over 50 samples under
1894 * sufficient load.
1895 */
1896static void cfq_update_hw_tag(struct cfq_data *cfqd)
1897{
1898 if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
1899 cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
1900
1901 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
1902 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
1903 return;
1904
1905 if (cfqd->hw_tag_samples++ < 50)
1906 return;
1907
1908 if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
1909 cfqd->hw_tag = 1;
1910 else
1911 cfqd->hw_tag = 0;
1912
1913 cfqd->hw_tag_samples = 0;
1914 cfqd->rq_in_driver_peak = 0;
1915}
1916
1883static void cfq_completed_request(struct request_queue *q, struct request *rq) 1917static void cfq_completed_request(struct request_queue *q, struct request *rq)
1884{ 1918{
1885 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1919 struct cfq_queue *cfqq = RQ_CFQQ(rq);
@@ -1890,6 +1924,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
1890 now = jiffies; 1924 now = jiffies;
1891 cfq_log_cfqq(cfqd, cfqq, "complete"); 1925 cfq_log_cfqq(cfqd, cfqq, "complete");
1892 1926
1927 cfq_update_hw_tag(cfqd);
1928
1893 WARN_ON(!cfqd->rq_in_driver); 1929 WARN_ON(!cfqd->rq_in_driver);
1894 WARN_ON(!cfqq->dispatched); 1930 WARN_ON(!cfqq->dispatched);
1895 cfqd->rq_in_driver--; 1931 cfqd->rq_in_driver--;
@@ -2200,6 +2236,7 @@ static void *cfq_init_queue(struct request_queue *q)
2200 cfqd->cfq_slice[1] = cfq_slice_sync; 2236 cfqd->cfq_slice[1] = cfq_slice_sync;
2201 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 2237 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2202 cfqd->cfq_slice_idle = cfq_slice_idle; 2238 cfqd->cfq_slice_idle = cfq_slice_idle;
2239 cfqd->hw_tag = 1;
2203 2240
2204 return cfqd; 2241 return cfqd;
2205} 2242}
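
Taken together, the cfq hunks replace the old one-shot heuristic (set hw_tag once the driver depth ever exceeded 4) with continuous sampling: every completion calls cfq_update_hw_tag(), which records the peak driver depth and re-evaluates hw_tag after 50 samples taken under sufficient load, and hw_tag now starts at 1 so idling is not disabled on SSDs before any evidence has accumulated. A standalone sketch of the same sampling decision, with hypothetical structure and constant names:

#define HW_QUEUE_MIN	5	/* assumed threshold, mirrors CFQ_HW_QUEUE_MIN */

struct depth_sampler {
	int in_driver;	/* requests currently issued to the device */
	int queued;	/* requests still held by the scheduler */
	int peak;
	int samples;
	int hw_tag;	/* result: device appears to queue internally */
};

/* call once per request completion */
static void sample_queue_depth(struct depth_sampler *s)
{
	if (s->in_driver > s->peak)
		s->peak = s->in_driver;

	/* only count samples taken under reasonable load */
	if (s->queued <= HW_QUEUE_MIN && s->in_driver <= HW_QUEUE_MIN)
		return;

	if (++s->samples < 50)
		return;

	s->hw_tag = (s->peak >= HW_QUEUE_MIN);
	s->samples = 0;
	s->peak = 0;
}

With hw_tag maintained this way, the cfq_arm_slice_timer() hunk above can skip idling on non-rotational devices that really do queue, without penalising SSDs behind non-queuing controllers.
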
diff --git a/block/cmd-filter.c b/block/cmd-filter.c
index 79c14996ac11..e669aed4c6bc 100644
--- a/block/cmd-filter.c
+++ b/block/cmd-filter.c
@@ -211,14 +211,10 @@ int blk_register_filter(struct gendisk *disk)
211{ 211{
212 int ret; 212 int ret;
213 struct blk_cmd_filter *filter = &disk->queue->cmd_filter; 213 struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
214 struct kobject *parent = kobject_get(disk->holder_dir->parent);
215 214
216 if (!parent) 215 ret = kobject_init_and_add(&filter->kobj, &rcf_ktype,
217 return -ENODEV; 216 &disk_to_dev(disk)->kobj,
218
219 ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent,
220 "%s", "cmd_filter"); 217 "%s", "cmd_filter");
221
222 if (ret < 0) 218 if (ret < 0)
223 return ret; 219 return ret;
224 220
@@ -231,7 +227,6 @@ void blk_unregister_filter(struct gendisk *disk)
231 struct blk_cmd_filter *filter = &disk->queue->cmd_filter; 227 struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
232 228
233 kobject_put(&filter->kobj); 229 kobject_put(&filter->kobj);
234 kobject_put(disk->holder_dir->parent);
235} 230}
236EXPORT_SYMBOL(blk_unregister_filter); 231EXPORT_SYMBOL(blk_unregister_filter);
237#endif 232#endif
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index c23177e4623f..1e559fba7bdf 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -788,6 +788,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
788 return compat_hdio_getgeo(disk, bdev, compat_ptr(arg)); 788 return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
789 case BLKFLSBUF: 789 case BLKFLSBUF:
790 case BLKROSET: 790 case BLKROSET:
791 case BLKDISCARD:
791 /* 792 /*
792 * the ones below are implemented in blkdev_locked_ioctl, 793 * the ones below are implemented in blkdev_locked_ioctl,
793 * but we call blkdev_ioctl, which gets the lock for us 794 * but we call blkdev_ioctl, which gets the lock for us
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 342448c3d2dd..fd311179f44c 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -33,7 +33,7 @@ struct deadline_data {
33 */ 33 */
34 struct rb_root sort_list[2]; 34 struct rb_root sort_list[2];
35 struct list_head fifo_list[2]; 35 struct list_head fifo_list[2];
36 36
37 /* 37 /*
38 * next in sort order. read, write or both are NULL 38 * next in sort order. read, write or both are NULL
39 */ 39 */
@@ -53,7 +53,11 @@ struct deadline_data {
53 53
54static void deadline_move_request(struct deadline_data *, struct request *); 54static void deadline_move_request(struct deadline_data *, struct request *);
55 55
56#define RQ_RB_ROOT(dd, rq) (&(dd)->sort_list[rq_data_dir((rq))]) 56static inline struct rb_root *
57deadline_rb_root(struct deadline_data *dd, struct request *rq)
58{
59 return &dd->sort_list[rq_data_dir(rq)];
60}
57 61
58/* 62/*
59 * get the request after `rq' in sector-sorted order 63 * get the request after `rq' in sector-sorted order
@@ -72,15 +76,11 @@ deadline_latter_request(struct request *rq)
72static void 76static void
73deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) 77deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
74{ 78{
75 struct rb_root *root = RQ_RB_ROOT(dd, rq); 79 struct rb_root *root = deadline_rb_root(dd, rq);
76 struct request *__alias; 80 struct request *__alias;
77 81
78retry: 82 while (unlikely(__alias = elv_rb_add(root, rq)))
79 __alias = elv_rb_add(root, rq);
80 if (unlikely(__alias)) {
81 deadline_move_request(dd, __alias); 83 deadline_move_request(dd, __alias);
82 goto retry;
83 }
84} 84}
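
The retry label is gone because elv_rb_add() already expresses the needed contract: it either inserts the request and returns NULL, or returns the request occupying the same sort key, which deadline then moves out of the tree before trying again. A toy userspace sketch of that insert-or-return-alias pattern (a linked list stands in for the rbtree, dispatch() for deadline_move_request()):

#include <stddef.h>

struct req { unsigned long long sector; struct req *next; };

/* insert rq unless a request with the same sector exists; return the alias */
static struct req *add_unique(struct req **head, struct req *rq)
{
	struct req *p;

	for (p = *head; p; p = p->next)
		if (p->sector == rq->sector)
			return p;
	rq->next = *head;
	*head = rq;
	return NULL;
}

/* stand-in for deadline_move_request(): take the alias off the sort list */
static void dispatch(struct req **head, struct req *rq)
{
	struct req **pp;

	for (pp = head; *pp; pp = &(*pp)->next)
		if (*pp == rq) {
			*pp = rq->next;
			break;
		}
}

static void add_request(struct req **head, struct req *rq)
{
	struct req *alias;

	while ((alias = add_unique(head, rq)))	/* same shape as the new loop */
		dispatch(head, alias);
}
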
85 85
86static inline void 86static inline void
@@ -91,7 +91,7 @@ deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
91 if (dd->next_rq[data_dir] == rq) 91 if (dd->next_rq[data_dir] == rq)
92 dd->next_rq[data_dir] = deadline_latter_request(rq); 92 dd->next_rq[data_dir] = deadline_latter_request(rq);
93 93
94 elv_rb_del(RQ_RB_ROOT(dd, rq), rq); 94 elv_rb_del(deadline_rb_root(dd, rq), rq);
95} 95}
96 96
97/* 97/*
@@ -106,7 +106,7 @@ deadline_add_request(struct request_queue *q, struct request *rq)
106 deadline_add_rq_rb(dd, rq); 106 deadline_add_rq_rb(dd, rq);
107 107
108 /* 108 /*
109 * set expire time (only used for reads) and add to fifo list 109 * set expire time and add to fifo list
110 */ 110 */
111 rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); 111 rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
112 list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); 112 list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
@@ -162,7 +162,7 @@ static void deadline_merged_request(struct request_queue *q,
162 * if the merge was a front merge, we need to reposition request 162 * if the merge was a front merge, we need to reposition request
163 */ 163 */
164 if (type == ELEVATOR_FRONT_MERGE) { 164 if (type == ELEVATOR_FRONT_MERGE) {
165 elv_rb_del(RQ_RB_ROOT(dd, req), req); 165 elv_rb_del(deadline_rb_root(dd, req), req);
166 deadline_add_rq_rb(dd, req); 166 deadline_add_rq_rb(dd, req);
167 } 167 }
168} 168}
@@ -212,7 +212,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
212 dd->next_rq[WRITE] = NULL; 212 dd->next_rq[WRITE] = NULL;
213 dd->next_rq[data_dir] = deadline_latter_request(rq); 213 dd->next_rq[data_dir] = deadline_latter_request(rq);
214 214
215 dd->last_sector = rq->sector + rq->nr_sectors; 215 dd->last_sector = rq_end_sector(rq);
216 216
217 /* 217 /*
218 * take it off the sort and fifo list, move 218 * take it off the sort and fifo list, move
@@ -222,7 +222,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
222} 222}
223 223
224/* 224/*
225 * deadline_check_fifo returns 0 if there are no expired reads on the fifo, 225 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
226 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) 226 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
227 */ 227 */
228static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) 228static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
@@ -258,17 +258,9 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
258 else 258 else
259 rq = dd->next_rq[READ]; 259 rq = dd->next_rq[READ];
260 260
261 if (rq) { 261 if (rq && dd->batching < dd->fifo_batch)
262 /* we have a "next request" */ 262 /* we have a next request are still entitled to batch */
263 263 goto dispatch_request;
264 if (dd->last_sector != rq->sector)
265 /* end the batch on a non sequential request */
266 dd->batching += dd->fifo_batch;
267
268 if (dd->batching < dd->fifo_batch)
269 /* we are still entitled to batch */
270 goto dispatch_request;
271 }
272 264
273 /* 265 /*
274 * at this point we are not running a batch. select the appropriate 266 * at this point we are not running a batch. select the appropriate
diff --git a/block/elevator.c b/block/elevator.c
index ed6f8f32d27e..04518921db31 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -34,8 +34,9 @@
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/blktrace_api.h> 35#include <linux/blktrace_api.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/uaccess.h>
37 38
38#include <asm/uaccess.h> 39#include "blk.h"
39 40
40static DEFINE_SPINLOCK(elv_list_lock); 41static DEFINE_SPINLOCK(elv_list_lock);
41static LIST_HEAD(elv_list); 42static LIST_HEAD(elv_list);
@@ -75,6 +76,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
75 return 0; 76 return 0;
76 77
77 /* 78 /*
79 * Don't merge file system requests and discard requests
80 */
81 if (bio_discard(bio) != bio_discard(rq->bio))
82 return 0;
83
84 /*
78 * different data direction or already started, don't merge 85 * different data direction or already started, don't merge
79 */ 86 */
80 if (bio_data_dir(bio) != rq_data_dir(rq)) 87 if (bio_data_dir(bio) != rq_data_dir(rq))
@@ -438,6 +445,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
438 list_for_each_prev(entry, &q->queue_head) { 445 list_for_each_prev(entry, &q->queue_head) {
439 struct request *pos = list_entry_rq(entry); 446 struct request *pos = list_entry_rq(entry);
440 447
448 if (blk_discard_rq(rq) != blk_discard_rq(pos))
449 break;
441 if (rq_data_dir(rq) != rq_data_dir(pos)) 450 if (rq_data_dir(rq) != rq_data_dir(pos))
442 break; 451 break;
443 if (pos->cmd_flags & stop_flags) 452 if (pos->cmd_flags & stop_flags)
@@ -607,7 +616,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
607 break; 616 break;
608 617
609 case ELEVATOR_INSERT_SORT: 618 case ELEVATOR_INSERT_SORT:
610 BUG_ON(!blk_fs_request(rq)); 619 BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
611 rq->cmd_flags |= REQ_SORTED; 620 rq->cmd_flags |= REQ_SORTED;
612 q->nr_sorted++; 621 q->nr_sorted++;
613 if (rq_mergeable(rq)) { 622 if (rq_mergeable(rq)) {
@@ -692,7 +701,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
692 * this request is scheduling boundary, update 701 * this request is scheduling boundary, update
693 * end_sector 702 * end_sector
694 */ 703 */
695 if (blk_fs_request(rq)) { 704 if (blk_fs_request(rq) || blk_discard_rq(rq)) {
696 q->end_sector = rq_end_sector(rq); 705 q->end_sector = rq_end_sector(rq);
697 q->boundary_rq = rq; 706 q->boundary_rq = rq;
698 } 707 }
@@ -745,7 +754,7 @@ struct request *elv_next_request(struct request_queue *q)
745 * not ever see it. 754 * not ever see it.
746 */ 755 */
747 if (blk_empty_barrier(rq)) { 756 if (blk_empty_barrier(rq)) {
748 end_queued_request(rq, 1); 757 __blk_end_request(rq, 0, blk_rq_bytes(rq));
749 continue; 758 continue;
750 } 759 }
751 if (!(rq->cmd_flags & REQ_STARTED)) { 760 if (!(rq->cmd_flags & REQ_STARTED)) {
@@ -764,6 +773,12 @@ struct request *elv_next_request(struct request_queue *q)
764 */ 773 */
765 rq->cmd_flags |= REQ_STARTED; 774 rq->cmd_flags |= REQ_STARTED;
766 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 775 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
776
777 /*
778 * We are now handing the request to the hardware,
779 * add the timeout handler
780 */
781 blk_add_timer(rq);
767 } 782 }
768 783
769 if (!q->boundary_rq || q->boundary_rq == rq) { 784 if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -782,7 +797,6 @@ struct request *elv_next_request(struct request_queue *q)
782 * device can handle 797 * device can handle
783 */ 798 */
784 rq->nr_phys_segments++; 799 rq->nr_phys_segments++;
785 rq->nr_hw_segments++;
786 } 800 }
787 801
788 if (!q->prep_rq_fn) 802 if (!q->prep_rq_fn)
@@ -805,14 +819,13 @@ struct request *elv_next_request(struct request_queue *q)
805 * so that we don't add it again 819 * so that we don't add it again
806 */ 820 */
807 --rq->nr_phys_segments; 821 --rq->nr_phys_segments;
808 --rq->nr_hw_segments;
809 } 822 }
810 823
811 rq = NULL; 824 rq = NULL;
812 break; 825 break;
813 } else if (ret == BLKPREP_KILL) { 826 } else if (ret == BLKPREP_KILL) {
814 rq->cmd_flags |= REQ_QUIET; 827 rq->cmd_flags |= REQ_QUIET;
815 end_queued_request(rq, 0); 828 __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
816 } else { 829 } else {
817 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); 830 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
818 break; 831 break;
@@ -901,6 +914,19 @@ int elv_may_queue(struct request_queue *q, int rw)
901 return ELV_MQUEUE_MAY; 914 return ELV_MQUEUE_MAY;
902} 915}
903 916
917void elv_abort_queue(struct request_queue *q)
918{
919 struct request *rq;
920
921 while (!list_empty(&q->queue_head)) {
922 rq = list_entry_rq(q->queue_head.next);
923 rq->cmd_flags |= REQ_QUIET;
924 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
925 __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
926 }
927}
928EXPORT_SYMBOL(elv_abort_queue);
929
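
elv_abort_queue() gives drivers a single call to fail everything still sitting on the dispatch list, typically after the hardware has been declared dead. A hedged usage sketch; the driver function name is made up, and it assumes the usual convention that the queue lock is held while the list is manipulated:

/* kernel context; assumes <linux/blkdev.h> */
static void mydrv_fail_all_pending(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	elv_abort_queue(q);	/* each queued request completes with -EIO */
	spin_unlock_irqrestore(q->queue_lock, flags);
}
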
904void elv_completed_request(struct request_queue *q, struct request *rq) 930void elv_completed_request(struct request_queue *q, struct request *rq)
905{ 931{
906 elevator_t *e = q->elevator; 932 elevator_t *e = q->elevator;
diff --git a/block/genhd.c b/block/genhd.c
index e0ce23ac2ece..4cd3433c99ac 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -16,6 +16,7 @@
16#include <linux/kobj_map.h> 16#include <linux/kobj_map.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/idr.h>
19 20
20#include "blk.h" 21#include "blk.h"
21 22
@@ -24,8 +25,194 @@ static DEFINE_MUTEX(block_class_lock);
24struct kobject *block_depr; 25struct kobject *block_depr;
25#endif 26#endif
26 27
28/* for extended dynamic devt allocation, currently only one major is used */
29#define MAX_EXT_DEVT (1 << MINORBITS)
30
31/* For extended devt allocation. ext_devt_mutex prevents look up
32 * results from going away underneath its user.
33 */
34static DEFINE_MUTEX(ext_devt_mutex);
35static DEFINE_IDR(ext_devt_idr);
36
27static struct device_type disk_type; 37static struct device_type disk_type;
28 38
39/**
40 * disk_get_part - get partition
 41 * @disk: disk to look the partition up from
42 * @partno: partition number
43 *
44 * Look for partition @partno from @disk. If found, increment
45 * reference count and return it.
46 *
47 * CONTEXT:
48 * Don't care.
49 *
50 * RETURNS:
51 * Pointer to the found partition on success, NULL if not found.
52 */
53struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
54{
55 struct hd_struct *part = NULL;
56 struct disk_part_tbl *ptbl;
57
58 if (unlikely(partno < 0))
59 return NULL;
60
61 rcu_read_lock();
62
63 ptbl = rcu_dereference(disk->part_tbl);
64 if (likely(partno < ptbl->len)) {
65 part = rcu_dereference(ptbl->part[partno]);
66 if (part)
67 get_device(part_to_dev(part));
68 }
69
70 rcu_read_unlock();
71
72 return part;
73}
74EXPORT_SYMBOL_GPL(disk_get_part);
75
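
Since disk_get_part() returns the partition with a device reference held, every successful lookup has to be balanced by disk_put_part(). A minimal, hedged accessor sketch:

static sector_t part_sectors(struct gendisk *disk, int partno)
{
	struct hd_struct *part = disk_get_part(disk, partno);
	sector_t nr = 0;

	if (part) {
		nr = part->nr_sects;
		disk_put_part(part);	/* drop the reference taken above */
	}
	return nr;
}
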
76/**
77 * disk_part_iter_init - initialize partition iterator
78 * @piter: iterator to initialize
79 * @disk: disk to iterate over
80 * @flags: DISK_PITER_* flags
81 *
82 * Initialize @piter so that it iterates over partitions of @disk.
83 *
84 * CONTEXT:
85 * Don't care.
86 */
87void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
88 unsigned int flags)
89{
90 struct disk_part_tbl *ptbl;
91
92 rcu_read_lock();
93 ptbl = rcu_dereference(disk->part_tbl);
94
95 piter->disk = disk;
96 piter->part = NULL;
97
98 if (flags & DISK_PITER_REVERSE)
99 piter->idx = ptbl->len - 1;
100 else if (flags & DISK_PITER_INCL_PART0)
101 piter->idx = 0;
102 else
103 piter->idx = 1;
104
105 piter->flags = flags;
106
107 rcu_read_unlock();
108}
109EXPORT_SYMBOL_GPL(disk_part_iter_init);
110
111/**
112 * disk_part_iter_next - proceed iterator to the next partition and return it
113 * @piter: iterator of interest
114 *
115 * Proceed @piter to the next partition and return it.
116 *
117 * CONTEXT:
118 * Don't care.
119 */
120struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
121{
122 struct disk_part_tbl *ptbl;
123 int inc, end;
124
125 /* put the last partition */
126 disk_put_part(piter->part);
127 piter->part = NULL;
128
129 /* get part_tbl */
130 rcu_read_lock();
131 ptbl = rcu_dereference(piter->disk->part_tbl);
132
133 /* determine iteration parameters */
134 if (piter->flags & DISK_PITER_REVERSE) {
135 inc = -1;
136 if (piter->flags & DISK_PITER_INCL_PART0)
137 end = -1;
138 else
139 end = 0;
140 } else {
141 inc = 1;
142 end = ptbl->len;
143 }
144
145 /* iterate to the next partition */
146 for (; piter->idx != end; piter->idx += inc) {
147 struct hd_struct *part;
148
149 part = rcu_dereference(ptbl->part[piter->idx]);
150 if (!part)
151 continue;
152 if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects)
153 continue;
154
155 get_device(part_to_dev(part));
156 piter->part = part;
157 piter->idx += inc;
158 break;
159 }
160
161 rcu_read_unlock();
162
163 return piter->part;
164}
165EXPORT_SYMBOL_GPL(disk_part_iter_next);
166
167/**
168 * disk_part_iter_exit - finish up partition iteration
169 * @piter: iter of interest
170 *
171 * Called when iteration is over. Cleans up @piter.
172 *
173 * CONTEXT:
174 * Don't care.
175 */
176void disk_part_iter_exit(struct disk_part_iter *piter)
177{
178 disk_put_part(piter->part);
179 piter->part = NULL;
180}
181EXPORT_SYMBOL_GPL(disk_part_iter_exit);
182
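
The three helpers are meant to be used as an init/next/exit triplet: disk_part_iter_next() hands back each partition with a reference held and drops it on the following call, while disk_part_iter_exit() drops whatever is still held when the loop ends early. A hedged sketch of the canonical loop (the capacity sum is only an example):

static sector_t sum_partition_sectors(struct gendisk *disk)
{
	struct disk_part_iter piter;
	struct hd_struct *part;
	sector_t total = 0;

	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
	while ((part = disk_part_iter_next(&piter)))
		total += part->nr_sects;
	disk_part_iter_exit(&piter);

	return total;
}
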
183/**
184 * disk_map_sector_rcu - map sector to partition
185 * @disk: gendisk of interest
186 * @sector: sector to map
187 *
188 * Find out which partition @sector maps to on @disk. This is
189 * primarily used for stats accounting.
190 *
191 * CONTEXT:
192 * RCU read locked. The returned partition pointer is valid only
193 * while preemption is disabled.
194 *
195 * RETURNS:
196 * Found partition on success, part0 is returned if no partition matches
197 */
198struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
199{
200 struct disk_part_tbl *ptbl;
201 int i;
202
203 ptbl = rcu_dereference(disk->part_tbl);
204
205 for (i = 1; i < ptbl->len; i++) {
206 struct hd_struct *part = rcu_dereference(ptbl->part[i]);
207
208 if (part && part->start_sect <= sector &&
209 sector < part->start_sect + part->nr_sects)
210 return part;
211 }
212 return &disk->part0;
213}
214EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
215
29/* 216/*
30 * Can be deleted altogether. Later. 217 * Can be deleted altogether. Later.
31 * 218 *
@@ -43,14 +230,14 @@ static inline int major_to_index(int major)
43} 230}
44 231
45#ifdef CONFIG_PROC_FS 232#ifdef CONFIG_PROC_FS
46void blkdev_show(struct seq_file *f, off_t offset) 233void blkdev_show(struct seq_file *seqf, off_t offset)
47{ 234{
48 struct blk_major_name *dp; 235 struct blk_major_name *dp;
49 236
50 if (offset < BLKDEV_MAJOR_HASH_SIZE) { 237 if (offset < BLKDEV_MAJOR_HASH_SIZE) {
51 mutex_lock(&block_class_lock); 238 mutex_lock(&block_class_lock);
52 for (dp = major_names[offset]; dp; dp = dp->next) 239 for (dp = major_names[offset]; dp; dp = dp->next)
53 seq_printf(f, "%3d %s\n", dp->major, dp->name); 240 seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
54 mutex_unlock(&block_class_lock); 241 mutex_unlock(&block_class_lock);
55 } 242 }
56} 243}
@@ -136,6 +323,118 @@ EXPORT_SYMBOL(unregister_blkdev);
136 323
137static struct kobj_map *bdev_map; 324static struct kobj_map *bdev_map;
138 325
326/**
327 * blk_mangle_minor - scatter minor numbers apart
328 * @minor: minor number to mangle
329 *
330 * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
331 * is enabled. Mangling twice gives the original value.
332 *
333 * RETURNS:
334 * Mangled value.
335 *
336 * CONTEXT:
337 * Don't care.
338 */
339static int blk_mangle_minor(int minor)
340{
341#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
342 int i;
343
344 for (i = 0; i < MINORBITS / 2; i++) {
345 int low = minor & (1 << i);
346 int high = minor & (1 << (MINORBITS - 1 - i));
347 int distance = MINORBITS - 1 - 2 * i;
348
349 minor ^= low | high; /* clear both bits */
350 low <<= distance; /* swap the positions */
351 high >>= distance;
352 minor |= low | high; /* and set */
353 }
354#endif
355 return minor;
356}
357
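
Because iteration i swaps bit i with bit (MINORBITS - 1 - i), applying the transform twice restores the original value, which is why the same function serves both allocation and lookup. A hedged userspace self-check of that property (MINORBITS assumed to be 20, as in <linux/kdev_t.h>):

#include <assert.h>

#define MINORBITS 20	/* assumption: matches the kernel definition */

static int mangle(int minor)
{
	int i;

	for (i = 0; i < MINORBITS / 2; i++) {
		int low = minor & (1 << i);
		int high = minor & (1 << (MINORBITS - 1 - i));
		int distance = MINORBITS - 1 - 2 * i;

		minor ^= low | high;	/* clear both bits */
		low <<= distance;	/* swap the positions */
		high >>= distance;
		minor |= low | high;	/* and set */
	}
	return minor;
}

int main(void)
{
	int m;

	for (m = 0; m < (1 << MINORBITS); m++)
		assert(mangle(mangle(m)) == m);	/* mangling is an involution */
	return 0;
}
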
358/**
359 * blk_alloc_devt - allocate a dev_t for a partition
360 * @part: partition to allocate dev_t for
361 * @gfp_mask: memory allocation flag
362 * @devt: out parameter for resulting dev_t
363 *
364 * Allocate a dev_t for block device.
365 *
366 * RETURNS:
367 * 0 on success, allocated dev_t is returned in *@devt. -errno on
368 * failure.
369 *
370 * CONTEXT:
371 * Might sleep.
372 */
373int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
374{
375 struct gendisk *disk = part_to_disk(part);
376 int idx, rc;
377
378 /* in consecutive minor range? */
379 if (part->partno < disk->minors) {
380 *devt = MKDEV(disk->major, disk->first_minor + part->partno);
381 return 0;
382 }
383
384 /* allocate ext devt */
385 do {
386 if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL))
387 return -ENOMEM;
388 rc = idr_get_new(&ext_devt_idr, part, &idx);
389 } while (rc == -EAGAIN);
390
391 if (rc)
392 return rc;
393
394 if (idx > MAX_EXT_DEVT) {
395 idr_remove(&ext_devt_idr, idx);
396 return -EBUSY;
397 }
398
399 *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
400 return 0;
401}
402
403/**
404 * blk_free_devt - free a dev_t
405 * @devt: dev_t to free
406 *
407 * Free @devt which was allocated using blk_alloc_devt().
408 *
409 * CONTEXT:
410 * Might sleep.
411 */
412void blk_free_devt(dev_t devt)
413{
414 might_sleep();
415
416 if (devt == MKDEV(0, 0))
417 return;
418
419 if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
420 mutex_lock(&ext_devt_mutex);
421 idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
422 mutex_unlock(&ext_devt_mutex);
423 }
424}
425
426static char *bdevt_str(dev_t devt, char *buf)
427{
428 if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
429 char tbuf[BDEVT_SIZE];
430 snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
431 snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
432 } else
433 snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
434
435 return buf;
436}
437
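
As a worked example of the format (assuming BLOCK_EXT_MAJOR is 259): a classic devt such as MKDEV(8, 0) takes the first branch and prints the familiar four-hex-digit form padded to nine columns ("0800     "), while an extended devt like MKDEV(259, 18) takes the second branch and prints as "103:00012", major and minor both in hex.
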
139/* 438/*
140 * Register device numbers dev..(dev+range-1) 439 * Register device numbers dev..(dev+range-1)
141 * range must be nonzero 440 * range must be nonzero
@@ -157,11 +456,11 @@ void blk_unregister_region(dev_t devt, unsigned long range)
157 456
158EXPORT_SYMBOL(blk_unregister_region); 457EXPORT_SYMBOL(blk_unregister_region);
159 458
160static struct kobject *exact_match(dev_t devt, int *part, void *data) 459static struct kobject *exact_match(dev_t devt, int *partno, void *data)
161{ 460{
162 struct gendisk *p = data; 461 struct gendisk *p = data;
163 462
164 return &p->dev.kobj; 463 return &disk_to_dev(p)->kobj;
165} 464}
166 465
167static int exact_lock(dev_t devt, void *data) 466static int exact_lock(dev_t devt, void *data)
@@ -179,21 +478,46 @@ static int exact_lock(dev_t devt, void *data)
179 * 478 *
180 * This function registers the partitioning information in @disk 479 * This function registers the partitioning information in @disk
181 * with the kernel. 480 * with the kernel.
481 *
482 * FIXME: error handling
182 */ 483 */
183void add_disk(struct gendisk *disk) 484void add_disk(struct gendisk *disk)
184{ 485{
185 struct backing_dev_info *bdi; 486 struct backing_dev_info *bdi;
487 dev_t devt;
186 int retval; 488 int retval;
187 489
490 /* minors == 0 indicates to use ext devt from part0 and should
 491 * be accompanied by the EXT_DEVT flag. Make sure all
492 * parameters make sense.
493 */
494 WARN_ON(disk->minors && !(disk->major || disk->first_minor));
495 WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
496
188 disk->flags |= GENHD_FL_UP; 497 disk->flags |= GENHD_FL_UP;
189 blk_register_region(MKDEV(disk->major, disk->first_minor), 498
190 disk->minors, NULL, exact_match, exact_lock, disk); 499 retval = blk_alloc_devt(&disk->part0, &devt);
500 if (retval) {
501 WARN_ON(1);
502 return;
503 }
504 disk_to_dev(disk)->devt = devt;
505
506 /* ->major and ->first_minor aren't supposed to be
507 * dereferenced from here on, but set them just in case.
508 */
509 disk->major = MAJOR(devt);
510 disk->first_minor = MINOR(devt);
511
512 blk_register_region(disk_devt(disk), disk->minors, NULL,
513 exact_match, exact_lock, disk);
191 register_disk(disk); 514 register_disk(disk);
192 blk_register_queue(disk); 515 blk_register_queue(disk);
193 516
194 bdi = &disk->queue->backing_dev_info; 517 bdi = &disk->queue->backing_dev_info;
195 bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor)); 518 bdi_register_dev(bdi, disk_devt(disk));
196 retval = sysfs_create_link(&disk->dev.kobj, &bdi->dev->kobj, "bdi"); 519 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
520 "bdi");
197 WARN_ON(retval); 521 WARN_ON(retval);
198} 522}
199 523
@@ -202,78 +526,71 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
202 526
203void unlink_gendisk(struct gendisk *disk) 527void unlink_gendisk(struct gendisk *disk)
204{ 528{
205 sysfs_remove_link(&disk->dev.kobj, "bdi"); 529 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
206 bdi_unregister(&disk->queue->backing_dev_info); 530 bdi_unregister(&disk->queue->backing_dev_info);
207 blk_unregister_queue(disk); 531 blk_unregister_queue(disk);
208 blk_unregister_region(MKDEV(disk->major, disk->first_minor), 532 blk_unregister_region(disk_devt(disk), disk->minors);
209 disk->minors);
210} 533}
211 534
212/** 535/**
213 * get_gendisk - get partitioning information for a given device 536 * get_gendisk - get partitioning information for a given device
214 * @dev: device to get partitioning information for 537 * @devt: device to get partitioning information for
538 * @part: returned partition index
215 * 539 *
216 * This function gets the structure containing partitioning 540 * This function gets the structure containing partitioning
217 * information for the given device @dev. 541 * information for the given device @devt.
218 */ 542 */
219struct gendisk *get_gendisk(dev_t devt, int *part) 543struct gendisk *get_gendisk(dev_t devt, int *partno)
220{ 544{
221 struct kobject *kobj = kobj_lookup(bdev_map, devt, part); 545 struct gendisk *disk = NULL;
222 struct device *dev = kobj_to_dev(kobj); 546
547 if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
548 struct kobject *kobj;
549
550 kobj = kobj_lookup(bdev_map, devt, partno);
551 if (kobj)
552 disk = dev_to_disk(kobj_to_dev(kobj));
553 } else {
554 struct hd_struct *part;
223 555
224 return kobj ? dev_to_disk(dev) : NULL; 556 mutex_lock(&ext_devt_mutex);
557 part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
558 if (part && get_disk(part_to_disk(part))) {
559 *partno = part->partno;
560 disk = part_to_disk(part);
561 }
562 mutex_unlock(&ext_devt_mutex);
563 }
564
565 return disk;
225} 566}
226 567
227/* 568/**
228 * print a partitions - intended for places where the root filesystem can't be 569 * bdget_disk - do bdget() by gendisk and partition number
229 * mounted and thus to give the victim some idea of what went wrong 570 * @disk: gendisk of interest
571 * @partno: partition number
572 *
573 * Find partition @partno from @disk, do bdget() on it.
574 *
575 * CONTEXT:
576 * Don't care.
577 *
578 * RETURNS:
579 * Resulting block_device on success, NULL on failure.
230 */ 580 */
231static int printk_partition(struct device *dev, void *data) 581struct block_device *bdget_disk(struct gendisk *disk, int partno)
232{ 582{
233 struct gendisk *sgp; 583 struct hd_struct *part;
234 char buf[BDEVNAME_SIZE]; 584 struct block_device *bdev = NULL;
235 int n;
236
237 if (dev->type != &disk_type)
238 goto exit;
239 585
240 sgp = dev_to_disk(dev); 586 part = disk_get_part(disk, partno);
241 /* 587 if (part)
242 * Don't show empty devices or things that have been surpressed 588 bdev = bdget(part_devt(part));
243 */ 589 disk_put_part(part);
244 if (get_capacity(sgp) == 0 ||
245 (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
246 goto exit;
247 590
248 /* 591 return bdev;
249 * Note, unlike /proc/partitions, I am showing the numbers in
250 * hex - the same format as the root= option takes.
251 */
252 printk("%02x%02x %10llu %s",
253 sgp->major, sgp->first_minor,
254 (unsigned long long)get_capacity(sgp) >> 1,
255 disk_name(sgp, 0, buf));
256 if (sgp->driverfs_dev != NULL &&
257 sgp->driverfs_dev->driver != NULL)
258 printk(" driver: %s\n",
259 sgp->driverfs_dev->driver->name);
260 else
261 printk(" (driver?)\n");
262
263 /* now show the partitions */
264 for (n = 0; n < sgp->minors - 1; ++n) {
265 if (sgp->part[n] == NULL)
266 goto exit;
267 if (sgp->part[n]->nr_sects == 0)
268 goto exit;
269 printk(" %02x%02x %10llu %s\n",
270 sgp->major, n + 1 + sgp->first_minor,
271 (unsigned long long)sgp->part[n]->nr_sects >> 1,
272 disk_name(sgp, n + 1, buf));
273 }
274exit:
275 return 0;
276} 592}
593EXPORT_SYMBOL(bdget_disk);
277 594
278/* 595/*
279 * print a full list of all partitions - intended for places where the root 596 * print a full list of all partitions - intended for places where the root
@@ -282,120 +599,145 @@ exit:
282 */ 599 */
283void __init printk_all_partitions(void) 600void __init printk_all_partitions(void)
284{ 601{
285 mutex_lock(&block_class_lock); 602 struct class_dev_iter iter;
286 class_for_each_device(&block_class, NULL, NULL, printk_partition); 603 struct device *dev;
287 mutex_unlock(&block_class_lock); 604
605 class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
606 while ((dev = class_dev_iter_next(&iter))) {
607 struct gendisk *disk = dev_to_disk(dev);
608 struct disk_part_iter piter;
609 struct hd_struct *part;
610 char name_buf[BDEVNAME_SIZE];
611 char devt_buf[BDEVT_SIZE];
612
613 /*
614 * Don't show empty devices or things that have been
 615 * suppressed
616 */
617 if (get_capacity(disk) == 0 ||
618 (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
619 continue;
620
621 /*
622 * Note, unlike /proc/partitions, I am showing the
623 * numbers in hex - the same format as the root=
624 * option takes.
625 */
626 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
627 while ((part = disk_part_iter_next(&piter))) {
628 bool is_part0 = part == &disk->part0;
629
630 printk("%s%s %10llu %s", is_part0 ? "" : " ",
631 bdevt_str(part_devt(part), devt_buf),
632 (unsigned long long)part->nr_sects >> 1,
633 disk_name(disk, part->partno, name_buf));
634 if (is_part0) {
635 if (disk->driverfs_dev != NULL &&
636 disk->driverfs_dev->driver != NULL)
637 printk(" driver: %s\n",
638 disk->driverfs_dev->driver->name);
639 else
640 printk(" (driver?)\n");
641 } else
642 printk("\n");
643 }
644 disk_part_iter_exit(&piter);
645 }
646 class_dev_iter_exit(&iter);
288} 647}
289 648
290#ifdef CONFIG_PROC_FS 649#ifdef CONFIG_PROC_FS
291/* iterator */ 650/* iterator */
292static int find_start(struct device *dev, void *data) 651static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
293{ 652{
294 loff_t *k = data; 653 loff_t skip = *pos;
654 struct class_dev_iter *iter;
655 struct device *dev;
295 656
296 if (dev->type != &disk_type) 657 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
297 return 0; 658 if (!iter)
298 if (!*k) 659 return ERR_PTR(-ENOMEM);
299 return 1; 660
300 (*k)--; 661 seqf->private = iter;
301 return 0; 662 class_dev_iter_init(iter, &block_class, NULL, &disk_type);
663 do {
664 dev = class_dev_iter_next(iter);
665 if (!dev)
666 return NULL;
667 } while (skip--);
668
669 return dev_to_disk(dev);
302} 670}
303 671
304static void *part_start(struct seq_file *part, loff_t *pos) 672static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
305{ 673{
306 struct device *dev; 674 struct device *dev;
307 loff_t k = *pos;
308
309 if (!k)
310 part->private = (void *)1LU; /* tell show to print header */
311 675
312 mutex_lock(&block_class_lock); 676 (*pos)++;
313 dev = class_find_device(&block_class, NULL, &k, find_start); 677 dev = class_dev_iter_next(seqf->private);
314 if (dev) { 678 if (dev)
315 put_device(dev);
316 return dev_to_disk(dev); 679 return dev_to_disk(dev);
317 } 680
318 return NULL; 681 return NULL;
319} 682}
320 683
321static int find_next(struct device *dev, void *data) 684static void disk_seqf_stop(struct seq_file *seqf, void *v)
322{ 685{
323 if (dev->type == &disk_type) 686 struct class_dev_iter *iter = seqf->private;
324 return 1;
325 return 0;
326}
327 687
328static void *part_next(struct seq_file *part, void *v, loff_t *pos) 688 /* stop is called even after start failed :-( */
329{ 689 if (iter) {
330 struct gendisk *gp = v; 690 class_dev_iter_exit(iter);
331 struct device *dev; 691 kfree(iter);
332 ++*pos;
333 dev = class_find_device(&block_class, &gp->dev, NULL, find_next);
334 if (dev) {
335 put_device(dev);
336 return dev_to_disk(dev);
337 } 692 }
338 return NULL;
339} 693}
340 694
341static void part_stop(struct seq_file *part, void *v) 695static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
342{ 696{
343 mutex_unlock(&block_class_lock); 697 static void *p;
698
699 p = disk_seqf_start(seqf, pos);
700 if (!IS_ERR(p) && p && !*pos)
701 seq_puts(seqf, "major minor #blocks name\n\n");
702 return p;
344} 703}
345 704
346static int show_partition(struct seq_file *part, void *v) 705static int show_partition(struct seq_file *seqf, void *v)
347{ 706{
348 struct gendisk *sgp = v; 707 struct gendisk *sgp = v;
349 int n; 708 struct disk_part_iter piter;
709 struct hd_struct *part;
350 char buf[BDEVNAME_SIZE]; 710 char buf[BDEVNAME_SIZE];
351 711
352 /*
353 * Print header if start told us to do. This is to preserve
354 * the original behavior of not printing header if no
355 * partition exists. This hackery will be removed later with
356 * class iteration clean up.
357 */
358 if (part->private) {
359 seq_puts(part, "major minor #blocks name\n\n");
360 part->private = NULL;
361 }
362
363 /* Don't show non-partitionable removeable devices or empty devices */ 712 /* Don't show non-partitionable removeable devices or empty devices */
364 if (!get_capacity(sgp) || 713 if (!get_capacity(sgp) || (!disk_partitionable(sgp) &&
365 (sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE))) 714 (sgp->flags & GENHD_FL_REMOVABLE)))
366 return 0; 715 return 0;
367 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) 716 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
368 return 0; 717 return 0;
369 718
370 /* show the full disk and all non-0 size partitions of it */ 719 /* show the full disk and all non-0 size partitions of it */
371 seq_printf(part, "%4d %4d %10llu %s\n", 720 disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
372 sgp->major, sgp->first_minor, 721 while ((part = disk_part_iter_next(&piter)))
373 (unsigned long long)get_capacity(sgp) >> 1, 722 seq_printf(seqf, "%4d %7d %10llu %s\n",
374 disk_name(sgp, 0, buf)); 723 MAJOR(part_devt(part)), MINOR(part_devt(part)),
375 for (n = 0; n < sgp->minors - 1; n++) { 724 (unsigned long long)part->nr_sects >> 1,
376 if (!sgp->part[n]) 725 disk_name(sgp, part->partno, buf));
377 continue; 726 disk_part_iter_exit(&piter);
378 if (sgp->part[n]->nr_sects == 0)
379 continue;
380 seq_printf(part, "%4d %4d %10llu %s\n",
381 sgp->major, n + 1 + sgp->first_minor,
382 (unsigned long long)sgp->part[n]->nr_sects >> 1 ,
383 disk_name(sgp, n + 1, buf));
384 }
385 727
386 return 0; 728 return 0;
387} 729}
388 730
389const struct seq_operations partitions_op = { 731const struct seq_operations partitions_op = {
390 .start = part_start, 732 .start = show_partition_start,
391 .next = part_next, 733 .next = disk_seqf_next,
392 .stop = part_stop, 734 .stop = disk_seqf_stop,
393 .show = show_partition 735 .show = show_partition
394}; 736};
395#endif 737#endif
396 738
397 739
398static struct kobject *base_probe(dev_t devt, int *part, void *data) 740static struct kobject *base_probe(dev_t devt, int *partno, void *data)
399{ 741{
400 if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) 742 if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
401 /* Make old-style 2.4 aliases work */ 743 /* Make old-style 2.4 aliases work */
@@ -431,29 +773,29 @@ static ssize_t disk_range_show(struct device *dev,
431 return sprintf(buf, "%d\n", disk->minors); 773 return sprintf(buf, "%d\n", disk->minors);
432} 774}
433 775
434static ssize_t disk_removable_show(struct device *dev, 776static ssize_t disk_ext_range_show(struct device *dev,
435 struct device_attribute *attr, char *buf) 777 struct device_attribute *attr, char *buf)
436{ 778{
437 struct gendisk *disk = dev_to_disk(dev); 779 struct gendisk *disk = dev_to_disk(dev);
438 780
439 return sprintf(buf, "%d\n", 781 return sprintf(buf, "%d\n", disk_max_parts(disk));
440 (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
441} 782}
442 783
443static ssize_t disk_ro_show(struct device *dev, 784static ssize_t disk_removable_show(struct device *dev,
444 struct device_attribute *attr, char *buf) 785 struct device_attribute *attr, char *buf)
445{ 786{
446 struct gendisk *disk = dev_to_disk(dev); 787 struct gendisk *disk = dev_to_disk(dev);
447 788
448 return sprintf(buf, "%d\n", disk->policy ? 1 : 0); 789 return sprintf(buf, "%d\n",
790 (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
449} 791}
450 792
451static ssize_t disk_size_show(struct device *dev, 793static ssize_t disk_ro_show(struct device *dev,
452 struct device_attribute *attr, char *buf) 794 struct device_attribute *attr, char *buf)
453{ 795{
454 struct gendisk *disk = dev_to_disk(dev); 796 struct gendisk *disk = dev_to_disk(dev);
455 797
456 return sprintf(buf, "%llu\n", (unsigned long long)get_capacity(disk)); 798 return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
457} 799}
458 800
459static ssize_t disk_capability_show(struct device *dev, 801static ssize_t disk_capability_show(struct device *dev,
@@ -464,73 +806,26 @@ static ssize_t disk_capability_show(struct device *dev,
464 return sprintf(buf, "%x\n", disk->flags); 806 return sprintf(buf, "%x\n", disk->flags);
465} 807}
466 808
467static ssize_t disk_stat_show(struct device *dev,
468 struct device_attribute *attr, char *buf)
469{
470 struct gendisk *disk = dev_to_disk(dev);
471
472 preempt_disable();
473 disk_round_stats(disk);
474 preempt_enable();
475 return sprintf(buf,
476 "%8lu %8lu %8llu %8u "
477 "%8lu %8lu %8llu %8u "
478 "%8u %8u %8u"
479 "\n",
480 disk_stat_read(disk, ios[READ]),
481 disk_stat_read(disk, merges[READ]),
482 (unsigned long long)disk_stat_read(disk, sectors[READ]),
483 jiffies_to_msecs(disk_stat_read(disk, ticks[READ])),
484 disk_stat_read(disk, ios[WRITE]),
485 disk_stat_read(disk, merges[WRITE]),
486 (unsigned long long)disk_stat_read(disk, sectors[WRITE]),
487 jiffies_to_msecs(disk_stat_read(disk, ticks[WRITE])),
488 disk->in_flight,
489 jiffies_to_msecs(disk_stat_read(disk, io_ticks)),
490 jiffies_to_msecs(disk_stat_read(disk, time_in_queue)));
491}
492
493#ifdef CONFIG_FAIL_MAKE_REQUEST
494static ssize_t disk_fail_show(struct device *dev,
495 struct device_attribute *attr, char *buf)
496{
497 struct gendisk *disk = dev_to_disk(dev);
498
499 return sprintf(buf, "%d\n", disk->flags & GENHD_FL_FAIL ? 1 : 0);
500}
501
502static ssize_t disk_fail_store(struct device *dev,
503 struct device_attribute *attr,
504 const char *buf, size_t count)
505{
506 struct gendisk *disk = dev_to_disk(dev);
507 int i;
508
509 if (count > 0 && sscanf(buf, "%d", &i) > 0) {
510 if (i == 0)
511 disk->flags &= ~GENHD_FL_FAIL;
512 else
513 disk->flags |= GENHD_FL_FAIL;
514 }
515
516 return count;
517}
518
519#endif
520
521static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); 809static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
810static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
522static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); 811static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
523static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); 812static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
524static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL); 813static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
525static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); 814static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
526static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL); 815static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
527#ifdef CONFIG_FAIL_MAKE_REQUEST 816#ifdef CONFIG_FAIL_MAKE_REQUEST
528static struct device_attribute dev_attr_fail = 817static struct device_attribute dev_attr_fail =
529 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, disk_fail_show, disk_fail_store); 818 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
819#endif
820#ifdef CONFIG_FAIL_IO_TIMEOUT
821static struct device_attribute dev_attr_fail_timeout =
822 __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show,
823 part_timeout_store);
530#endif 824#endif
531 825
532static struct attribute *disk_attrs[] = { 826static struct attribute *disk_attrs[] = {
533 &dev_attr_range.attr, 827 &dev_attr_range.attr,
828 &dev_attr_ext_range.attr,
534 &dev_attr_removable.attr, 829 &dev_attr_removable.attr,
535 &dev_attr_ro.attr, 830 &dev_attr_ro.attr,
536 &dev_attr_size.attr, 831 &dev_attr_size.attr,
@@ -539,6 +834,9 @@ static struct attribute *disk_attrs[] = {
539#ifdef CONFIG_FAIL_MAKE_REQUEST 834#ifdef CONFIG_FAIL_MAKE_REQUEST
540 &dev_attr_fail.attr, 835 &dev_attr_fail.attr,
541#endif 836#endif
837#ifdef CONFIG_FAIL_IO_TIMEOUT
838 &dev_attr_fail_timeout.attr,
839#endif
542 NULL 840 NULL
543}; 841};
544 842
@@ -551,13 +849,87 @@ static struct attribute_group *disk_attr_groups[] = {
551 NULL 849 NULL
552}; 850};
553 851
852static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
853{
854 struct disk_part_tbl *ptbl =
855 container_of(head, struct disk_part_tbl, rcu_head);
856
857 kfree(ptbl);
858}
859
860/**
861 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
862 * @disk: disk to replace part_tbl for
863 * @new_ptbl: new part_tbl to install
864 *
865 * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The
866 * original ptbl is freed using RCU callback.
867 *
868 * LOCKING:
 869 * Matching bd_mutex locked.
870 */
871static void disk_replace_part_tbl(struct gendisk *disk,
872 struct disk_part_tbl *new_ptbl)
873{
874 struct disk_part_tbl *old_ptbl = disk->part_tbl;
875
876 rcu_assign_pointer(disk->part_tbl, new_ptbl);
877 if (old_ptbl)
878 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
879}
880
881/**
882 * disk_expand_part_tbl - expand disk->part_tbl
883 * @disk: disk to expand part_tbl for
884 * @partno: expand such that this partno can fit in
885 *
886 * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl
887 * uses RCU to allow unlocked dereferencing for stats and other stuff.
888 *
889 * LOCKING:
890 * Matching bd_mutex locked, might sleep.
891 *
892 * RETURNS:
893 * 0 on success, -errno on failure.
894 */
895int disk_expand_part_tbl(struct gendisk *disk, int partno)
896{
897 struct disk_part_tbl *old_ptbl = disk->part_tbl;
898 struct disk_part_tbl *new_ptbl;
899 int len = old_ptbl ? old_ptbl->len : 0;
900 int target = partno + 1;
901 size_t size;
902 int i;
903
904 /* disk_max_parts() is zero during initialization, ignore if so */
905 if (disk_max_parts(disk) && target > disk_max_parts(disk))
906 return -EINVAL;
907
908 if (target <= len)
909 return 0;
910
911 size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
912 new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
913 if (!new_ptbl)
914 return -ENOMEM;
915
916 INIT_RCU_HEAD(&new_ptbl->rcu_head);
917 new_ptbl->len = target;
918
919 for (i = 0; i < len; i++)
920 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
921
922 disk_replace_part_tbl(disk, new_ptbl);
923 return 0;
924}
925
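
disk_expand_part_tbl() follows the standard RCU publish pattern: build a complete new table, switch readers over with rcu_assign_pointer(), and free the old copy from an RCU callback so that lockless readers such as disk_map_sector_rcu() never observe a half-populated table. A hedged sketch of that pattern on a generic table, with made-up names (kernel context; assumes <linux/rcupdate.h> and <linux/slab.h>):

struct table {
	struct rcu_head rcu_head;
	int len;
	void *slot[];
};

static void table_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct table, rcu_head));
}

/* writer side; the caller holds whatever mutex serialises updates */
static void publish_table(struct table **slotp, struct table *new_tbl)
{
	struct table *old = *slotp;

	rcu_assign_pointer(*slotp, new_tbl);	/* readers switch atomically */
	if (old)
		call_rcu(&old->rcu_head, table_free_rcu); /* freed after a grace period */
}
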
554static void disk_release(struct device *dev) 926static void disk_release(struct device *dev)
555{ 927{
556 struct gendisk *disk = dev_to_disk(dev); 928 struct gendisk *disk = dev_to_disk(dev);
557 929
558 kfree(disk->random); 930 kfree(disk->random);
559 kfree(disk->part); 931 disk_replace_part_tbl(disk, NULL);
560 free_disk_stats(disk); 932 free_part_stats(&disk->part0);
561 kfree(disk); 933 kfree(disk);
562} 934}
563struct class block_class = { 935struct class block_class = {
@@ -578,83 +950,31 @@ static struct device_type disk_type = {
578 * The output looks suspiciously like /proc/partitions with a bunch of 950 * The output looks suspiciously like /proc/partitions with a bunch of
579 * extra fields. 951 * extra fields.
580 */ 952 */
581 953static int diskstats_show(struct seq_file *seqf, void *v)
582static void *diskstats_start(struct seq_file *part, loff_t *pos)
583{
584 struct device *dev;
585 loff_t k = *pos;
586
587 mutex_lock(&block_class_lock);
588 dev = class_find_device(&block_class, NULL, &k, find_start);
589 if (dev) {
590 put_device(dev);
591 return dev_to_disk(dev);
592 }
593 return NULL;
594}
595
596static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
597{
598 struct gendisk *gp = v;
599 struct device *dev;
600
601 ++*pos;
602 dev = class_find_device(&block_class, &gp->dev, NULL, find_next);
603 if (dev) {
604 put_device(dev);
605 return dev_to_disk(dev);
606 }
607 return NULL;
608}
609
610static void diskstats_stop(struct seq_file *part, void *v)
611{
612 mutex_unlock(&block_class_lock);
613}
614
615static int diskstats_show(struct seq_file *s, void *v)
616{ 954{
617 struct gendisk *gp = v; 955 struct gendisk *gp = v;
956 struct disk_part_iter piter;
957 struct hd_struct *hd;
618 char buf[BDEVNAME_SIZE]; 958 char buf[BDEVNAME_SIZE];
619 int n = 0; 959 int cpu;
620 960
621 /* 961 /*
622 if (&gp->dev.kobj.entry == block_class.devices.next) 962 if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
623 seq_puts(s, "major minor name" 963 seq_puts(seqf, "major minor name"
624 " rio rmerge rsect ruse wio wmerge " 964 " rio rmerge rsect ruse wio wmerge "
625 "wsect wuse running use aveq" 965 "wsect wuse running use aveq"
626 "\n\n"); 966 "\n\n");
627 */ 967 */
628 968
629 preempt_disable(); 969 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0);
630 disk_round_stats(gp); 970 while ((hd = disk_part_iter_next(&piter))) {
631 preempt_enable(); 971 cpu = part_stat_lock();
632 seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", 972 part_round_stats(cpu, hd);
633 gp->major, n + gp->first_minor, disk_name(gp, n, buf), 973 part_stat_unlock();
634 disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), 974 seq_printf(seqf, "%4d %7d %s %lu %lu %llu "
635 (unsigned long long)disk_stat_read(gp, sectors[0]),
636 jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
637 disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]),
638 (unsigned long long)disk_stat_read(gp, sectors[1]),
639 jiffies_to_msecs(disk_stat_read(gp, ticks[1])),
640 gp->in_flight,
641 jiffies_to_msecs(disk_stat_read(gp, io_ticks)),
642 jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
643
644 /* now show all non-0 size partitions of it */
645 for (n = 0; n < gp->minors - 1; n++) {
646 struct hd_struct *hd = gp->part[n];
647
648 if (!hd || !hd->nr_sects)
649 continue;
650
651 preempt_disable();
652 part_round_stats(hd);
653 preempt_enable();
654 seq_printf(s, "%4d %4d %s %lu %lu %llu "
655 "%u %lu %lu %llu %u %u %u %u\n", 975 "%u %lu %lu %llu %u %u %u %u\n",
656 gp->major, n + gp->first_minor + 1, 976 MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
657 disk_name(gp, n + 1, buf), 977 disk_name(gp, hd->partno, buf),
658 part_stat_read(hd, ios[0]), 978 part_stat_read(hd, ios[0]),
659 part_stat_read(hd, merges[0]), 979 part_stat_read(hd, merges[0]),
660 (unsigned long long)part_stat_read(hd, sectors[0]), 980 (unsigned long long)part_stat_read(hd, sectors[0]),
@@ -668,14 +988,15 @@ static int diskstats_show(struct seq_file *s, void *v)
668 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 988 jiffies_to_msecs(part_stat_read(hd, time_in_queue))
669 ); 989 );
670 } 990 }
991 disk_part_iter_exit(&piter);
671 992
672 return 0; 993 return 0;
673} 994}
674 995
675const struct seq_operations diskstats_op = { 996const struct seq_operations diskstats_op = {
676 .start = diskstats_start, 997 .start = disk_seqf_start,
677 .next = diskstats_next, 998 .next = disk_seqf_next,
678 .stop = diskstats_stop, 999 .stop = disk_seqf_stop,
679 .show = diskstats_show 1000 .show = diskstats_show
680}; 1001};
681#endif /* CONFIG_PROC_FS */ 1002#endif /* CONFIG_PROC_FS */
@@ -690,7 +1011,7 @@ static void media_change_notify_thread(struct work_struct *work)
690 * set enviroment vars to indicate which event this is for 1011 * set enviroment vars to indicate which event this is for
691 * so that user space will know to go check the media status. 1012 * so that user space will know to go check the media status.
692 */ 1013 */
693 kobject_uevent_env(&gd->dev.kobj, KOBJ_CHANGE, envp); 1014 kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
694 put_device(gd->driverfs_dev); 1015 put_device(gd->driverfs_dev);
695} 1016}
696 1017
@@ -703,42 +1024,29 @@ void genhd_media_change_notify(struct gendisk *disk)
703EXPORT_SYMBOL_GPL(genhd_media_change_notify); 1024EXPORT_SYMBOL_GPL(genhd_media_change_notify);
704#endif /* 0 */ 1025#endif /* 0 */
705 1026
706struct find_block { 1027dev_t blk_lookup_devt(const char *name, int partno)
707 const char *name;
708 int part;
709};
710
711static int match_id(struct device *dev, void *data)
712{ 1028{
713 struct find_block *find = data; 1029 dev_t devt = MKDEV(0, 0);
1030 struct class_dev_iter iter;
1031 struct device *dev;
714 1032
715 if (dev->type != &disk_type) 1033 class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
716 return 0; 1034 while ((dev = class_dev_iter_next(&iter))) {
717 if (strcmp(dev->bus_id, find->name) == 0) {
718 struct gendisk *disk = dev_to_disk(dev); 1035 struct gendisk *disk = dev_to_disk(dev);
719 if (find->part < disk->minors) 1036 struct hd_struct *part;
720 return 1;
721 }
722 return 0;
723}
724 1037
725dev_t blk_lookup_devt(const char *name, int part) 1038 if (strcmp(dev->bus_id, name))
726{ 1039 continue;
727 struct device *dev;
728 dev_t devt = MKDEV(0, 0);
729 struct find_block find;
730 1040
731 mutex_lock(&block_class_lock); 1041 part = disk_get_part(disk, partno);
732 find.name = name; 1042 if (part) {
733 find.part = part; 1043 devt = part_devt(part);
734 dev = class_find_device(&block_class, NULL, &find, match_id); 1044 disk_put_part(part);
735 if (dev) { 1045 break;
736 put_device(dev); 1046 }
737 devt = MKDEV(MAJOR(dev->devt), 1047 disk_put_part(part);
738 MINOR(dev->devt) + part);
739 } 1048 }
740 mutex_unlock(&block_class_lock); 1049 class_dev_iter_exit(&iter);
741
742 return devt; 1050 return devt;
743} 1051}
744EXPORT_SYMBOL(blk_lookup_devt); 1052EXPORT_SYMBOL(blk_lookup_devt);
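
After the rewrite, blk_lookup_devt() resolves a disk name plus partition number directly to that partition's dev_t, so it also works for partitions living on the extended major. A hedged usage sketch in the style of an early-boot name lookup (the helper name is illustrative):

static dev_t devt_for(const char *diskname, int partno)
{
	/* e.g. devt_for("sda", 1) yields the dev_t of sda1,
	 * or MKDEV(0, 0) if no such disk/partition is registered */
	return blk_lookup_devt(diskname, partno);
}
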
@@ -747,6 +1055,7 @@ struct gendisk *alloc_disk(int minors)
747{ 1055{
748 return alloc_disk_node(minors, -1); 1056 return alloc_disk_node(minors, -1);
749} 1057}
1058EXPORT_SYMBOL(alloc_disk);
750 1059
751struct gendisk *alloc_disk_node(int minors, int node_id) 1060struct gendisk *alloc_disk_node(int minors, int node_id)
752{ 1061{
@@ -755,32 +1064,28 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
755 disk = kmalloc_node(sizeof(struct gendisk), 1064 disk = kmalloc_node(sizeof(struct gendisk),
756 GFP_KERNEL | __GFP_ZERO, node_id); 1065 GFP_KERNEL | __GFP_ZERO, node_id);
757 if (disk) { 1066 if (disk) {
758 if (!init_disk_stats(disk)) { 1067 if (!init_part_stats(&disk->part0)) {
759 kfree(disk); 1068 kfree(disk);
760 return NULL; 1069 return NULL;
761 } 1070 }
762 if (minors > 1) { 1071 if (disk_expand_part_tbl(disk, 0)) {
763 int size = (minors - 1) * sizeof(struct hd_struct *); 1072 free_part_stats(&disk->part0);
764 disk->part = kmalloc_node(size, 1073 kfree(disk);
765 GFP_KERNEL | __GFP_ZERO, node_id); 1074 return NULL;
766 if (!disk->part) {
767 free_disk_stats(disk);
768 kfree(disk);
769 return NULL;
770 }
771 } 1075 }
1076 disk->part_tbl->part[0] = &disk->part0;
1077
772 disk->minors = minors; 1078 disk->minors = minors;
773 rand_initialize_disk(disk); 1079 rand_initialize_disk(disk);
774 disk->dev.class = &block_class; 1080 disk_to_dev(disk)->class = &block_class;
775 disk->dev.type = &disk_type; 1081 disk_to_dev(disk)->type = &disk_type;
776 device_initialize(&disk->dev); 1082 device_initialize(disk_to_dev(disk));
777 INIT_WORK(&disk->async_notify, 1083 INIT_WORK(&disk->async_notify,
778 media_change_notify_thread); 1084 media_change_notify_thread);
1085 disk->node_id = node_id;
779 } 1086 }
780 return disk; 1087 return disk;
781} 1088}
782
783EXPORT_SYMBOL(alloc_disk);
784EXPORT_SYMBOL(alloc_disk_node); 1089EXPORT_SYMBOL(alloc_disk_node);
785 1090
786struct kobject *get_disk(struct gendisk *disk) 1091struct kobject *get_disk(struct gendisk *disk)
@@ -793,7 +1098,7 @@ struct kobject *get_disk(struct gendisk *disk)
793 owner = disk->fops->owner; 1098 owner = disk->fops->owner;
794 if (owner && !try_module_get(owner)) 1099 if (owner && !try_module_get(owner))
795 return NULL; 1100 return NULL;
796 kobj = kobject_get(&disk->dev.kobj); 1101 kobj = kobject_get(&disk_to_dev(disk)->kobj);
797 if (kobj == NULL) { 1102 if (kobj == NULL) {
798 module_put(owner); 1103 module_put(owner);
799 return NULL; 1104 return NULL;
@@ -807,27 +1112,28 @@ EXPORT_SYMBOL(get_disk);
807void put_disk(struct gendisk *disk) 1112void put_disk(struct gendisk *disk)
808{ 1113{
809 if (disk) 1114 if (disk)
810 kobject_put(&disk->dev.kobj); 1115 kobject_put(&disk_to_dev(disk)->kobj);
811} 1116}
812 1117
813EXPORT_SYMBOL(put_disk); 1118EXPORT_SYMBOL(put_disk);
814 1119
815void set_device_ro(struct block_device *bdev, int flag) 1120void set_device_ro(struct block_device *bdev, int flag)
816{ 1121{
817 if (bdev->bd_contains != bdev) 1122 bdev->bd_part->policy = flag;
818 bdev->bd_part->policy = flag;
819 else
820 bdev->bd_disk->policy = flag;
821} 1123}
822 1124
823EXPORT_SYMBOL(set_device_ro); 1125EXPORT_SYMBOL(set_device_ro);
824 1126
825void set_disk_ro(struct gendisk *disk, int flag) 1127void set_disk_ro(struct gendisk *disk, int flag)
826{ 1128{
827 int i; 1129 struct disk_part_iter piter;
828 disk->policy = flag; 1130 struct hd_struct *part;
829 for (i = 0; i < disk->minors - 1; i++) 1131
830 if (disk->part[i]) disk->part[i]->policy = flag; 1132 disk_part_iter_init(&piter, disk,
1133 DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0);
1134 while ((part = disk_part_iter_next(&piter)))
1135 part->policy = flag;
1136 disk_part_iter_exit(&piter);
831} 1137}
832 1138
833EXPORT_SYMBOL(set_disk_ro); 1139EXPORT_SYMBOL(set_disk_ro);
@@ -836,18 +1142,15 @@ int bdev_read_only(struct block_device *bdev)
836{ 1142{
837 if (!bdev) 1143 if (!bdev)
838 return 0; 1144 return 0;
839 else if (bdev->bd_contains != bdev) 1145 return bdev->bd_part->policy;
840 return bdev->bd_part->policy;
841 else
842 return bdev->bd_disk->policy;
843} 1146}
844 1147
845EXPORT_SYMBOL(bdev_read_only); 1148EXPORT_SYMBOL(bdev_read_only);
846 1149
847int invalidate_partition(struct gendisk *disk, int index) 1150int invalidate_partition(struct gendisk *disk, int partno)
848{ 1151{
849 int res = 0; 1152 int res = 0;
850 struct block_device *bdev = bdget_disk(disk, index); 1153 struct block_device *bdev = bdget_disk(disk, partno);
851 if (bdev) { 1154 if (bdev) {
852 fsync_bdev(bdev); 1155 fsync_bdev(bdev);
853 res = __invalidate_device(bdev); 1156 res = __invalidate_device(bdev);
diff --git a/block/ioctl.c b/block/ioctl.c
index 77185e5c026a..38bee321e1fa 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -12,11 +12,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
12{ 12{
13 struct block_device *bdevp; 13 struct block_device *bdevp;
14 struct gendisk *disk; 14 struct gendisk *disk;
15 struct hd_struct *part;
15 struct blkpg_ioctl_arg a; 16 struct blkpg_ioctl_arg a;
16 struct blkpg_partition p; 17 struct blkpg_partition p;
18 struct disk_part_iter piter;
17 long long start, length; 19 long long start, length;
18 int part; 20 int partno;
19 int i;
20 int err; 21 int err;
21 22
22 if (!capable(CAP_SYS_ADMIN)) 23 if (!capable(CAP_SYS_ADMIN))
@@ -28,8 +29,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
28 disk = bdev->bd_disk; 29 disk = bdev->bd_disk;
29 if (bdev != bdev->bd_contains) 30 if (bdev != bdev->bd_contains)
30 return -EINVAL; 31 return -EINVAL;
31 part = p.pno; 32 partno = p.pno;
32 if (part <= 0 || part >= disk->minors) 33 if (partno <= 0)
33 return -EINVAL; 34 return -EINVAL;
34 switch (a.op) { 35 switch (a.op) {
35 case BLKPG_ADD_PARTITION: 36 case BLKPG_ADD_PARTITION:
@@ -43,36 +44,37 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
43 || pstart < 0 || plength < 0) 44 || pstart < 0 || plength < 0)
44 return -EINVAL; 45 return -EINVAL;
45 } 46 }
46 /* partition number in use? */ 47
47 mutex_lock(&bdev->bd_mutex); 48 mutex_lock(&bdev->bd_mutex);
48 if (disk->part[part - 1]) {
49 mutex_unlock(&bdev->bd_mutex);
50 return -EBUSY;
51 }
52 /* overlap? */
53 for (i = 0; i < disk->minors - 1; i++) {
54 struct hd_struct *s = disk->part[i];
55 49
56 if (!s) 50 /* overlap? */
57 continue; 51 disk_part_iter_init(&piter, disk,
58 if (!(start+length <= s->start_sect || 52 DISK_PITER_INCL_EMPTY);
59 start >= s->start_sect + s->nr_sects)) { 53 while ((part = disk_part_iter_next(&piter))) {
54 if (!(start + length <= part->start_sect ||
55 start >= part->start_sect + part->nr_sects)) {
56 disk_part_iter_exit(&piter);
60 mutex_unlock(&bdev->bd_mutex); 57 mutex_unlock(&bdev->bd_mutex);
61 return -EBUSY; 58 return -EBUSY;
62 } 59 }
63 } 60 }
61 disk_part_iter_exit(&piter);
62
64 /* all seems OK */ 63 /* all seems OK */
65 err = add_partition(disk, part, start, length, ADDPART_FLAG_NONE); 64 err = add_partition(disk, partno, start, length,
65 ADDPART_FLAG_NONE);
66 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
67 return err; 67 return err;
68 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION:
69 if (!disk->part[part-1]) 69 part = disk_get_part(disk, partno);
70 return -ENXIO; 70 if (!part)
71 if (disk->part[part - 1]->nr_sects == 0)
72 return -ENXIO; 71 return -ENXIO;
73 bdevp = bdget_disk(disk, part); 72
73 bdevp = bdget(part_devt(part));
74 disk_put_part(part);
74 if (!bdevp) 75 if (!bdevp)
75 return -ENOMEM; 76 return -ENOMEM;
77
76 mutex_lock(&bdevp->bd_mutex); 78 mutex_lock(&bdevp->bd_mutex);
77 if (bdevp->bd_openers) { 79 if (bdevp->bd_openers) {
78 mutex_unlock(&bdevp->bd_mutex); 80 mutex_unlock(&bdevp->bd_mutex);
@@ -84,7 +86,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
84 invalidate_bdev(bdevp); 86 invalidate_bdev(bdevp);
85 87
86 mutex_lock_nested(&bdev->bd_mutex, 1); 88 mutex_lock_nested(&bdev->bd_mutex, 1);
87 delete_partition(disk, part); 89 delete_partition(disk, partno);
88 mutex_unlock(&bdev->bd_mutex); 90 mutex_unlock(&bdev->bd_mutex);
89 mutex_unlock(&bdevp->bd_mutex); 91 mutex_unlock(&bdevp->bd_mutex);
90 bdput(bdevp); 92 bdput(bdevp);
@@ -100,7 +102,7 @@ static int blkdev_reread_part(struct block_device *bdev)
100 struct gendisk *disk = bdev->bd_disk; 102 struct gendisk *disk = bdev->bd_disk;
101 int res; 103 int res;
102 104
103 if (disk->minors == 1 || bdev != bdev->bd_contains) 105 if (!disk_partitionable(disk) || bdev != bdev->bd_contains)
104 return -EINVAL; 106 return -EINVAL;
105 if (!capable(CAP_SYS_ADMIN)) 107 if (!capable(CAP_SYS_ADMIN))
106 return -EACCES; 108 return -EACCES;
@@ -111,6 +113,69 @@ static int blkdev_reread_part(struct block_device *bdev)
111 return res; 113 return res;
112} 114}
113 115
116static void blk_ioc_discard_endio(struct bio *bio, int err)
117{
118 if (err) {
119 if (err == -EOPNOTSUPP)
120 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
121 clear_bit(BIO_UPTODATE, &bio->bi_flags);
122 }
123 complete(bio->bi_private);
124}
125
126static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
127 uint64_t len)
128{
129 struct request_queue *q = bdev_get_queue(bdev);
130 int ret = 0;
131
132 if (start & 511)
133 return -EINVAL;
134 if (len & 511)
135 return -EINVAL;
136 start >>= 9;
137 len >>= 9;
138
139 if (start + len > (bdev->bd_inode->i_size >> 9))
140 return -EINVAL;
141
142 if (!q->prepare_discard_fn)
143 return -EOPNOTSUPP;
144
145 while (len && !ret) {
146 DECLARE_COMPLETION_ONSTACK(wait);
147 struct bio *bio;
148
149 bio = bio_alloc(GFP_KERNEL, 0);
150 if (!bio)
151 return -ENOMEM;
152
153 bio->bi_end_io = blk_ioc_discard_endio;
154 bio->bi_bdev = bdev;
155 bio->bi_private = &wait;
156 bio->bi_sector = start;
157
158 if (len > q->max_hw_sectors) {
159 bio->bi_size = q->max_hw_sectors << 9;
160 len -= q->max_hw_sectors;
161 start += q->max_hw_sectors;
162 } else {
163 bio->bi_size = len << 9;
164 len = 0;
165 }
166 submit_bio(DISCARD_NOBARRIER, bio);
167
168 wait_for_completion(&wait);
169
170 if (bio_flagged(bio, BIO_EOPNOTSUPP))
171 ret = -EOPNOTSUPP;
172 else if (!bio_flagged(bio, BIO_UPTODATE))
173 ret = -EIO;
174 bio_put(bio);
175 }
176 return ret;
177}
178
114static int put_ushort(unsigned long arg, unsigned short val) 179static int put_ushort(unsigned long arg, unsigned short val)
115{ 180{
116 return put_user(val, (unsigned short __user *)arg); 181 return put_user(val, (unsigned short __user *)arg);
@@ -258,6 +323,19 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
258 set_device_ro(bdev, n); 323 set_device_ro(bdev, n);
259 unlock_kernel(); 324 unlock_kernel();
260 return 0; 325 return 0;
326
327 case BLKDISCARD: {
328 uint64_t range[2];
329
330 if (!(file->f_mode & FMODE_WRITE))
331 return -EBADF;
332
333 if (copy_from_user(range, (void __user *)arg, sizeof(range)))
334 return -EFAULT;
335
336 return blk_ioctl_discard(bdev, range[0], range[1]);
337 }
338
261 case HDIO_GETGEO: { 339 case HDIO_GETGEO: {
262 struct hd_geometry geo; 340 struct hd_geometry geo;
263 341
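
The BLKDISCARD path added above expects a two-element array of 64-bit byte values, start and length, both multiples of 512 and within the device, on a descriptor opened for writing; the kernel side splits the range into zero-payload bios capped at max_hw_sectors and returns -EOPNOTSUPP when the queue has no prepare_discard_fn. A minimal user-space sketch, assuming the BLKDISCARD ioctl number is exported by matching kernel headers and using an example device node:

	/* discard.c - issue BLKDISCARD on a byte range; example only */
	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* BLKDISCARD, once headers catch up */

	int main(void)
	{
		uint64_t range[2] = { 0, 1024 * 1024 };	/* start, length in bytes */
		int fd = open("/dev/sdX", O_WRONLY);	/* example device node */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, BLKDISCARD, range) < 0)
			perror("BLKDISCARD");	/* e.g. EOPNOTSUPP without prepare_discard_fn */
		close(fd);
		return 0;
	}
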
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index ec4b7f234626..c34272a348fe 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -185,6 +185,7 @@ void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
185 __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); 185 __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
186 __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); 186 __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
187 __set_bit(GPCMD_SET_STREAMING, filter->write_ok); 187 __set_bit(GPCMD_SET_STREAMING, filter->write_ok);
188 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
188} 189}
189EXPORT_SYMBOL_GPL(blk_set_cmd_filter_defaults); 190EXPORT_SYMBOL_GPL(blk_set_cmd_filter_defaults);
190 191
@@ -313,11 +314,12 @@ static int sg_io(struct file *file, struct request_queue *q,
313 goto out; 314 goto out;
314 } 315 }
315 316
316 ret = blk_rq_map_user_iov(q, rq, iov, hdr->iovec_count, 317 ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count,
317 hdr->dxfer_len); 318 hdr->dxfer_len, GFP_KERNEL);
318 kfree(iov); 319 kfree(iov);
319 } else if (hdr->dxfer_len) 320 } else if (hdr->dxfer_len)
320 ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len); 321 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
322 GFP_KERNEL);
321 323
322 if (ret) 324 if (ret)
323 goto out; 325 goto out;
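
The sg_io() hunk above also tracks the widened mapping helpers: blk_rq_map_user() and blk_rq_map_user_iov() now take a struct rq_map_data pointer (NULL here preserves the old behaviour) plus an explicit gfp mask. A hedged before/after sketch for a caller outside this diff; variable names are illustrative:

	/* before this series */
	ret = blk_rq_map_user(q, rq, ubuf, len);

	/* after: NULL rq_map_data, allocation mask made explicit */
	ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL);
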