author     Linus Torvalds <torvalds@linux-foundation.org>   2009-12-30 15:43:21 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-12-30 15:43:21 -0500
commit     e48b7b66a6531f02f1264c7196f7069a9ce9251a (patch)
tree       d45ce978262e3c32ce8fe460516bb9aae0cc2fb4
parent     5ccf73bb4dc7cc9e1f761202a34de5714164724f (diff)
parent     9bd3f98821a83041e77ee25158b80b535d02d7b4 (diff)
Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block
* 'for-linus' of git://git.kernel.dk/linux-2.6-block:
  block: blk_rq_err_sectors cleanup
  block: Honor the gfp_mask for alloc_page() in blkdev_issue_discard()
  block: Fix incorrect alignment offset reporting and update documentation
  cfq-iosched: don't regard requests with long distance as close
  aoe: switch to the new bio_flush_dcache_pages() interface
  drivers/block/mg_disk.c: use resource_size()
  drivers/block/DAC960.c: use DAC960_V2_Controller
  block: Fix topology stacking for data and discard alignment
  drbd: remove unused #include <linux/version.h>
  drbd: remove duplicated #include
  drbd: Fix test of unsigned in _drbd_fault_random()
  drbd: Constify struct file_operations
  cfq-iosched: Remove prio_change logic for workload selection
  cfq-iosched: Get rid of nr_groups
  cfq-iosched: Remove the check for same cfq group from allow_merge
  drbd: fix test of unsigned in _drbd_fault_random()
  block: remove Documentation/block/as-iosched.txt
-rw-r--r--  Documentation/block/00-INDEX          2
-rw-r--r--  Documentation/block/as-iosched.txt  172
-rw-r--r--  block/blk-barrier.c                   2
-rw-r--r--  block/blk-settings.c                121
-rw-r--r--  block/cfq-iosched.c                  67
-rw-r--r--  drivers/block/DAC960.c                2
-rw-r--r--  drivers/block/aoe/aoecmd.c           17
-rw-r--r--  drivers/block/drbd/drbd_int.h         2
-rw-r--r--  drivers/block/drbd/drbd_main.c        5
-rw-r--r--  drivers/block/drbd/drbd_proc.c        2
-rw-r--r--  drivers/block/drbd/drbd_receiver.c    1
-rw-r--r--  drivers/block/drbd/drbd_worker.c      2
-rw-r--r--  drivers/block/mg_disk.c               2
-rw-r--r--  include/linux/blkdev.h               17
14 files changed, 115 insertions, 299 deletions
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index 961a0513f8c3..a406286f6f3e 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,7 +1,5 @@
 00-INDEX
 	- This file
-as-iosched.txt
-	- Anticipatory IO scheduler
 barrier.txt
 	- I/O Barriers
 biodoc.txt
diff --git a/Documentation/block/as-iosched.txt b/Documentation/block/as-iosched.txt
deleted file mode 100644
index 738b72be128e..000000000000
--- a/Documentation/block/as-iosched.txt
+++ /dev/null
@@ -1,172 +0,0 @@
-Anticipatory IO scheduler
--------------------------
-Nick Piggin <piggin@cyberone.com.au> 13 Sep 2003
-
-Attention! Database servers, especially those using "TCQ" disks should
-investigate performance with the 'deadline' IO scheduler. Any system with high
-disk performance requirements should do so, in fact.
-
-If you see unusual performance characteristics of your disk systems, or you
-see big performance regressions versus the deadline scheduler, please email
-me. Database users don't bother unless you're willing to test a lot of patches
-from me ;) its a known issue.
-
-Also, users with hardware RAID controllers, doing striping, may find
-highly variable performance results with using the as-iosched. The
-as-iosched anticipatory implementation is based on the notion that a disk
-device has only one physical seeking head.  A striped RAID controller
-actually has a head for each physical device in the logical RAID device.
-
-However, setting the antic_expire (see tunable parameters below) produces
-very similar behavior to the deadline IO scheduler.
-
-Selecting IO schedulers
------------------------
-Refer to Documentation/block/switching-sched.txt for information on
-selecting an io scheduler on a per-device basis.
-
-Anticipatory IO scheduler Policies
-----------------------------------
-The as-iosched implementation implements several layers of policies
-to determine when an IO request is dispatched to the disk controller.
-Here are the policies outlined, in order of application.
-
-1. one-way Elevator algorithm.
-
-The elevator algorithm is similar to that used in deadline scheduler, with
-the addition that it allows limited backward movement of the elevator
-(i.e. seeks backwards).  A seek backwards can occur when choosing between
-two IO requests where one is behind the elevator's current position, and
-the other is in front of the elevator's position.  If the seek distance to
-the request in back of the elevator is less than half the seek distance to
-the request in front of the elevator, then the request in back can be chosen.
-Backward seeks are also limited to a maximum of MAXBACK (1024*1024) sectors.
-This favors forward movement of the elevator, while allowing opportunistic
-"short" backward seeks.
-
-2. FIFO expiration times for reads and for writes.
-
-This is again very similar to the deadline IO scheduler.  The expiration
-times for requests on these lists is tunable using the parameters read_expire
-and write_expire discussed below.  When a read or a write expires in this way,
-the IO scheduler will interrupt its current elevator sweep or read anticipation
-to service the expired request.
-
-3. Read and write request batching
-
-A batch is a collection of read requests or a collection of write
-requests.  The as scheduler alternates dispatching read and write batches
-to the driver.  In the case a read batch, the scheduler submits read
-requests to the driver as long as there are read requests to submit, and
-the read batch time limit has not been exceeded (read_batch_expire).
-The read batch time limit begins counting down only when there are
-competing write requests pending.
-
-In the case of a write batch, the scheduler submits write requests to
-the driver as long as there are write requests available, and the
-write batch time limit has not been exceeded (write_batch_expire).
-However, the length of write batches will be gradually shortened
-when read batches frequently exceed their time limit.
-
-When changing between batch types, the scheduler waits for all requests
-from the previous batch to complete before scheduling requests for the
-next batch.
-
-The read and write fifo expiration times described in policy 2 above
-are checked only when in scheduling IO of a batch for the corresponding
-(read/write) type.  So for example, the read FIFO timeout values are
-tested only during read batches.  Likewise, the write FIFO timeout
-values are tested only during write batches.  For this reason,
-it is generally not recommended for the read batch time
-to be longer than the write expiration time, nor for the write batch
-time to exceed the read expiration time (see tunable parameters below).
-
-When the IO scheduler changes from a read to a write batch,
-it begins the elevator from the request that is on the head of the
-write expiration FIFO.  Likewise, when changing from a write batch to
-a read batch, scheduler begins the elevator from the first entry
-on the read expiration FIFO.
-
-4. Read anticipation.
-
-Read anticipation occurs only when scheduling a read batch.
-This implementation of read anticipation allows only one read request
-to be dispatched to the disk controller at a time.  In
-contrast, many write requests may be dispatched to the disk controller
-at a time during a write batch.  It is this characteristic that can make
-the anticipatory scheduler perform anomalously with controllers supporting
-TCQ, or with hardware striped RAID devices.  Setting the antic_expire
-queue parameter (see below) to zero disables this behavior, and the
-anticipatory scheduler behaves essentially like the deadline scheduler.
-
-When read anticipation is enabled (antic_expire is not zero), reads
-are dispatched to the disk controller one at a time.
-At the end of each read request, the IO scheduler examines its next
-candidate read request from its sorted read list.  If that next request
-is from the same process as the request that just completed,
-or if the next request in the queue is "very close" to the
-just completed request, it is dispatched immediately.  Otherwise,
-statistics (average think time, average seek distance) on the process
-that submitted the just completed request are examined.  If it seems
-likely that that process will submit another request soon, and that
-request is likely to be near the just completed request, then the IO
-scheduler will stop dispatching more read requests for up to (antic_expire)
-milliseconds, hoping that process will submit a new request near the one
-that just completed.  If such a request is made, then it is dispatched
-immediately.  If the antic_expire wait time expires, then the IO scheduler
-will dispatch the next read request from the sorted read queue.
-
-To decide whether an anticipatory wait is worthwhile, the scheduler
-maintains statistics for each process that can be used to compute
-mean "think time" (the time between read requests), and mean seek
-distance for that process.  One observation is that these statistics
-are associated with each process, but those statistics are not associated
-with a specific IO device.  So for example, if a process is doing IO
-on several file systems on separate devices, the statistics will be
-a combination of IO behavior from all those devices.
-
-
-Tuning the anticipatory IO scheduler
-------------------------------------
-When using 'as', the anticipatory IO scheduler there are 5 parameters under
-/sys/block/*/queue/iosched/. All are units of milliseconds.
-
-The parameters are:
-* read_expire
-    Controls how long until a read request becomes "expired". It also controls the
-    interval between which expired requests are served, so set to 50, a request
-    might take anywhere < 100ms to be serviced _if_ it is the next on the
-    expired list. Obviously request expiration strategies won't make the disk
-    go faster. The result basically equates to the timeslice a single reader
-    gets in the presence of other IO. 100*((seek time / read_expire) + 1) is
-    very roughly the % streaming read efficiency your disk should get with
-    multiple readers.
-
-* read_batch_expire
-    Controls how much time a batch of reads is given before pending writes are
-    served. A higher value is more efficient. This might be set below read_expire
-    if writes are to be given higher priority than reads, but reads are to be
-    as efficient as possible when there are no writes. Generally though, it
-    should be some multiple of read_expire.
-
-* write_expire, and
-* write_batch_expire are equivalent to the above, for writes.
-
-* antic_expire
-    Controls the maximum amount of time we can anticipate a good read (one
-    with a short seek distance from the most recently completed request) before
-    giving up. Many other factors may cause anticipation to be stopped early,
-    or some processes will not be "anticipated" at all. Should be a bit higher
-    for big seek time devices though not a linear correspondence - most
-    processes have only a few ms thinktime.
-
-In addition to the tunables above there is a read-only file named est_time
-which, when read, will show:
-
-    - The probability of a task exiting without a cooperating task
-      submitting an anticipated IO.
-
-    - The current mean think time.
-
-    - The seek distance used to determine if an incoming IO is better.
-
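
The document removed above is still the clearest description of how anticipation worked. Purely as a schematic of the policy it describes — every identifier below is a hypothetical stand-in invented for illustration, not the as-iosched code — the per-completion decision amounts to:

/* Schematic of the anticipation policy described in the removed text.
 * All names here are hypothetical stand-ins, not kernel APIs.
 */
#define CLOSE_THRESHOLD_SECTORS 1024

struct proc_stats {
	unsigned long mean_think_time_ms;	/* time between this process's reads */
	unsigned long mean_seek_sectors;	/* how far its reads tend to jump */
};

enum as_decision { AS_DISPATCH_NEXT, AS_WAIT_FOR_COOPERATOR };

static enum as_decision as_after_read_completion(int next_same_process,
						 int next_very_close,
						 const struct proc_stats *st,
						 unsigned long antic_expire_ms)
{
	/* A request from the same submitter, or one right next door,
	 * goes out immediately. */
	if (next_same_process || next_very_close)
		return AS_DISPATCH_NEXT;

	/* Otherwise wait (up to antic_expire) only if the just-served
	 * process looks likely to submit another nearby read soon. */
	if (antic_expire_ms &&
	    st->mean_think_time_ms < antic_expire_ms &&
	    st->mean_seek_sectors < CLOSE_THRESHOLD_SECTORS)
		return AS_WAIT_FOR_COOPERATOR;

	return AS_DISPATCH_NEXT;
}

With antic_expire set to 0 the wait branch can never be taken, which is why the removed text says the scheduler then behaves essentially like deadline.
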
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 8873b9b439ff..8618d8996fea 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -402,7 +402,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	 * our current implementations need. If we'll ever need
 	 * more the interface will need revisiting.
 	 */
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	page = alloc_page(gfp_mask | __GFP_ZERO);
 	if (!page)
 		goto out_free_bio;
 	if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
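
The one-line blk-barrier.c change matters because blkdev_issue_discard() can be reached from contexts that must not recurse into the filesystem while allocating. A minimal sketch of the general pattern (the helper name is made up for the sketch; only alloc_page() and __GFP_ZERO are real):

#include <linux/gfp.h>

/* Hypothetical helper: allocate a payload page on behalf of a caller.
 * Using the caller-supplied gfp_mask instead of a hard-coded GFP_KERNEL
 * lets callers in reclaim or I/O context pass GFP_NOIO/GFP_NOFS and avoid
 * re-entering the filesystem while memory is being reclaimed. */
static struct page *issue_payload_page(gfp_t gfp_mask)
{
	return alloc_page(gfp_mask | __GFP_ZERO);
}
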
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6ae118d6e193..d52d4adc440b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -505,21 +505,30 @@ static unsigned int lcm(unsigned int a, unsigned int b)
 
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
- * @t: the stacking driver limits (top)
- * @b: the underlying queue limits (bottom)
+ * @t: the stacking driver limits (top device)
+ * @b: the underlying queue limits (bottom, component device)
  * @offset: offset to beginning of data within component device
  *
  * Description:
- *    Merges two queue_limit structs.  Returns 0 if alignment didn't
- *    change.  Returns -1 if adding the bottom device caused
- *    misalignment.
+ *    This function is used by stacking drivers like MD and DM to ensure
+ *    that all component devices have compatible block sizes and
+ *    alignments.  The stacking driver must provide a queue_limits
+ *    struct (top) and then iteratively call the stacking function for
+ *    all component (bottom) devices.  The stacking function will
+ *    attempt to combine the values and ensure proper alignment.
+ *
+ *    Returns 0 if the top and bottom queue_limits are compatible.  The
+ *    top device's block sizes and alignment offsets may be adjusted to
+ *    ensure alignment with the bottom device. If no compatible sizes
+ *    and alignments exist, -1 is returned and the resulting top
+ *    queue_limits will have the misaligned flag set to indicate that
+ *    the alignment_offset is undefined.
  */
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		     sector_t offset)
 {
-	int ret;
-
-	ret = 0;
+	sector_t alignment;
+	unsigned int top, bottom;
 
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
@@ -537,6 +546,22 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_segment_size = min_not_zero(t->max_segment_size,
 					   b->max_segment_size);
 
+	alignment = queue_limit_alignment_offset(b, offset);
+
+	/* Bottom device has different alignment.  Check that it is
+	 * compatible with the current top alignment.
+	 */
+	if (t->alignment_offset != alignment) {
+
+		top = max(t->physical_block_size, t->io_min)
+			+ t->alignment_offset;
+		bottom = max(b->physical_block_size, b->io_min) + alignment;
+
+		/* Verify that top and bottom intervals line up */
+		if (max(top, bottom) & (min(top, bottom) - 1))
+			t->misaligned = 1;
+	}
+
 	t->logical_block_size = max(t->logical_block_size,
 				    b->logical_block_size);
 
@@ -544,54 +569,64 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 				       b->physical_block_size);
 
 	t->io_min = max(t->io_min, b->io_min);
+	t->io_opt = lcm(t->io_opt, b->io_opt);
+
 	t->no_cluster |= b->no_cluster;
 	t->discard_zeroes_data &= b->discard_zeroes_data;
 
-	/* Bottom device offset aligned? */
-	if (offset &&
-	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
+	/* Physical block size a multiple of the logical block size? */
+	if (t->physical_block_size & (t->logical_block_size - 1)) {
+		t->physical_block_size = t->logical_block_size;
 		t->misaligned = 1;
-		ret = -1;
 	}
 
-	/*
-	 * Temporarily disable discard granularity. It's currently buggy
-	 * since we default to 0 for discard_granularity, hence this
-	 * "failure" will always trigger for non-zero offsets.
-	 */
-#if 0
-	if (offset &&
-	    (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
-		t->discard_misaligned = 1;
-		ret = -1;
+	/* Minimum I/O a multiple of the physical block size? */
+	if (t->io_min & (t->physical_block_size - 1)) {
+		t->io_min = t->physical_block_size;
+		t->misaligned = 1;
 	}
-#endif
-
-	/* If top has no alignment offset, inherit from bottom */
-	if (!t->alignment_offset)
-		t->alignment_offset =
-			b->alignment_offset & (b->physical_block_size - 1);
 
-	if (!t->discard_alignment)
-		t->discard_alignment =
-			b->discard_alignment & (b->discard_granularity - 1);
-
-	/* Top device aligned on logical block boundary? */
-	if (t->alignment_offset & (t->logical_block_size - 1)) {
+	/* Optimal I/O a multiple of the physical block size? */
+	if (t->io_opt & (t->physical_block_size - 1)) {
+		t->io_opt = 0;
 		t->misaligned = 1;
-		ret = -1;
 	}
 
-	/* Find lcm() of optimal I/O size and granularity */
-	t->io_opt = lcm(t->io_opt, b->io_opt);
-	t->discard_granularity = lcm(t->discard_granularity,
-				     b->discard_granularity);
+	/* Find lowest common alignment_offset */
+	t->alignment_offset = lcm(t->alignment_offset, alignment)
+		& (max(t->physical_block_size, t->io_min) - 1);
 
-	/* Verify that optimal I/O size is a multiple of io_min */
-	if (t->io_min && t->io_opt % t->io_min)
-		ret = -1;
+	/* Verify that new alignment_offset is on a logical block boundary */
+	if (t->alignment_offset & (t->logical_block_size - 1))
+		t->misaligned = 1;
+
+	/* Discard alignment and granularity */
+	if (b->discard_granularity) {
+		unsigned int granularity = b->discard_granularity;
+		offset &= granularity - 1;
+
+		alignment = (granularity + b->discard_alignment - offset)
+			& (granularity - 1);
+
+		if (t->discard_granularity != 0 &&
+		    t->discard_alignment != alignment) {
+			top = t->discard_granularity + t->discard_alignment;
+			bottom = b->discard_granularity + alignment;
+
+			/* Verify that top and bottom intervals line up */
+			if (max(top, bottom) & (min(top, bottom) - 1))
+				t->discard_misaligned = 1;
+		}
+
+		t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
+						      b->max_discard_sectors);
+		t->discard_granularity = max(t->discard_granularity,
+					     b->discard_granularity);
+		t->discard_alignment = lcm(t->discard_alignment, alignment) &
+			(t->discard_granularity - 1);
+	}
 
-	return ret;
+	return t->misaligned ? -1 : 0;
 }
 EXPORT_SYMBOL(blk_stack_limits);
 
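
The rewritten kernel-doc above says a stacking driver should call blk_stack_limits() once per component device, passing where that component's data starts. A minimal sketch of that loop, assuming a hypothetical driver that keeps an array of component block devices and their start sectors (illustrative only, not copied from MD or DM):

#include <linux/blkdev.h>

/* Hypothetical stacking-driver fragment: fold every component device's
 * queue_limits into the top-level limits, as the blk_stack_limits()
 * kernel-doc above describes.  'parts' and 'start_sector' are invented
 * for the sketch. */
static int stack_all_component_limits(struct queue_limits *top,
				      struct block_device **parts,
				      sector_t *start_sector, int nr)
{
	int i, ret = 0;

	for (i = 0; i < nr; i++) {
		struct request_queue *q = bdev_get_queue(parts[i]);

		/* the offset argument is in bytes, hence sectors << 9 */
		if (blk_stack_limits(top, &q->limits,
				     start_sector[i] << 9) < 0)
			ret = -1;	/* top->misaligned is now set */
	}

	return ret;
}
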
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e2f80463ed0d..918c7fd9aeb1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -208,8 +208,6 @@ struct cfq_data {
 	/* Root service tree for cfq_groups */
 	struct cfq_rb_root grp_service_tree;
 	struct cfq_group root_group;
-	/* Number of active cfq groups on group service tree */
-	int nr_groups;
 
 	/*
 	 * The priority currently being served
@@ -294,8 +292,7 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
 					    enum wl_prio_t prio,
-					    enum wl_type_t type,
-					    struct cfq_data *cfqd)
+					    enum wl_type_t type)
 {
 	if (!cfqg)
 		return NULL;
@@ -842,7 +839,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 	__cfq_group_service_tree_add(st, cfqg);
 	cfqg->on_st = true;
-	cfqd->nr_groups++;
 	st->total_weight += cfqg->weight;
 }
 
@@ -863,7 +859,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
 	cfqg->on_st = false;
-	cfqd->nr_groups--;
 	st->total_weight -= cfqg->weight;
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		cfq_rb_erase(&cfqg->rb_node, st);
@@ -1150,7 +1145,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 #endif
 
 	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
-						cfqq_type(cfqq), cfqd);
+						cfqq_type(cfqq));
 	if (cfq_class_idle(cfqq)) {
 		rb_key = CFQ_IDLE_DELAY;
 		parent = rb_last(&service_tree->rb);
@@ -1513,9 +1508,6 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 	struct cfq_io_context *cic;
 	struct cfq_queue *cfqq;
 
-	/* Deny merge if bio and rq don't belong to same cfq group */
-	if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0))
-		return false;
 	/*
 	 * Disallow merge of a sync bio into an async request.
 	 */
@@ -1616,7 +1608,7 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_rb_root *service_tree =
 		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
-					cfqd->serving_type, cfqd);
+					cfqd->serving_type);
 
 	if (!cfqd->rq_queued)
 		return NULL;
@@ -1675,13 +1667,17 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 #define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
 
 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-			       struct request *rq)
+			       struct request *rq, bool for_preempt)
 {
 	sector_t sdist = cfqq->seek_mean;
 
 	if (!sample_valid(cfqq->seek_samples))
 		sdist = CFQQ_SEEK_THR;
 
+	/* if seek_mean is big, using it as close criteria is meaningless */
+	if (sdist > CFQQ_SEEK_THR && !for_preempt)
+		sdist = CFQQ_SEEK_THR;
+
 	return cfq_dist_from_last(cfqd, rq) <= sdist;
 }
 
@@ -1709,7 +1705,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 	 * will contain the closest sector.
 	 */
 	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
-	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
+	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
 		return __cfqq;
 
 	if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1720,7 +1716,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 		return NULL;
 
 	__cfqq = rb_entry(node, struct cfq_queue, p_node);
-	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
+	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
 		return __cfqq;
 
 	return NULL;
@@ -1963,8 +1959,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 }
 
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
-				    struct cfq_group *cfqg, enum wl_prio_t prio,
-				    bool prio_changed)
+				    struct cfq_group *cfqg, enum wl_prio_t prio)
 {
 	struct cfq_queue *queue;
 	int i;
@@ -1972,24 +1967,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 	unsigned long lowest_key = 0;
 	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
 
-	if (prio_changed) {
-		/*
-		 * When priorities switched, we prefer starting
-		 * from SYNC_NOIDLE (first choice), or just SYNC
-		 * over ASYNC
-		 */
-		if (service_tree_for(cfqg, prio, cur_best, cfqd)->count)
-			return cur_best;
-		cur_best = SYNC_WORKLOAD;
-		if (service_tree_for(cfqg, prio, cur_best, cfqd)->count)
-			return cur_best;
-
-		return ASYNC_WORKLOAD;
-	}
-
-	for (i = 0; i < 3; ++i) {
-		/* otherwise, select the one with lowest rb_key */
-		queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd));
+	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
+		/* select the one with lowest rb_key */
+		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
 		if (queue &&
 		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
 			lowest_key = queue->rb_key;
@@ -2003,8 +1983,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 
 static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	enum wl_prio_t previous_prio = cfqd->serving_prio;
-	bool prio_changed;
 	unsigned slice;
 	unsigned count;
 	struct cfq_rb_root *st;
@@ -2032,24 +2010,19 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
 	 * expiration time
 	 */
-	prio_changed = (cfqd->serving_prio != previous_prio);
-	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type,
-				cfqd);
+	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 	count = st->count;
 
 	/*
-	 * If priority didn't change, check workload expiration,
-	 * and that we still have other queues ready
+	 * check workload expiration, and that we still have other queues ready
 	 */
-	if (!prio_changed && count &&
-	    !time_after(jiffies, cfqd->workload_expires))
+	if (count && !time_after(jiffies, cfqd->workload_expires))
 		return;
 
 	/* otherwise select new workload type */
 	cfqd->serving_type =
-		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed);
-	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type,
-				cfqd);
+		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
+	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 	count = st->count;
 
 	/*
@@ -3143,7 +3116,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	 * if this request is as-good as one we would expect from the
 	 * current cfqq, let it preempt
 	 */
-	if (cfq_rq_close(cfqd, cfqq, rq))
+	if (cfq_rq_close(cfqd, cfqq, rq, true))
 		return true;
 
 	return false;
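
The cfq_rq_close() change above is easiest to see with numbers: a queue whose seek_mean has blown up used to classify almost any request as "close", so queue merging triggered even for purely random I/O. A standalone rendering of the new criterion (plain C; the fixed threshold stands in for CFQQ_SEEK_THR and the sample_valid() handling is omitted):

#include <stdbool.h>
#include <stdio.h>

#define SEEK_THR 8192ULL	/* stand-in for CFQQ_SEEK_THR, in sectors */

/* Mirrors the patched check: outside the preemption path, a huge
 * seek_mean is clamped so it cannot make every request look close. */
static bool rq_close(unsigned long long seek_mean,
		     unsigned long long dist_from_last, bool for_preempt)
{
	unsigned long long sdist = seek_mean;

	if (sdist > SEEK_THR && !for_preempt)
		sdist = SEEK_THR;

	return dist_from_last <= sdist;
}

int main(void)
{
	printf("%d\n", rq_close(10000000ULL, 1000000ULL, false));	/* 0: no longer "close" */
	printf("%d\n", rq_close(10000000ULL, 1000000ULL, true));	/* 1: preemption still uses the raw mean */
	printf("%d\n", rq_close(10000000ULL, 4096ULL, false));		/* 1: genuinely nearby */
	return 0;
}
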
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index eb4fa1943944..ce1fa923c414 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -7101,7 +7101,7 @@ static struct DAC960_privdata DAC960_BA_privdata = {
 
 static struct DAC960_privdata DAC960_LP_privdata = {
 	.HardwareType =		DAC960_LP_Controller,
-	.FirmwareType = 	DAC960_LP_Controller,
+	.FirmwareType = 	DAC960_V2_Controller,
 	.InterruptHandler =	DAC960_LP_InterruptHandler,
 	.MemoryWindowSize =	DAC960_LP_RegisterWindowSize,
 };
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 13bb69d2abb3..64a223b0cc22 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -735,21 +735,6 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
 	part_stat_unlock();
 }
 
-/*
- * Ensure we don't create aliases in VI caches
- */
-static inline void
-killalias(struct bio *bio)
-{
-	struct bio_vec *bv;
-	int i;
-
-	if (bio_data_dir(bio) == READ)
-		__bio_for_each_segment(bv, bio, i, 0) {
-			flush_dcache_page(bv->bv_page);
-		}
-}
-
 void
 aoecmd_ata_rsp(struct sk_buff *skb)
 {
@@ -871,7 +856,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
 		if (buf->flags & BUFFL_FAIL)
 			bio_endio(buf->bio, -EIO);
 		else {
-			killalias(buf->bio);
+			bio_flush_dcache_pages(buf->bio);
 			bio_endio(buf->bio, 0);
 		}
 		mempool_free(buf, d->bufpool);
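
bio_flush_dcache_pages(), which the aoe hunk switches to, covers what the removed killalias() did by hand: walk the bio's segments and flush_dcache_page() each one so virtually indexed caches don't serve stale aliases. Roughly (a sketch shaped after the removed helper rather than a quote of the new interface):

#include <linux/bio.h>
#include <linux/highmem.h>

/* Sketch of a per-segment dcache flush over a bio; the in-tree
 * bio_flush_dcache_pages() is the canonical version of this loop. */
static void flush_bio_segments(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment(bvec, bio, i)
		flush_dcache_page(bvec->bv_page);
}
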
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 2312d782fe99..c97558763430 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1490,7 +1490,7 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
 
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
-extern struct file_operations drbd_proc_fops;
+extern const struct file_operations drbd_proc_fops;
 extern const char *drbd_conn_str(enum drbd_conns s);
 extern const char *drbd_role_str(enum drbd_role s);
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 157d1e4343c2..9348f33f6242 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -27,7 +27,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/drbd.h>
 #include <asm/uaccess.h>
 #include <asm/types.h>
@@ -151,7 +150,7 @@ wait_queue_head_t drbd_pp_wait;
 
 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
 
-static struct block_device_operations drbd_ops = {
+static const struct block_device_operations drbd_ops = {
 	.owner =   THIS_MODULE,
 	.open =    drbd_open,
 	.release = drbd_release,
@@ -3623,7 +3622,7 @@ _drbd_fault_random(struct fault_random_state *rsp)
 {
 	long refresh;
 
-	if (--rsp->count < 0) {
+	if (!rsp->count--) {
 		get_random_bytes(&refresh, sizeof(refresh));
 		rsp->state += refresh;
 		rsp->count = FAULT_RANDOM_REFRESH;
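
The _drbd_fault_random() hunk is the classic unsigned-counter pitfall: rsp->count is unsigned, so the old "--rsp->count < 0" test could never be true and the refresh branch was dead code. A standalone illustration:

#include <stdio.h>

int main(void)
{
	unsigned int count = 0;

	/* Old test: an unsigned value is never negative, so this branch is
	 * unreachable -- the decrement just wraps around to UINT_MAX. */
	if (--count < 0)
		printf("old test fired\n");	/* never printed */

	count = 0;
	/* New test: fires exactly when the counter has reached zero. */
	if (!count--)
		printf("new test fired\n");	/* printed */

	return 0;
}
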
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index bdd0b4943b10..df8ad9660d8f 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -38,7 +38,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file);
 
 
 struct proc_dir_entry *drbd_proc;
-struct file_operations drbd_proc_fops = {
+const struct file_operations drbd_proc_fops = {
 	.owner		= THIS_MODULE,
 	.open		= drbd_proc_open,
 	.read		= seq_read,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index c548f24f54a1..259c1351b152 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -28,7 +28,6 @@
 #include <asm/uaccess.h>
 #include <net/sock.h>
 
-#include <linux/version.h>
 #include <linux/drbd.h>
 #include <linux/fs.h>
 #include <linux/file.h>
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ed8796f1112d..b453c2bca3be 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -24,7 +24,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/drbd.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
@@ -34,7 +33,6 @@
 #include <linux/mm_inline.h>
 #include <linux/slab.h>
 #include <linux/random.h>
-#include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/scatterlist.h>
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index e0339aaa1815..02b2583df7fc 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -860,7 +860,7 @@ static int mg_probe(struct platform_device *plat_dev)
 		err = -EINVAL;
 		goto probe_err_2;
 	}
-	host->dev_base = ioremap(rsc->start , rsc->end + 1);
+	host->dev_base = ioremap(rsc->start, resource_size(rsc));
 	if (!host->dev_base) {
 		printk(KERN_ERR "%s:%d ioremap fail\n",
 				__func__, __LINE__);
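
resource_size() is defined in <linux/ioport.h> as res->end - res->start + 1, i.e. the length of an inclusive [start, end] range. The old mg_disk call passed rsc->end + 1 — an absolute end address, not a length — as the ioremap() size. A worked example with made-up addresses:

#include <stdio.h>

/* Userspace rendering of resource_size() semantics: a resource spanning
 * [start, end] inclusive is end - start + 1 bytes long. */
struct res_like {
	unsigned long start;
	unsigned long end;
};

static unsigned long res_size(const struct res_like *r)
{
	return r->end - r->start + 1;
}

int main(void)
{
	struct res_like rsc = { .start = 0x8000000UL, .end = 0x800ffffUL };

	printf("resource_size(): %#lx\n", res_size(&rsc));	/* 0x10000 (64 KiB) */
	printf("old argument:    %#lx\n", rsc.end + 1);		/* 0x8010000: an address, far too large */
	return 0;
}
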
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 784a919aa0d0..9b98173a8184 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -845,7 +845,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
  * blk_rq_err_bytes()		: bytes left till the next error boundary
  * blk_rq_sectors()		: sectors left in the entire request
  * blk_rq_cur_sectors()		: sectors left in the current segment
- * blk_rq_err_sectors()		: sectors left till the next error boundary
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
@@ -874,11 +873,6 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
-static inline unsigned int blk_rq_err_sectors(const struct request *rq)
-{
-	return blk_rq_err_bytes(rq) >> 9;
-}
-
 /*
  * Request issue related functions.
  */
@@ -1116,11 +1110,18 @@ static inline int queue_alignment_offset(struct request_queue *q)
 	return q->limits.alignment_offset;
 }
 
+static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset)
+{
+	unsigned int granularity = max(lim->physical_block_size, lim->io_min);
+
+	offset &= granularity - 1;
+	return (granularity + lim->alignment_offset - offset) & (granularity - 1);
+}
+
 static inline int queue_sector_alignment_offset(struct request_queue *q,
 						 sector_t sector)
 {
-	return ((sector << 9) - q->limits.alignment_offset)
-		& (q->limits.io_min - 1);
+	return queue_limit_alignment_offset(&q->limits, sector << 9);
 }
 
 static inline int bdev_alignment_offset(struct block_device *bdev)
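
A worked example of the queue_limit_alignment_offset() helper added above (the numbers are invented; the arithmetic is the helper's): with physical_block_size = io_min = 4096 and alignment_offset = 3584 — the classic 512-byte-emulation drive with a partition starting at sector 63 — a range starting at byte offset 0 still reports 3584 bytes of misalignment, while one starting at byte 3584 reports none.

#include <stdio.h>

/* Same arithmetic as queue_limit_alignment_offset() in the hunk above,
 * lifted into plain C so the example runs anywhere.  Assumes the
 * granularity is a power of two, as the kernel's mask trick does. */
static unsigned int limit_alignment_offset(unsigned int physical_block_size,
					   unsigned int io_min,
					   unsigned int alignment_offset,
					   unsigned long long offset)
{
	unsigned int granularity = physical_block_size > io_min ?
						physical_block_size : io_min;

	offset &= granularity - 1;
	return (granularity + alignment_offset - offset) & (granularity - 1);
}

int main(void)
{
	printf("%u\n", limit_alignment_offset(4096, 4096, 3584, 0));	/* 3584 */
	printf("%u\n", limit_alignment_offset(4096, 4096, 3584, 3584));	/* 0 */
	return 0;
}
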