diff options
-rw-r--r-- | Documentation/block/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/block/as-iosched.txt | 172 | ||||
-rw-r--r-- | block/blk-barrier.c | 2 | ||||
-rw-r--r-- | block/blk-settings.c | 121 | ||||
-rw-r--r-- | block/cfq-iosched.c | 67 | ||||
-rw-r--r-- | drivers/block/DAC960.c | 2 | ||||
-rw-r--r-- | drivers/block/aoe/aoecmd.c | 17 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 5 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 2 | ||||
-rw-r--r-- | drivers/block/mg_disk.c | 2 | ||||
-rw-r--r-- | include/linux/blkdev.h | 17 |
14 files changed, 115 insertions, 299 deletions
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index 961a0513f8c3..a406286f6f3e 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX | |||
@@ -1,7 +1,5 @@ | |||
1 | 00-INDEX | 1 | 00-INDEX |
2 | - This file | 2 | - This file |
3 | as-iosched.txt | ||
4 | - Anticipatory IO scheduler | ||
5 | barrier.txt | 3 | barrier.txt |
6 | - I/O Barriers | 4 | - I/O Barriers |
7 | biodoc.txt | 5 | biodoc.txt |
diff --git a/Documentation/block/as-iosched.txt b/Documentation/block/as-iosched.txt deleted file mode 100644 index 738b72be128e..000000000000 --- a/Documentation/block/as-iosched.txt +++ /dev/null | |||
@@ -1,172 +0,0 @@ | |||
1 | Anticipatory IO scheduler | ||
2 | ------------------------- | ||
3 | Nick Piggin <piggin@cyberone.com.au> 13 Sep 2003 | ||
4 | |||
5 | Attention! Database servers, especially those using "TCQ" disks should | ||
6 | investigate performance with the 'deadline' IO scheduler. Any system with high | ||
7 | disk performance requirements should do so, in fact. | ||
8 | |||
9 | If you see unusual performance characteristics of your disk systems, or you | ||
10 | see big performance regressions versus the deadline scheduler, please email | ||
11 | me. Database users don't bother unless you're willing to test a lot of patches | ||
12 | from me ;) its a known issue. | ||
13 | |||
14 | Also, users with hardware RAID controllers, doing striping, may find | ||
15 | highly variable performance results with using the as-iosched. The | ||
16 | as-iosched anticipatory implementation is based on the notion that a disk | ||
17 | device has only one physical seeking head. A striped RAID controller | ||
18 | actually has a head for each physical device in the logical RAID device. | ||
19 | |||
20 | However, setting the antic_expire (see tunable parameters below) produces | ||
21 | very similar behavior to the deadline IO scheduler. | ||
22 | |||
23 | Selecting IO schedulers | ||
24 | ----------------------- | ||
25 | Refer to Documentation/block/switching-sched.txt for information on | ||
26 | selecting an io scheduler on a per-device basis. | ||
27 | |||
28 | Anticipatory IO scheduler Policies | ||
29 | ---------------------------------- | ||
30 | The as-iosched implementation implements several layers of policies | ||
31 | to determine when an IO request is dispatched to the disk controller. | ||
32 | Here are the policies outlined, in order of application. | ||
33 | |||
34 | 1. one-way Elevator algorithm. | ||
35 | |||
36 | The elevator algorithm is similar to that used in deadline scheduler, with | ||
37 | the addition that it allows limited backward movement of the elevator | ||
38 | (i.e. seeks backwards). A seek backwards can occur when choosing between | ||
39 | two IO requests where one is behind the elevator's current position, and | ||
40 | the other is in front of the elevator's position. If the seek distance to | ||
41 | the request in back of the elevator is less than half the seek distance to | ||
42 | the request in front of the elevator, then the request in back can be chosen. | ||
43 | Backward seeks are also limited to a maximum of MAXBACK (1024*1024) sectors. | ||
44 | This favors forward movement of the elevator, while allowing opportunistic | ||
45 | "short" backward seeks. | ||
46 | |||
47 | 2. FIFO expiration times for reads and for writes. | ||
48 | |||
49 | This is again very similar to the deadline IO scheduler. The expiration | ||
50 | times for requests on these lists is tunable using the parameters read_expire | ||
51 | and write_expire discussed below. When a read or a write expires in this way, | ||
52 | the IO scheduler will interrupt its current elevator sweep or read anticipation | ||
53 | to service the expired request. | ||
54 | |||
55 | 3. Read and write request batching | ||
56 | |||
57 | A batch is a collection of read requests or a collection of write | ||
58 | requests. The as scheduler alternates dispatching read and write batches | ||
59 | to the driver. In the case a read batch, the scheduler submits read | ||
60 | requests to the driver as long as there are read requests to submit, and | ||
61 | the read batch time limit has not been exceeded (read_batch_expire). | ||
62 | The read batch time limit begins counting down only when there are | ||
63 | competing write requests pending. | ||
64 | |||
65 | In the case of a write batch, the scheduler submits write requests to | ||
66 | the driver as long as there are write requests available, and the | ||
67 | write batch time limit has not been exceeded (write_batch_expire). | ||
68 | However, the length of write batches will be gradually shortened | ||
69 | when read batches frequently exceed their time limit. | ||
70 | |||
71 | When changing between batch types, the scheduler waits for all requests | ||
72 | from the previous batch to complete before scheduling requests for the | ||
73 | next batch. | ||
74 | |||
75 | The read and write fifo expiration times described in policy 2 above | ||
76 | are checked only when in scheduling IO of a batch for the corresponding | ||
77 | (read/write) type. So for example, the read FIFO timeout values are | ||
78 | tested only during read batches. Likewise, the write FIFO timeout | ||
79 | values are tested only during write batches. For this reason, | ||
80 | it is generally not recommended for the read batch time | ||
81 | to be longer than the write expiration time, nor for the write batch | ||
82 | time to exceed the read expiration time (see tunable parameters below). | ||
83 | |||
84 | When the IO scheduler changes from a read to a write batch, | ||
85 | it begins the elevator from the request that is on the head of the | ||
86 | write expiration FIFO. Likewise, when changing from a write batch to | ||
87 | a read batch, scheduler begins the elevator from the first entry | ||
88 | on the read expiration FIFO. | ||
89 | |||
90 | 4. Read anticipation. | ||
91 | |||
92 | Read anticipation occurs only when scheduling a read batch. | ||
93 | This implementation of read anticipation allows only one read request | ||
94 | to be dispatched to the disk controller at a time. In | ||
95 | contrast, many write requests may be dispatched to the disk controller | ||
96 | at a time during a write batch. It is this characteristic that can make | ||
97 | the anticipatory scheduler perform anomalously with controllers supporting | ||
98 | TCQ, or with hardware striped RAID devices. Setting the antic_expire | ||
99 | queue parameter (see below) to zero disables this behavior, and the | ||
100 | anticipatory scheduler behaves essentially like the deadline scheduler. | ||
101 | |||
102 | When read anticipation is enabled (antic_expire is not zero), reads | ||
103 | are dispatched to the disk controller one at a time. | ||
104 | At the end of each read request, the IO scheduler examines its next | ||
105 | candidate read request from its sorted read list. If that next request | ||
106 | is from the same process as the request that just completed, | ||
107 | or if the next request in the queue is "very close" to the | ||
108 | just completed request, it is dispatched immediately. Otherwise, | ||
109 | statistics (average think time, average seek distance) on the process | ||
110 | that submitted the just completed request are examined. If it seems | ||
111 | likely that that process will submit another request soon, and that | ||
112 | request is likely to be near the just completed request, then the IO | ||
113 | scheduler will stop dispatching more read requests for up to (antic_expire) | ||
114 | milliseconds, hoping that process will submit a new request near the one | ||
115 | that just completed. If such a request is made, then it is dispatched | ||
116 | immediately. If the antic_expire wait time expires, then the IO scheduler | ||
117 | will dispatch the next read request from the sorted read queue. | ||
118 | |||
119 | To decide whether an anticipatory wait is worthwhile, the scheduler | ||
120 | maintains statistics for each process that can be used to compute | ||
121 | mean "think time" (the time between read requests), and mean seek | ||
122 | distance for that process. One observation is that these statistics | ||
123 | are associated with each process, but those statistics are not associated | ||
124 | with a specific IO device. So for example, if a process is doing IO | ||
125 | on several file systems on separate devices, the statistics will be | ||
126 | a combination of IO behavior from all those devices. | ||
127 | |||
128 | |||
129 | Tuning the anticipatory IO scheduler | ||
130 | ------------------------------------ | ||
131 | When using 'as', the anticipatory IO scheduler there are 5 parameters under | ||
132 | /sys/block/*/queue/iosched/. All are units of milliseconds. | ||
133 | |||
134 | The parameters are: | ||
135 | * read_expire | ||
136 | Controls how long until a read request becomes "expired". It also controls the | ||
137 | interval between which expired requests are served, so set to 50, a request | ||
138 | might take anywhere < 100ms to be serviced _if_ it is the next on the | ||
139 | expired list. Obviously request expiration strategies won't make the disk | ||
140 | go faster. The result basically equates to the timeslice a single reader | ||
141 | gets in the presence of other IO. 100*((seek time / read_expire) + 1) is | ||
142 | very roughly the % streaming read efficiency your disk should get with | ||
143 | multiple readers. | ||
144 | |||
145 | * read_batch_expire | ||
146 | Controls how much time a batch of reads is given before pending writes are | ||
147 | served. A higher value is more efficient. This might be set below read_expire | ||
148 | if writes are to be given higher priority than reads, but reads are to be | ||
149 | as efficient as possible when there are no writes. Generally though, it | ||
150 | should be some multiple of read_expire. | ||
151 | |||
152 | * write_expire, and | ||
153 | * write_batch_expire are equivalent to the above, for writes. | ||
154 | |||
155 | * antic_expire | ||
156 | Controls the maximum amount of time we can anticipate a good read (one | ||
157 | with a short seek distance from the most recently completed request) before | ||
158 | giving up. Many other factors may cause anticipation to be stopped early, | ||
159 | or some processes will not be "anticipated" at all. Should be a bit higher | ||
160 | for big seek time devices though not a linear correspondence - most | ||
161 | processes have only a few ms thinktime. | ||
162 | |||
163 | In addition to the tunables above there is a read-only file named est_time | ||
164 | which, when read, will show: | ||
165 | |||
166 | - The probability of a task exiting without a cooperating task | ||
167 | submitting an anticipated IO. | ||
168 | |||
169 | - The current mean think time. | ||
170 | |||
171 | - The seek distance used to determine if an incoming IO is better. | ||
172 | |||
diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8873b9b439ff..8618d8996fea 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c | |||
@@ -402,7 +402,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
402 | * our current implementations need. If we'll ever need | 402 | * our current implementations need. If we'll ever need |
403 | * more the interface will need revisiting. | 403 | * more the interface will need revisiting. |
404 | */ | 404 | */ |
405 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | 405 | page = alloc_page(gfp_mask | __GFP_ZERO); |
406 | if (!page) | 406 | if (!page) |
407 | goto out_free_bio; | 407 | goto out_free_bio; |
408 | if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) | 408 | if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) |
diff --git a/block/blk-settings.c b/block/blk-settings.c index 6ae118d6e193..d52d4adc440b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -505,21 +505,30 @@ static unsigned int lcm(unsigned int a, unsigned int b) | |||
505 | 505 | ||
506 | /** | 506 | /** |
507 | * blk_stack_limits - adjust queue_limits for stacked devices | 507 | * blk_stack_limits - adjust queue_limits for stacked devices |
508 | * @t: the stacking driver limits (top) | 508 | * @t: the stacking driver limits (top device) |
509 | * @b: the underlying queue limits (bottom) | 509 | * @b: the underlying queue limits (bottom, component device) |
510 | * @offset: offset to beginning of data within component device | 510 | * @offset: offset to beginning of data within component device |
511 | * | 511 | * |
512 | * Description: | 512 | * Description: |
513 | * Merges two queue_limit structs. Returns 0 if alignment didn't | 513 | * This function is used by stacking drivers like MD and DM to ensure |
514 | * change. Returns -1 if adding the bottom device caused | 514 | * that all component devices have compatible block sizes and |
515 | * misalignment. | 515 | * alignments. The stacking driver must provide a queue_limits |
516 | * struct (top) and then iteratively call the stacking function for | ||
517 | * all component (bottom) devices. The stacking function will | ||
518 | * attempt to combine the values and ensure proper alignment. | ||
519 | * | ||
520 | * Returns 0 if the top and bottom queue_limits are compatible. The | ||
521 | * top device's block sizes and alignment offsets may be adjusted to | ||
522 | * ensure alignment with the bottom device. If no compatible sizes | ||
523 | * and alignments exist, -1 is returned and the resulting top | ||
524 | * queue_limits will have the misaligned flag set to indicate that | ||
525 | * the alignment_offset is undefined. | ||
516 | */ | 526 | */ |
517 | int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | 527 | int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, |
518 | sector_t offset) | 528 | sector_t offset) |
519 | { | 529 | { |
520 | int ret; | 530 | sector_t alignment; |
521 | 531 | unsigned int top, bottom; | |
522 | ret = 0; | ||
523 | 532 | ||
524 | t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); | 533 | t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); |
525 | t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); | 534 | t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); |
@@ -537,6 +546,22 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
537 | t->max_segment_size = min_not_zero(t->max_segment_size, | 546 | t->max_segment_size = min_not_zero(t->max_segment_size, |
538 | b->max_segment_size); | 547 | b->max_segment_size); |
539 | 548 | ||
549 | alignment = queue_limit_alignment_offset(b, offset); | ||
550 | |||
551 | /* Bottom device has different alignment. Check that it is | ||
552 | * compatible with the current top alignment. | ||
553 | */ | ||
554 | if (t->alignment_offset != alignment) { | ||
555 | |||
556 | top = max(t->physical_block_size, t->io_min) | ||
557 | + t->alignment_offset; | ||
558 | bottom = max(b->physical_block_size, b->io_min) + alignment; | ||
559 | |||
560 | /* Verify that top and bottom intervals line up */ | ||
561 | if (max(top, bottom) & (min(top, bottom) - 1)) | ||
562 | t->misaligned = 1; | ||
563 | } | ||
564 | |||
540 | t->logical_block_size = max(t->logical_block_size, | 565 | t->logical_block_size = max(t->logical_block_size, |
541 | b->logical_block_size); | 566 | b->logical_block_size); |
542 | 567 | ||
@@ -544,54 +569,64 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
544 | b->physical_block_size); | 569 | b->physical_block_size); |
545 | 570 | ||
546 | t->io_min = max(t->io_min, b->io_min); | 571 | t->io_min = max(t->io_min, b->io_min); |
572 | t->io_opt = lcm(t->io_opt, b->io_opt); | ||
573 | |||
547 | t->no_cluster |= b->no_cluster; | 574 | t->no_cluster |= b->no_cluster; |
548 | t->discard_zeroes_data &= b->discard_zeroes_data; | 575 | t->discard_zeroes_data &= b->discard_zeroes_data; |
549 | 576 | ||
550 | /* Bottom device offset aligned? */ | 577 | /* Physical block size a multiple of the logical block size? */ |
551 | if (offset && | 578 | if (t->physical_block_size & (t->logical_block_size - 1)) { |
552 | (offset & (b->physical_block_size - 1)) != b->alignment_offset) { | 579 | t->physical_block_size = t->logical_block_size; |
553 | t->misaligned = 1; | 580 | t->misaligned = 1; |
554 | ret = -1; | ||
555 | } | 581 | } |
556 | 582 | ||
557 | /* | 583 | /* Minimum I/O a multiple of the physical block size? */ |
558 | * Temporarily disable discard granularity. It's currently buggy | 584 | if (t->io_min & (t->physical_block_size - 1)) { |
559 | * since we default to 0 for discard_granularity, hence this | 585 | t->io_min = t->physical_block_size; |
560 | * "failure" will always trigger for non-zero offsets. | 586 | t->misaligned = 1; |
561 | */ | ||
562 | #if 0 | ||
563 | if (offset && | ||
564 | (offset & (b->discard_granularity - 1)) != b->discard_alignment) { | ||
565 | t->discard_misaligned = 1; | ||
566 | ret = -1; | ||
567 | } | 587 | } |
568 | #endif | ||
569 | |||
570 | /* If top has no alignment offset, inherit from bottom */ | ||
571 | if (!t->alignment_offset) | ||
572 | t->alignment_offset = | ||
573 | b->alignment_offset & (b->physical_block_size - 1); | ||
574 | 588 | ||
575 | if (!t->discard_alignment) | 589 | /* Optimal I/O a multiple of the physical block size? */ |
576 | t->discard_alignment = | 590 | if (t->io_opt & (t->physical_block_size - 1)) { |
577 | b->discard_alignment & (b->discard_granularity - 1); | 591 | t->io_opt = 0; |
578 | |||
579 | /* Top device aligned on logical block boundary? */ | ||
580 | if (t->alignment_offset & (t->logical_block_size - 1)) { | ||
581 | t->misaligned = 1; | 592 | t->misaligned = 1; |
582 | ret = -1; | ||
583 | } | 593 | } |
584 | 594 | ||
585 | /* Find lcm() of optimal I/O size and granularity */ | 595 | /* Find lowest common alignment_offset */ |
586 | t->io_opt = lcm(t->io_opt, b->io_opt); | 596 | t->alignment_offset = lcm(t->alignment_offset, alignment) |
587 | t->discard_granularity = lcm(t->discard_granularity, | 597 | & (max(t->physical_block_size, t->io_min) - 1); |
588 | b->discard_granularity); | ||
589 | 598 | ||
590 | /* Verify that optimal I/O size is a multiple of io_min */ | 599 | /* Verify that new alignment_offset is on a logical block boundary */ |
591 | if (t->io_min && t->io_opt % t->io_min) | 600 | if (t->alignment_offset & (t->logical_block_size - 1)) |
592 | ret = -1; | 601 | t->misaligned = 1; |
602 | |||
603 | /* Discard alignment and granularity */ | ||
604 | if (b->discard_granularity) { | ||
605 | unsigned int granularity = b->discard_granularity; | ||
606 | offset &= granularity - 1; | ||
607 | |||
608 | alignment = (granularity + b->discard_alignment - offset) | ||
609 | & (granularity - 1); | ||
610 | |||
611 | if (t->discard_granularity != 0 && | ||
612 | t->discard_alignment != alignment) { | ||
613 | top = t->discard_granularity + t->discard_alignment; | ||
614 | bottom = b->discard_granularity + alignment; | ||
615 | |||
616 | /* Verify that top and bottom intervals line up */ | ||
617 | if (max(top, bottom) & (min(top, bottom) - 1)) | ||
618 | t->discard_misaligned = 1; | ||
619 | } | ||
620 | |||
621 | t->max_discard_sectors = min_not_zero(t->max_discard_sectors, | ||
622 | b->max_discard_sectors); | ||
623 | t->discard_granularity = max(t->discard_granularity, | ||
624 | b->discard_granularity); | ||
625 | t->discard_alignment = lcm(t->discard_alignment, alignment) & | ||
626 | (t->discard_granularity - 1); | ||
627 | } | ||
593 | 628 | ||
594 | return ret; | 629 | return t->misaligned ? -1 : 0; |
595 | } | 630 | } |
596 | EXPORT_SYMBOL(blk_stack_limits); | 631 | EXPORT_SYMBOL(blk_stack_limits); |
597 | 632 | ||
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e2f80463ed0d..918c7fd9aeb1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -208,8 +208,6 @@ struct cfq_data { | |||
208 | /* Root service tree for cfq_groups */ | 208 | /* Root service tree for cfq_groups */ |
209 | struct cfq_rb_root grp_service_tree; | 209 | struct cfq_rb_root grp_service_tree; |
210 | struct cfq_group root_group; | 210 | struct cfq_group root_group; |
211 | /* Number of active cfq groups on group service tree */ | ||
212 | int nr_groups; | ||
213 | 211 | ||
214 | /* | 212 | /* |
215 | * The priority currently being served | 213 | * The priority currently being served |
@@ -294,8 +292,7 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | |||
294 | 292 | ||
295 | static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, | 293 | static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, |
296 | enum wl_prio_t prio, | 294 | enum wl_prio_t prio, |
297 | enum wl_type_t type, | 295 | enum wl_type_t type) |
298 | struct cfq_data *cfqd) | ||
299 | { | 296 | { |
300 | if (!cfqg) | 297 | if (!cfqg) |
301 | return NULL; | 298 | return NULL; |
@@ -842,7 +839,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
842 | 839 | ||
843 | __cfq_group_service_tree_add(st, cfqg); | 840 | __cfq_group_service_tree_add(st, cfqg); |
844 | cfqg->on_st = true; | 841 | cfqg->on_st = true; |
845 | cfqd->nr_groups++; | ||
846 | st->total_weight += cfqg->weight; | 842 | st->total_weight += cfqg->weight; |
847 | } | 843 | } |
848 | 844 | ||
@@ -863,7 +859,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
863 | 859 | ||
864 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | 860 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); |
865 | cfqg->on_st = false; | 861 | cfqg->on_st = false; |
866 | cfqd->nr_groups--; | ||
867 | st->total_weight -= cfqg->weight; | 862 | st->total_weight -= cfqg->weight; |
868 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | 863 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) |
869 | cfq_rb_erase(&cfqg->rb_node, st); | 864 | cfq_rb_erase(&cfqg->rb_node, st); |
@@ -1150,7 +1145,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1150 | #endif | 1145 | #endif |
1151 | 1146 | ||
1152 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), | 1147 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), |
1153 | cfqq_type(cfqq), cfqd); | 1148 | cfqq_type(cfqq)); |
1154 | if (cfq_class_idle(cfqq)) { | 1149 | if (cfq_class_idle(cfqq)) { |
1155 | rb_key = CFQ_IDLE_DELAY; | 1150 | rb_key = CFQ_IDLE_DELAY; |
1156 | parent = rb_last(&service_tree->rb); | 1151 | parent = rb_last(&service_tree->rb); |
@@ -1513,9 +1508,6 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, | |||
1513 | struct cfq_io_context *cic; | 1508 | struct cfq_io_context *cic; |
1514 | struct cfq_queue *cfqq; | 1509 | struct cfq_queue *cfqq; |
1515 | 1510 | ||
1516 | /* Deny merge if bio and rq don't belong to same cfq group */ | ||
1517 | if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) | ||
1518 | return false; | ||
1519 | /* | 1511 | /* |
1520 | * Disallow merge of a sync bio into an async request. | 1512 | * Disallow merge of a sync bio into an async request. |
1521 | */ | 1513 | */ |
@@ -1616,7 +1608,7 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) | |||
1616 | { | 1608 | { |
1617 | struct cfq_rb_root *service_tree = | 1609 | struct cfq_rb_root *service_tree = |
1618 | service_tree_for(cfqd->serving_group, cfqd->serving_prio, | 1610 | service_tree_for(cfqd->serving_group, cfqd->serving_prio, |
1619 | cfqd->serving_type, cfqd); | 1611 | cfqd->serving_type); |
1620 | 1612 | ||
1621 | if (!cfqd->rq_queued) | 1613 | if (!cfqd->rq_queued) |
1622 | return NULL; | 1614 | return NULL; |
@@ -1675,13 +1667,17 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, | |||
1675 | #define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) | 1667 | #define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) |
1676 | 1668 | ||
1677 | static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 1669 | static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
1678 | struct request *rq) | 1670 | struct request *rq, bool for_preempt) |
1679 | { | 1671 | { |
1680 | sector_t sdist = cfqq->seek_mean; | 1672 | sector_t sdist = cfqq->seek_mean; |
1681 | 1673 | ||
1682 | if (!sample_valid(cfqq->seek_samples)) | 1674 | if (!sample_valid(cfqq->seek_samples)) |
1683 | sdist = CFQQ_SEEK_THR; | 1675 | sdist = CFQQ_SEEK_THR; |
1684 | 1676 | ||
1677 | /* if seek_mean is big, using it as close criteria is meaningless */ | ||
1678 | if (sdist > CFQQ_SEEK_THR && !for_preempt) | ||
1679 | sdist = CFQQ_SEEK_THR; | ||
1680 | |||
1685 | return cfq_dist_from_last(cfqd, rq) <= sdist; | 1681 | return cfq_dist_from_last(cfqd, rq) <= sdist; |
1686 | } | 1682 | } |
1687 | 1683 | ||
@@ -1709,7 +1705,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, | |||
1709 | * will contain the closest sector. | 1705 | * will contain the closest sector. |
1710 | */ | 1706 | */ |
1711 | __cfqq = rb_entry(parent, struct cfq_queue, p_node); | 1707 | __cfqq = rb_entry(parent, struct cfq_queue, p_node); |
1712 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) | 1708 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) |
1713 | return __cfqq; | 1709 | return __cfqq; |
1714 | 1710 | ||
1715 | if (blk_rq_pos(__cfqq->next_rq) < sector) | 1711 | if (blk_rq_pos(__cfqq->next_rq) < sector) |
@@ -1720,7 +1716,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, | |||
1720 | return NULL; | 1716 | return NULL; |
1721 | 1717 | ||
1722 | __cfqq = rb_entry(node, struct cfq_queue, p_node); | 1718 | __cfqq = rb_entry(node, struct cfq_queue, p_node); |
1723 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) | 1719 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) |
1724 | return __cfqq; | 1720 | return __cfqq; |
1725 | 1721 | ||
1726 | return NULL; | 1722 | return NULL; |
@@ -1963,8 +1959,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) | |||
1963 | } | 1959 | } |
1964 | 1960 | ||
1965 | static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, | 1961 | static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, |
1966 | struct cfq_group *cfqg, enum wl_prio_t prio, | 1962 | struct cfq_group *cfqg, enum wl_prio_t prio) |
1967 | bool prio_changed) | ||
1968 | { | 1963 | { |
1969 | struct cfq_queue *queue; | 1964 | struct cfq_queue *queue; |
1970 | int i; | 1965 | int i; |
@@ -1972,24 +1967,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, | |||
1972 | unsigned long lowest_key = 0; | 1967 | unsigned long lowest_key = 0; |
1973 | enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; | 1968 | enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; |
1974 | 1969 | ||
1975 | if (prio_changed) { | 1970 | for (i = 0; i <= SYNC_WORKLOAD; ++i) { |
1976 | /* | 1971 | /* select the one with lowest rb_key */ |
1977 | * When priorities switched, we prefer starting | 1972 | queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); |
1978 | * from SYNC_NOIDLE (first choice), or just SYNC | ||
1979 | * over ASYNC | ||
1980 | */ | ||
1981 | if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) | ||
1982 | return cur_best; | ||
1983 | cur_best = SYNC_WORKLOAD; | ||
1984 | if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) | ||
1985 | return cur_best; | ||
1986 | |||
1987 | return ASYNC_WORKLOAD; | ||
1988 | } | ||
1989 | |||
1990 | for (i = 0; i < 3; ++i) { | ||
1991 | /* otherwise, select the one with lowest rb_key */ | ||
1992 | queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); | ||
1993 | if (queue && | 1973 | if (queue && |
1994 | (!key_valid || time_before(queue->rb_key, lowest_key))) { | 1974 | (!key_valid || time_before(queue->rb_key, lowest_key))) { |
1995 | lowest_key = queue->rb_key; | 1975 | lowest_key = queue->rb_key; |
@@ -2003,8 +1983,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, | |||
2003 | 1983 | ||
2004 | static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | 1984 | static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) |
2005 | { | 1985 | { |
2006 | enum wl_prio_t previous_prio = cfqd->serving_prio; | ||
2007 | bool prio_changed; | ||
2008 | unsigned slice; | 1986 | unsigned slice; |
2009 | unsigned count; | 1987 | unsigned count; |
2010 | struct cfq_rb_root *st; | 1988 | struct cfq_rb_root *st; |
@@ -2032,24 +2010,19 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2032 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload | 2010 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload |
2033 | * expiration time | 2011 | * expiration time |
2034 | */ | 2012 | */ |
2035 | prio_changed = (cfqd->serving_prio != previous_prio); | 2013 | st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); |
2036 | st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, | ||
2037 | cfqd); | ||
2038 | count = st->count; | 2014 | count = st->count; |
2039 | 2015 | ||
2040 | /* | 2016 | /* |
2041 | * If priority didn't change, check workload expiration, | 2017 | * check workload expiration, and that we still have other queues ready |
2042 | * and that we still have other queues ready | ||
2043 | */ | 2018 | */ |
2044 | if (!prio_changed && count && | 2019 | if (count && !time_after(jiffies, cfqd->workload_expires)) |
2045 | !time_after(jiffies, cfqd->workload_expires)) | ||
2046 | return; | 2020 | return; |
2047 | 2021 | ||
2048 | /* otherwise select new workload type */ | 2022 | /* otherwise select new workload type */ |
2049 | cfqd->serving_type = | 2023 | cfqd->serving_type = |
2050 | cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); | 2024 | cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); |
2051 | st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, | 2025 | st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); |
2052 | cfqd); | ||
2053 | count = st->count; | 2026 | count = st->count; |
2054 | 2027 | ||
2055 | /* | 2028 | /* |
@@ -3143,7 +3116,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3143 | * if this request is as-good as one we would expect from the | 3116 | * if this request is as-good as one we would expect from the |
3144 | * current cfqq, let it preempt | 3117 | * current cfqq, let it preempt |
3145 | */ | 3118 | */ |
3146 | if (cfq_rq_close(cfqd, cfqq, rq)) | 3119 | if (cfq_rq_close(cfqd, cfqq, rq, true)) |
3147 | return true; | 3120 | return true; |
3148 | 3121 | ||
3149 | return false; | 3122 | return false; |
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index eb4fa1943944..ce1fa923c414 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c | |||
@@ -7101,7 +7101,7 @@ static struct DAC960_privdata DAC960_BA_privdata = { | |||
7101 | 7101 | ||
7102 | static struct DAC960_privdata DAC960_LP_privdata = { | 7102 | static struct DAC960_privdata DAC960_LP_privdata = { |
7103 | .HardwareType = DAC960_LP_Controller, | 7103 | .HardwareType = DAC960_LP_Controller, |
7104 | .FirmwareType = DAC960_LP_Controller, | 7104 | .FirmwareType = DAC960_V2_Controller, |
7105 | .InterruptHandler = DAC960_LP_InterruptHandler, | 7105 | .InterruptHandler = DAC960_LP_InterruptHandler, |
7106 | .MemoryWindowSize = DAC960_LP_RegisterWindowSize, | 7106 | .MemoryWindowSize = DAC960_LP_RegisterWindowSize, |
7107 | }; | 7107 | }; |
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 13bb69d2abb3..64a223b0cc22 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c | |||
@@ -735,21 +735,6 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector | |||
735 | part_stat_unlock(); | 735 | part_stat_unlock(); |
736 | } | 736 | } |
737 | 737 | ||
738 | /* | ||
739 | * Ensure we don't create aliases in VI caches | ||
740 | */ | ||
741 | static inline void | ||
742 | killalias(struct bio *bio) | ||
743 | { | ||
744 | struct bio_vec *bv; | ||
745 | int i; | ||
746 | |||
747 | if (bio_data_dir(bio) == READ) | ||
748 | __bio_for_each_segment(bv, bio, i, 0) { | ||
749 | flush_dcache_page(bv->bv_page); | ||
750 | } | ||
751 | } | ||
752 | |||
753 | void | 738 | void |
754 | aoecmd_ata_rsp(struct sk_buff *skb) | 739 | aoecmd_ata_rsp(struct sk_buff *skb) |
755 | { | 740 | { |
@@ -871,7 +856,7 @@ aoecmd_ata_rsp(struct sk_buff *skb) | |||
871 | if (buf->flags & BUFFL_FAIL) | 856 | if (buf->flags & BUFFL_FAIL) |
872 | bio_endio(buf->bio, -EIO); | 857 | bio_endio(buf->bio, -EIO); |
873 | else { | 858 | else { |
874 | killalias(buf->bio); | 859 | bio_flush_dcache_pages(buf->bio); |
875 | bio_endio(buf->bio, 0); | 860 | bio_endio(buf->bio, 0); |
876 | } | 861 | } |
877 | mempool_free(buf, d->bufpool); | 862 | mempool_free(buf, d->bufpool); |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2312d782fe99..c97558763430 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -1490,7 +1490,7 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); | |||
1490 | 1490 | ||
1491 | /* drbd_proc.c */ | 1491 | /* drbd_proc.c */ |
1492 | extern struct proc_dir_entry *drbd_proc; | 1492 | extern struct proc_dir_entry *drbd_proc; |
1493 | extern struct file_operations drbd_proc_fops; | 1493 | extern const struct file_operations drbd_proc_fops; |
1494 | extern const char *drbd_conn_str(enum drbd_conns s); | 1494 | extern const char *drbd_conn_str(enum drbd_conns s); |
1495 | extern const char *drbd_role_str(enum drbd_role s); | 1495 | extern const char *drbd_role_str(enum drbd_role s); |
1496 | 1496 | ||
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 157d1e4343c2..9348f33f6242 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -27,7 +27,6 @@ | |||
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/version.h> | ||
31 | #include <linux/drbd.h> | 30 | #include <linux/drbd.h> |
32 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
33 | #include <asm/types.h> | 32 | #include <asm/types.h> |
@@ -151,7 +150,7 @@ wait_queue_head_t drbd_pp_wait; | |||
151 | 150 | ||
152 | DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); | 151 | DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); |
153 | 152 | ||
154 | static struct block_device_operations drbd_ops = { | 153 | static const struct block_device_operations drbd_ops = { |
155 | .owner = THIS_MODULE, | 154 | .owner = THIS_MODULE, |
156 | .open = drbd_open, | 155 | .open = drbd_open, |
157 | .release = drbd_release, | 156 | .release = drbd_release, |
@@ -3623,7 +3622,7 @@ _drbd_fault_random(struct fault_random_state *rsp) | |||
3623 | { | 3622 | { |
3624 | long refresh; | 3623 | long refresh; |
3625 | 3624 | ||
3626 | if (--rsp->count < 0) { | 3625 | if (!rsp->count--) { |
3627 | get_random_bytes(&refresh, sizeof(refresh)); | 3626 | get_random_bytes(&refresh, sizeof(refresh)); |
3628 | rsp->state += refresh; | 3627 | rsp->state += refresh; |
3629 | rsp->count = FAULT_RANDOM_REFRESH; | 3628 | rsp->count = FAULT_RANDOM_REFRESH; |
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index bdd0b4943b10..df8ad9660d8f 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -38,7 +38,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file); | |||
38 | 38 | ||
39 | 39 | ||
40 | struct proc_dir_entry *drbd_proc; | 40 | struct proc_dir_entry *drbd_proc; |
41 | struct file_operations drbd_proc_fops = { | 41 | const struct file_operations drbd_proc_fops = { |
42 | .owner = THIS_MODULE, | 42 | .owner = THIS_MODULE, |
43 | .open = drbd_proc_open, | 43 | .open = drbd_proc_open, |
44 | .read = seq_read, | 44 | .read = seq_read, |
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c548f24f54a1..259c1351b152 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -28,7 +28,6 @@ | |||
28 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
29 | #include <net/sock.h> | 29 | #include <net/sock.h> |
30 | 30 | ||
31 | #include <linux/version.h> | ||
32 | #include <linux/drbd.h> | 31 | #include <linux/drbd.h> |
33 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
34 | #include <linux/file.h> | 33 | #include <linux/file.h> |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ed8796f1112d..b453c2bca3be 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -24,7 +24,6 @@ | |||
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/version.h> | ||
28 | #include <linux/drbd.h> | 27 | #include <linux/drbd.h> |
29 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
30 | #include <linux/smp_lock.h> | 29 | #include <linux/smp_lock.h> |
@@ -34,7 +33,6 @@ | |||
34 | #include <linux/mm_inline.h> | 33 | #include <linux/mm_inline.h> |
35 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
36 | #include <linux/random.h> | 35 | #include <linux/random.h> |
37 | #include <linux/mm.h> | ||
38 | #include <linux/string.h> | 36 | #include <linux/string.h> |
39 | #include <linux/scatterlist.h> | 37 | #include <linux/scatterlist.h> |
40 | 38 | ||
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index e0339aaa1815..02b2583df7fc 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c | |||
@@ -860,7 +860,7 @@ static int mg_probe(struct platform_device *plat_dev) | |||
860 | err = -EINVAL; | 860 | err = -EINVAL; |
861 | goto probe_err_2; | 861 | goto probe_err_2; |
862 | } | 862 | } |
863 | host->dev_base = ioremap(rsc->start , rsc->end + 1); | 863 | host->dev_base = ioremap(rsc->start, resource_size(rsc)); |
864 | if (!host->dev_base) { | 864 | if (!host->dev_base) { |
865 | printk(KERN_ERR "%s:%d ioremap fail\n", | 865 | printk(KERN_ERR "%s:%d ioremap fail\n", |
866 | __func__, __LINE__); | 866 | __func__, __LINE__); |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 784a919aa0d0..9b98173a8184 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -845,7 +845,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) | |||
845 | * blk_rq_err_bytes() : bytes left till the next error boundary | 845 | * blk_rq_err_bytes() : bytes left till the next error boundary |
846 | * blk_rq_sectors() : sectors left in the entire request | 846 | * blk_rq_sectors() : sectors left in the entire request |
847 | * blk_rq_cur_sectors() : sectors left in the current segment | 847 | * blk_rq_cur_sectors() : sectors left in the current segment |
848 | * blk_rq_err_sectors() : sectors left till the next error boundary | ||
849 | */ | 848 | */ |
850 | static inline sector_t blk_rq_pos(const struct request *rq) | 849 | static inline sector_t blk_rq_pos(const struct request *rq) |
851 | { | 850 | { |
@@ -874,11 +873,6 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) | |||
874 | return blk_rq_cur_bytes(rq) >> 9; | 873 | return blk_rq_cur_bytes(rq) >> 9; |
875 | } | 874 | } |
876 | 875 | ||
877 | static inline unsigned int blk_rq_err_sectors(const struct request *rq) | ||
878 | { | ||
879 | return blk_rq_err_bytes(rq) >> 9; | ||
880 | } | ||
881 | |||
882 | /* | 876 | /* |
883 | * Request issue related functions. | 877 | * Request issue related functions. |
884 | */ | 878 | */ |
@@ -1116,11 +1110,18 @@ static inline int queue_alignment_offset(struct request_queue *q) | |||
1116 | return q->limits.alignment_offset; | 1110 | return q->limits.alignment_offset; |
1117 | } | 1111 | } |
1118 | 1112 | ||
1113 | static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset) | ||
1114 | { | ||
1115 | unsigned int granularity = max(lim->physical_block_size, lim->io_min); | ||
1116 | |||
1117 | offset &= granularity - 1; | ||
1118 | return (granularity + lim->alignment_offset - offset) & (granularity - 1); | ||
1119 | } | ||
1120 | |||
1119 | static inline int queue_sector_alignment_offset(struct request_queue *q, | 1121 | static inline int queue_sector_alignment_offset(struct request_queue *q, |
1120 | sector_t sector) | 1122 | sector_t sector) |
1121 | { | 1123 | { |
1122 | return ((sector << 9) - q->limits.alignment_offset) | 1124 | return queue_limit_alignment_offset(&q->limits, sector << 9); |
1123 | & (q->limits.io_min - 1); | ||
1124 | } | 1125 | } |
1125 | 1126 | ||
1126 | static inline int bdev_alignment_offset(struct block_device *bdev) | 1127 | static inline int bdev_alignment_offset(struct block_device *bdev) |