Diffstat (limited to 'block')
55 files changed, 1946 insertions, 10228 deletions
diff --git a/block/Kconfig b/block/Kconfig
index f7045aa47edb..8044452a4fd3 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -155,12 +155,6 @@ config BLK_CGROUP_IOLATENCY | |||
155 | 155 | ||
156 | Note, this is an experimental interface and could be changed someday. | 156 | Note, this is an experimental interface and could be changed someday. |
157 | 157 | ||
158 | config BLK_WBT_SQ | ||
159 | bool "Single queue writeback throttling" | ||
160 | depends on BLK_WBT | ||
161 | ---help--- | ||
162 | Enable writeback throttling by default on legacy single queue devices | ||
163 | |||
164 | config BLK_WBT_MQ | 158 | config BLK_WBT_MQ |
165 | bool "Multiqueue writeback throttling" | 159 | bool "Multiqueue writeback throttling" |
166 | default y | 160 | default y |
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f95a48b0d7b2..4626b88b2d5a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -3,67 +3,6 @@ if BLOCK | |||
3 | 3 | ||
4 | menu "IO Schedulers" | 4 | menu "IO Schedulers" |
5 | 5 | ||
6 | config IOSCHED_NOOP | ||
7 | bool | ||
8 | default y | ||
9 | ---help--- | ||
10 | The no-op I/O scheduler is a minimal scheduler that does basic merging | ||
11 | and sorting. Its main uses include non-disk based block devices like | ||
12 | memory devices, and specialised software or hardware environments | ||
13 | that do their own scheduling and require only minimal assistance from | ||
14 | the kernel. | ||
15 | |||
16 | config IOSCHED_DEADLINE | ||
17 | tristate "Deadline I/O scheduler" | ||
18 | default y | ||
19 | ---help--- | ||
20 | The deadline I/O scheduler is simple and compact. It will provide | ||
21 | CSCAN service with FIFO expiration of requests, switching to | ||
22 | a new point in the service tree and doing a batch of IO from there | ||
23 | in case of expiry. | ||
24 | |||
25 | config IOSCHED_CFQ | ||
26 | tristate "CFQ I/O scheduler" | ||
27 | default y | ||
28 | ---help--- | ||
29 | The CFQ I/O scheduler tries to distribute bandwidth equally | ||
30 | among all processes in the system. It should provide a fair | ||
31 | and low latency working environment, suitable for both desktop | ||
32 | and server systems. | ||
33 | |||
34 | This is the default I/O scheduler. | ||
35 | |||
36 | config CFQ_GROUP_IOSCHED | ||
37 | bool "CFQ Group Scheduling support" | ||
38 | depends on IOSCHED_CFQ && BLK_CGROUP | ||
39 | ---help--- | ||
40 | Enable group IO scheduling in CFQ. | ||
41 | |||
42 | choice | ||
43 | |||
44 | prompt "Default I/O scheduler" | ||
45 | default DEFAULT_CFQ | ||
46 | help | ||
47 | Select the I/O scheduler which will be used by default for all | ||
48 | block devices. | ||
49 | |||
50 | config DEFAULT_DEADLINE | ||
51 | bool "Deadline" if IOSCHED_DEADLINE=y | ||
52 | |||
53 | config DEFAULT_CFQ | ||
54 | bool "CFQ" if IOSCHED_CFQ=y | ||
55 | |||
56 | config DEFAULT_NOOP | ||
57 | bool "No-op" | ||
58 | |||
59 | endchoice | ||
60 | |||
61 | config DEFAULT_IOSCHED | ||
62 | string | ||
63 | default "deadline" if DEFAULT_DEADLINE | ||
64 | default "cfq" if DEFAULT_CFQ | ||
65 | default "noop" if DEFAULT_NOOP | ||
66 | |||
67 | config MQ_IOSCHED_DEADLINE | 6 | config MQ_IOSCHED_DEADLINE |
68 | tristate "MQ deadline I/O scheduler" | 7 | tristate "MQ deadline I/O scheduler" |
69 | default y | 8 | default y |
diff --git a/block/Makefile b/block/Makefile
index 27eac600474f..eee1b4ceecf9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,7 +3,7 @@ | |||
3 | # Makefile for the kernel block layer | 3 | # Makefile for the kernel block layer |
4 | # | 4 | # |
5 | 5 | ||
6 | obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | 6 | obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ |
7 | blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ | 7 | blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ |
8 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ | 8 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ |
9 | blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ | 9 | blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ |
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o | |||
18 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o | 18 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
19 | obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o | 19 | obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o |
20 | obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o | 20 | obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o |
21 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o | ||
22 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o | ||
23 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o | ||
24 | obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o | 21 | obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o |
25 | obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o | 22 | obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o |
26 | bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o | 23 | bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o |
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fe5952d117d..c6113af31960 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -334,7 +334,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) | |||
334 | 334 | ||
335 | parent = bfqg_parent(bfqg); | 335 | parent = bfqg_parent(bfqg); |
336 | 336 | ||
337 | lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); | 337 | lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); |
338 | 338 | ||
339 | if (unlikely(!parent)) | 339 | if (unlikely(!parent)) |
340 | return; | 340 | return; |
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) | |||
642 | uint64_t serial_nr; | 642 | uint64_t serial_nr; |
643 | 643 | ||
644 | rcu_read_lock(); | 644 | rcu_read_lock(); |
645 | serial_nr = bio_blkcg(bio)->css.serial_nr; | 645 | serial_nr = __bio_blkcg(bio)->css.serial_nr; |
646 | 646 | ||
647 | /* | 647 | /* |
648 | * Check whether blkcg has changed. The condition may trigger | 648 | * Check whether blkcg has changed. The condition may trigger |
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) | |||
651 | if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) | 651 | if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) |
652 | goto out; | 652 | goto out; |
653 | 653 | ||
654 | bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); | 654 | bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); |
655 | /* | 655 | /* |
656 | * Update blkg_path for bfq_log_* functions. We cache this | 656 | * Update blkg_path for bfq_log_* functions. We cache this |
657 | * path, and update it here, for the following | 657 | * path, and update it here, for the following |
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3a27d31fcda6..cd307767a134 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -399,9 +399,9 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, | |||
399 | unsigned long flags; | 399 | unsigned long flags; |
400 | struct bfq_io_cq *icq; | 400 | struct bfq_io_cq *icq; |
401 | 401 | ||
402 | spin_lock_irqsave(q->queue_lock, flags); | 402 | spin_lock_irqsave(&q->queue_lock, flags); |
403 | icq = icq_to_bic(ioc_lookup_icq(ioc, q)); | 403 | icq = icq_to_bic(ioc_lookup_icq(ioc, q)); |
404 | spin_unlock_irqrestore(q->queue_lock, flags); | 404 | spin_unlock_irqrestore(&q->queue_lock, flags); |
405 | 405 | ||
406 | return icq; | 406 | return icq; |
407 | } | 407 | } |
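
For illustration (not part of this commit): the recurring s/q->queue_lock/&q->queue_lock/ change in these hunks assumes the queue lock is now a spinlock embedded in struct request_queue rather than a pointer to an external lock. A minimal generic sketch of the two layouts, with invented names and pthread mutexes standing in for kernel spinlocks:

    #include <pthread.h>

    struct queue_old {
        pthread_mutex_t *lock;              /* indirection: the lock lives elsewhere */
    };

    struct queue_new {
        pthread_mutex_t lock;               /* lock embedded in the queue itself */
    };

    static void lock_old(struct queue_old *q) { pthread_mutex_lock(q->lock); }
    static void lock_new(struct queue_new *q) { pthread_mutex_lock(&q->lock); }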
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) | |||
638 | bfqd->queue_weights_tree.rb_node->rb_right) | 638 | bfqd->queue_weights_tree.rb_node->rb_right) |
639 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | 639 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
640 | ) || | 640 | ) || |
641 | (bfqd->num_active_groups > 0 | 641 | (bfqd->num_groups_with_pending_reqs > 0 |
642 | #endif | 642 | #endif |
643 | ); | 643 | ); |
644 | } | 644 | } |
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, | |||
802 | */ | 802 | */ |
803 | break; | 803 | break; |
804 | } | 804 | } |
805 | bfqd->num_active_groups--; | 805 | |
806 | /* | ||
807 | * The decrement of num_groups_with_pending_reqs is | ||
808 | * not performed immediately upon the deactivation of | ||
809 | * entity, but it is delayed to when it also happens | ||
810 | * that the first leaf descendant bfqq of entity gets | ||
811 | * all its pending requests completed. The following | ||
812 | * instructions perform this delayed decrement, if | ||
813 | * needed. See the comments on | ||
814 | * num_groups_with_pending_reqs for details. | ||
815 | */ | ||
816 | if (entity->in_groups_with_pending_reqs) { | ||
817 | entity->in_groups_with_pending_reqs = false; | ||
818 | bfqd->num_groups_with_pending_reqs--; | ||
819 | } | ||
806 | } | 820 | } |
807 | } | 821 | } |
808 | 822 | ||
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) | |||
3529 | * fact, if there are active groups, then, for condition (i) | 3543 | * fact, if there are active groups, then, for condition (i) |
3530 | * to become false, it is enough that an active group contains | 3544 | * to become false, it is enough that an active group contains |
3531 | * more active processes or sub-groups than some other active | 3545 | * more active processes or sub-groups than some other active |
3532 | * group. We address this issue with the following bi-modal | 3546 | * group. More precisely, for condition (i) to hold because of |
3533 | * behavior, implemented in the function | 3547 | * such a group, it is not even necessary that the group is |
3548 | * (still) active: it is sufficient that, even if the group | ||
3549 | * has become inactive, some of its descendant processes still | ||
3550 | * have some request already dispatched but still waiting for | ||
3551 | * completion. In fact, requests have still to be guaranteed | ||
3552 | * their share of the throughput even after being | ||
3553 | * dispatched. In this respect, it is easy to show that, if a | ||
3554 | * group frequently becomes inactive while still having | ||
3555 | * in-flight requests, and if, when this happens, the group is | ||
3556 | * not considered in the calculation of whether the scenario | ||
3557 | * is asymmetric, then the group may fail to be guaranteed its | ||
3558 | * fair share of the throughput (basically because idling may | ||
3559 | * not be performed for the descendant processes of the group, | ||
3560 | * but it had to be). We address this issue with the | ||
3561 | * following bi-modal behavior, implemented in the function | ||
3534 | * bfq_symmetric_scenario(). | 3562 | * bfq_symmetric_scenario(). |
3535 | * | 3563 | * |
3536 | * If there are active groups, then the scenario is tagged as | 3564 | * If there are groups with requests waiting for completion |
3565 | * (as commented above, some of these groups may even be | ||
3566 | * already inactive), then the scenario is tagged as | ||
3537 | * asymmetric, conservatively, without checking any of the | 3567 | * asymmetric, conservatively, without checking any of the |
3538 | * conditions (i) and (ii). So the device is idled for bfqq. | 3568 | * conditions (i) and (ii). So the device is idled for bfqq. |
3539 | * This behavior matches also the fact that groups are created | 3569 | * This behavior matches also the fact that groups are created |
3540 | * exactly if controlling I/O (to preserve bandwidth and | 3570 | * exactly if controlling I/O is a primary concern (to |
3541 | * latency guarantees) is a primary concern. | 3571 | * preserve bandwidth and latency guarantees). |
3542 | * | 3572 | * |
3543 | * On the opposite end, if there are no active groups, then | 3573 | * On the opposite end, if there are no groups with requests |
3544 | * only condition (i) is actually controlled, i.e., provided | 3574 | * waiting for completion, then only condition (i) is actually |
3545 | * that condition (i) holds, idling is not performed, | 3575 | * controlled, i.e., provided that condition (i) holds, idling |
3546 | * regardless of whether condition (ii) holds. In other words, | 3576 | * is not performed, regardless of whether condition (ii) |
3547 | * only if condition (i) does not hold, then idling is | 3577 | * holds. In other words, only if condition (i) does not hold, |
3548 | * allowed, and the device tends to be prevented from queueing | 3578 | * then idling is allowed, and the device tends to be |
3549 | * many requests, possibly of several processes. Since there | 3579 | * prevented from queueing many requests, possibly of several |
3550 | * are no active groups, then, to control condition (i) it is | 3580 | * processes. Since there are no groups with requests waiting |
3551 | * enough to check whether all active queues have the same | 3581 | * for completion, then, to control condition (i) it is enough |
3552 | * weight. | 3582 | * to check just whether all the queues with requests waiting |
3583 | * for completion also have the same weight. | ||
3553 | * | 3584 | * |
3554 | * Not checking condition (ii) evidently exposes bfqq to the | 3585 | * Not checking condition (ii) evidently exposes bfqq to the |
3555 | * risk of getting less throughput than its fair share. | 3586 | * risk of getting less throughput than its fair share. |
@@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) | |||
3607 | * bfqq is weight-raised is checked explicitly here. More | 3638 | * bfqq is weight-raised is checked explicitly here. More |
3608 | * precisely, the compound condition below takes into account | 3639 | * precisely, the compound condition below takes into account |
3609 | * also the fact that, even if bfqq is being weight-raised, | 3640 | * also the fact that, even if bfqq is being weight-raised, |
3610 | * the scenario is still symmetric if all active queues happen | 3641 | * the scenario is still symmetric if all queues with requests |
3611 | * to be weight-raised. Actually, we should be even more | 3642 | * waiting for completion happen to be |
3612 | * precise here, and differentiate between interactive weight | 3643 | * weight-raised. Actually, we should be even more precise |
3613 | * raising and soft real-time weight raising. | 3644 | * here, and differentiate between interactive weight raising |
3645 | * and soft real-time weight raising. | ||
3614 | * | 3646 | * |
3615 | * As a side note, it is worth considering that the above | 3647 | * As a side note, it is worth considering that the above |
3616 | * device-idling countermeasures may however fail in the | 3648 | * device-idling countermeasures may however fail in the |
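
The reworded comment above boils down to: if any group still has requests in flight the scenario is conservatively treated as asymmetric, otherwise symmetry reduces to all queues with pending requests sharing one weight. A self-contained sketch of that decision (illustrative only; names, types and the fixed-size array are invented and this is not the actual bfq_symmetric_scenario()):

    /* Illustrative sketch, not the kernel implementation. */
    struct sched_state {
        unsigned int num_groups_with_pending_reqs; /* groups with reqs in flight */
        unsigned int num_queues_with_pending_reqs; /* at most 16 in this toy */
        unsigned int weights[16];                  /* their weights */
    };

    static int scenario_is_symmetric(const struct sched_state *s)
    {
        unsigned int i;

        /* Any group with requests waiting for completion: assume asymmetric. */
        if (s->num_groups_with_pending_reqs > 0)
            return 0;

        /* Otherwise symmetric iff all queues with pending requests share one weight. */
        for (i = 1; i < s->num_queues_with_pending_reqs; i++)
            if (s->weights[i] != s->weights[0])
                return 0;
        return 1;
    }

Device idling for a queue is then typically needed when the scenario is not symmetric, together with the other conditions discussed in the comment above.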
@@ -4034,7 +4066,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, | |||
4034 | * In addition, the following queue lock guarantees that | 4066 | * In addition, the following queue lock guarantees that |
4035 | * bfqq_group(bfqq) exists as well. | 4067 | * bfqq_group(bfqq) exists as well. |
4036 | */ | 4068 | */ |
4037 | spin_lock_irq(q->queue_lock); | 4069 | spin_lock_irq(&q->queue_lock); |
4038 | if (idle_timer_disabled) | 4070 | if (idle_timer_disabled) |
4039 | /* | 4071 | /* |
4040 | * Since the idle timer has been disabled, | 4072 | * Since the idle timer has been disabled, |
@@ -4053,7 +4085,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, | |||
4053 | bfqg_stats_set_start_empty_time(bfqg); | 4085 | bfqg_stats_set_start_empty_time(bfqg); |
4054 | bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); | 4086 | bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); |
4055 | } | 4087 | } |
4056 | spin_unlock_irq(q->queue_lock); | 4088 | spin_unlock_irq(&q->queue_lock); |
4057 | } | 4089 | } |
4058 | #else | 4090 | #else |
4059 | static inline void bfq_update_dispatch_stats(struct request_queue *q, | 4091 | static inline void bfq_update_dispatch_stats(struct request_queue *q, |
@@ -4384,7 +4416,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, | |||
4384 | 4416 | ||
4385 | rcu_read_lock(); | 4417 | rcu_read_lock(); |
4386 | 4418 | ||
4387 | bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); | 4419 | bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); |
4388 | if (!bfqg) { | 4420 | if (!bfqg) { |
4389 | bfqq = &bfqd->oom_bfqq; | 4421 | bfqq = &bfqd->oom_bfqq; |
4390 | goto out; | 4422 | goto out; |
@@ -4637,11 +4669,11 @@ static void bfq_update_insert_stats(struct request_queue *q, | |||
4637 | * In addition, the following queue lock guarantees that | 4669 | * In addition, the following queue lock guarantees that |
4638 | * bfqq_group(bfqq) exists as well. | 4670 | * bfqq_group(bfqq) exists as well. |
4639 | */ | 4671 | */ |
4640 | spin_lock_irq(q->queue_lock); | 4672 | spin_lock_irq(&q->queue_lock); |
4641 | bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); | 4673 | bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); |
4642 | if (idle_timer_disabled) | 4674 | if (idle_timer_disabled) |
4643 | bfqg_stats_update_idle_time(bfqq_group(bfqq)); | 4675 | bfqg_stats_update_idle_time(bfqq_group(bfqq)); |
4644 | spin_unlock_irq(q->queue_lock); | 4676 | spin_unlock_irq(&q->queue_lock); |
4645 | } | 4677 | } |
4646 | #else | 4678 | #else |
4647 | static inline void bfq_update_insert_stats(struct request_queue *q, | 4679 | static inline void bfq_update_insert_stats(struct request_queue *q, |
@@ -5382,9 +5414,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) | |||
5382 | } | 5414 | } |
5383 | eq->elevator_data = bfqd; | 5415 | eq->elevator_data = bfqd; |
5384 | 5416 | ||
5385 | spin_lock_irq(q->queue_lock); | 5417 | spin_lock_irq(&q->queue_lock); |
5386 | q->elevator = eq; | 5418 | q->elevator = eq; |
5387 | spin_unlock_irq(q->queue_lock); | 5419 | spin_unlock_irq(&q->queue_lock); |
5388 | 5420 | ||
5389 | /* | 5421 | /* |
5390 | * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. | 5422 | * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. |
@@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) | |||
5417 | bfqd->idle_slice_timer.function = bfq_idle_slice_timer; | 5449 | bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
5418 | 5450 | ||
5419 | bfqd->queue_weights_tree = RB_ROOT; | 5451 | bfqd->queue_weights_tree = RB_ROOT; |
5420 | bfqd->num_active_groups = 0; | 5452 | bfqd->num_groups_with_pending_reqs = 0; |
5421 | 5453 | ||
5422 | INIT_LIST_HEAD(&bfqd->active_list); | 5454 | INIT_LIST_HEAD(&bfqd->active_list); |
5423 | INIT_LIST_HEAD(&bfqd->idle_list); | 5455 | INIT_LIST_HEAD(&bfqd->idle_list); |
@@ -5724,7 +5756,7 @@ static struct elv_fs_entry bfq_attrs[] = { | |||
5724 | }; | 5756 | }; |
5725 | 5757 | ||
5726 | static struct elevator_type iosched_bfq_mq = { | 5758 | static struct elevator_type iosched_bfq_mq = { |
5727 | .ops.mq = { | 5759 | .ops = { |
5728 | .limit_depth = bfq_limit_depth, | 5760 | .limit_depth = bfq_limit_depth, |
5729 | .prepare_request = bfq_prepare_request, | 5761 | .prepare_request = bfq_prepare_request, |
5730 | .requeue_request = bfq_finish_requeue_request, | 5762 | .requeue_request = bfq_finish_requeue_request, |
@@ -5745,7 +5777,6 @@ static struct elevator_type iosched_bfq_mq = { | |||
5745 | .exit_sched = bfq_exit_queue, | 5777 | .exit_sched = bfq_exit_queue, |
5746 | }, | 5778 | }, |
5747 | 5779 | ||
5748 | .uses_mq = true, | ||
5749 | .icq_size = sizeof(struct bfq_io_cq), | 5780 | .icq_size = sizeof(struct bfq_io_cq), |
5750 | .icq_align = __alignof__(struct bfq_io_cq), | 5781 | .icq_align = __alignof__(struct bfq_io_cq), |
5751 | .elevator_attrs = bfq_attrs, | 5782 | .elevator_attrs = bfq_attrs, |
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 77651d817ecd..0b02bf302de0 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -196,6 +196,9 @@ struct bfq_entity { | |||
196 | 196 | ||
197 | /* flag, set to request a weight, ioprio or ioprio_class change */ | 197 | /* flag, set to request a weight, ioprio or ioprio_class change */ |
198 | int prio_changed; | 198 | int prio_changed; |
199 | |||
200 | /* flag, set if the entity is counted in groups_with_pending_reqs */ | ||
201 | bool in_groups_with_pending_reqs; | ||
199 | }; | 202 | }; |
200 | 203 | ||
201 | struct bfq_group; | 204 | struct bfq_group; |
@@ -448,10 +451,54 @@ struct bfq_data { | |||
448 | * bfq_weights_tree_[add|remove] for further details). | 451 | * bfq_weights_tree_[add|remove] for further details). |
449 | */ | 452 | */ |
450 | struct rb_root queue_weights_tree; | 453 | struct rb_root queue_weights_tree; |
454 | |||
451 | /* | 455 | /* |
452 | * number of groups with requests still waiting for completion | 456 | * Number of groups with at least one descendant process that |
457 | * has at least one request waiting for completion. Note that | ||
458 | * this accounts for also requests already dispatched, but not | ||
459 | * yet completed. Therefore this number of groups may differ | ||
460 | * (be larger) than the number of active groups, as a group is | ||
461 | * considered active only if its corresponding entity has | ||
462 | * descendant queues with at least one request queued. This | ||
463 | * number is used to decide whether a scenario is symmetric. | ||
464 | * For a detailed explanation see comments on the computation | ||
465 | * of the variable asymmetric_scenario in the function | ||
466 | * bfq_better_to_idle(). | ||
467 | * | ||
468 | * However, it is hard to compute this number exactly, for | ||
469 | * groups with multiple descendant processes. Consider a group | ||
470 | * that is inactive, i.e., that has no descendant process with | ||
471 | * pending I/O inside BFQ queues. Then suppose that | ||
472 | * num_groups_with_pending_reqs is still accounting for this | ||
473 | * group, because the group has descendant processes with some | ||
474 | * I/O request still in flight. num_groups_with_pending_reqs | ||
475 | * should be decremented when the in-flight request of the | ||
476 | * last descendant process is finally completed (assuming that | ||
477 | * nothing else has changed for the group in the meantime, in | ||
478 | * terms of composition of the group and active/inactive state of child | ||
479 | * groups and processes). To accomplish this, an additional | ||
480 | * pending-request counter must be added to entities, and must | ||
481 | * be updated correctly. To avoid this additional field and operations, | ||
482 | * we resort to the following tradeoff between simplicity and | ||
483 | * accuracy: for an inactive group that is still counted in | ||
484 | * num_groups_with_pending_reqs, we decrement | ||
485 | * num_groups_with_pending_reqs when the first descendant | ||
486 | * process of the group remains with no request waiting for | ||
487 | * completion. | ||
488 | * | ||
489 | * Even this simpler decrement strategy requires a little | ||
490 | * carefulness: to avoid multiple decrements, we flag a group, | ||
491 | * more precisely an entity representing a group, as still | ||
492 | * counted in num_groups_with_pending_reqs when it becomes | ||
493 | * inactive. Then, when the first descendant queue of the | ||
494 | * entity remains with no request waiting for completion, | ||
495 | * num_groups_with_pending_reqs is decremented, and this flag | ||
496 | * is reset. After this flag is reset for the entity, | ||
497 | * num_groups_with_pending_reqs won't be decremented any | ||
498 | * longer in case a new descendant queue of the entity remains | ||
499 | * with no request waiting for completion. | ||
453 | */ | 500 | */ |
454 | unsigned int num_active_groups; | 501 | unsigned int num_groups_with_pending_reqs; |
455 | 502 | ||
456 | /* | 503 | /* |
457 | * Number of bfq_queues containing requests (including the | 504 | * Number of bfq_queues containing requests (including the |
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4b0d5fb69160..63e0f12be7c9 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity, | |||
1012 | container_of(entity, struct bfq_group, entity); | 1012 | container_of(entity, struct bfq_group, entity); |
1013 | struct bfq_data *bfqd = bfqg->bfqd; | 1013 | struct bfq_data *bfqd = bfqg->bfqd; |
1014 | 1014 | ||
1015 | bfqd->num_active_groups++; | 1015 | if (!entity->in_groups_with_pending_reqs) { |
1016 | entity->in_groups_with_pending_reqs = true; | ||
1017 | bfqd->num_groups_with_pending_reqs++; | ||
1018 | } | ||
1016 | } | 1019 | } |
1017 | #endif | 1020 | #endif |
1018 | 1021 | ||
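
The flag-guarded increment added in __bfq_activate_entity() above pairs with the delayed decrement added in bfq_weights_tree_remove() and the long comment in bfq-iosched.h: a group is counted once when it gets pending requests and uncounted once when its last pending request completes. A self-contained sketch of that one-shot counted/uncounted pattern (simplified types, not the bfq structures):

    /* Illustrative sketch of the flag-guarded counter used by the commit. */
    struct entity {
        int in_groups_with_pending_reqs;        /* set while counted */
    };

    struct sched_data {
        unsigned int num_groups_with_pending_reqs;
    };

    static void group_gets_pending_reqs(struct sched_data *d, struct entity *e)
    {
        if (!e->in_groups_with_pending_reqs) {
            e->in_groups_with_pending_reqs = 1;
            d->num_groups_with_pending_reqs++;  /* counted at most once */
        }
    }

    /* Called when the first leaf descendant of the group remains with no
     * request waiting for completion, possibly long after deactivation. */
    static void group_loses_pending_reqs(struct sched_data *d, struct entity *e)
    {
        if (e->in_groups_with_pending_reqs) {
            e->in_groups_with_pending_reqs = 0;
            d->num_groups_with_pending_reqs--;  /* uncounted at most once */
        }
    }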
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 290af497997b..1b633a3526d4 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -390,7 +390,6 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) | |||
390 | bip->bip_iter.bi_sector += bytes_done >> 9; | 390 | bip->bip_iter.bi_sector += bytes_done >> 9; |
391 | bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); | 391 | bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); |
392 | } | 392 | } |
393 | EXPORT_SYMBOL(bio_integrity_advance); | ||
394 | 393 | ||
395 | /** | 394 | /** |
396 | * bio_integrity_trim - Trim integrity vector | 395 | * bio_integrity_trim - Trim integrity vector |
@@ -460,7 +459,6 @@ void bioset_integrity_free(struct bio_set *bs) | |||
460 | mempool_exit(&bs->bio_integrity_pool); | 459 | mempool_exit(&bs->bio_integrity_pool); |
461 | mempool_exit(&bs->bvec_integrity_pool); | 460 | mempool_exit(&bs->bvec_integrity_pool); |
462 | } | 461 | } |
463 | EXPORT_SYMBOL(bioset_integrity_free); | ||
464 | 462 | ||
465 | void __init bio_integrity_init(void) | 463 | void __init bio_integrity_init(void) |
466 | { | 464 | { |
diff --git a/block/bio.c b/block/bio.c
index d5368a445561..8281bfcbc265 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -244,7 +244,7 @@ fallback: | |||
244 | 244 | ||
245 | void bio_uninit(struct bio *bio) | 245 | void bio_uninit(struct bio *bio) |
246 | { | 246 | { |
247 | bio_disassociate_task(bio); | 247 | bio_disassociate_blkg(bio); |
248 | } | 248 | } |
249 | EXPORT_SYMBOL(bio_uninit); | 249 | EXPORT_SYMBOL(bio_uninit); |
250 | 250 | ||
@@ -571,14 +571,13 @@ void bio_put(struct bio *bio) | |||
571 | } | 571 | } |
572 | EXPORT_SYMBOL(bio_put); | 572 | EXPORT_SYMBOL(bio_put); |
573 | 573 | ||
574 | inline int bio_phys_segments(struct request_queue *q, struct bio *bio) | 574 | int bio_phys_segments(struct request_queue *q, struct bio *bio) |
575 | { | 575 | { |
576 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) | 576 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) |
577 | blk_recount_segments(q, bio); | 577 | blk_recount_segments(q, bio); |
578 | 578 | ||
579 | return bio->bi_phys_segments; | 579 | return bio->bi_phys_segments; |
580 | } | 580 | } |
581 | EXPORT_SYMBOL(bio_phys_segments); | ||
582 | 581 | ||
583 | /** | 582 | /** |
584 | * __bio_clone_fast - clone a bio that shares the original bio's biovec | 583 | * __bio_clone_fast - clone a bio that shares the original bio's biovec |
@@ -605,11 +604,13 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) | |||
605 | if (bio_flagged(bio_src, BIO_THROTTLED)) | 604 | if (bio_flagged(bio_src, BIO_THROTTLED)) |
606 | bio_set_flag(bio, BIO_THROTTLED); | 605 | bio_set_flag(bio, BIO_THROTTLED); |
607 | bio->bi_opf = bio_src->bi_opf; | 606 | bio->bi_opf = bio_src->bi_opf; |
607 | bio->bi_ioprio = bio_src->bi_ioprio; | ||
608 | bio->bi_write_hint = bio_src->bi_write_hint; | 608 | bio->bi_write_hint = bio_src->bi_write_hint; |
609 | bio->bi_iter = bio_src->bi_iter; | 609 | bio->bi_iter = bio_src->bi_iter; |
610 | bio->bi_io_vec = bio_src->bi_io_vec; | 610 | bio->bi_io_vec = bio_src->bi_io_vec; |
611 | 611 | ||
612 | bio_clone_blkcg_association(bio, bio_src); | 612 | bio_clone_blkg_association(bio, bio_src); |
613 | blkcg_bio_issue_init(bio); | ||
613 | } | 614 | } |
614 | EXPORT_SYMBOL(__bio_clone_fast); | 615 | EXPORT_SYMBOL(__bio_clone_fast); |
615 | 616 | ||
@@ -900,7 +901,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) | |||
900 | 901 | ||
901 | return 0; | 902 | return 0; |
902 | } | 903 | } |
903 | EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); | ||
904 | 904 | ||
905 | static void submit_bio_wait_endio(struct bio *bio) | 905 | static void submit_bio_wait_endio(struct bio *bio) |
906 | { | 906 | { |
@@ -1260,6 +1260,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q, | |||
1260 | if (ret) | 1260 | if (ret) |
1261 | goto cleanup; | 1261 | goto cleanup; |
1262 | } else { | 1262 | } else { |
1263 | if (bmd->is_our_pages) | ||
1264 | zero_fill_bio(bio); | ||
1263 | iov_iter_advance(iter, bio->bi_iter.bi_size); | 1265 | iov_iter_advance(iter, bio->bi_iter.bi_size); |
1264 | } | 1266 | } |
1265 | 1267 | ||
@@ -1589,7 +1591,6 @@ void bio_set_pages_dirty(struct bio *bio) | |||
1589 | set_page_dirty_lock(bvec->bv_page); | 1591 | set_page_dirty_lock(bvec->bv_page); |
1590 | } | 1592 | } |
1591 | } | 1593 | } |
1592 | EXPORT_SYMBOL_GPL(bio_set_pages_dirty); | ||
1593 | 1594 | ||
1594 | static void bio_release_pages(struct bio *bio) | 1595 | static void bio_release_pages(struct bio *bio) |
1595 | { | 1596 | { |
@@ -1659,17 +1660,33 @@ defer: | |||
1659 | spin_unlock_irqrestore(&bio_dirty_lock, flags); | 1660 | spin_unlock_irqrestore(&bio_dirty_lock, flags); |
1660 | schedule_work(&bio_dirty_work); | 1661 | schedule_work(&bio_dirty_work); |
1661 | } | 1662 | } |
1662 | EXPORT_SYMBOL_GPL(bio_check_pages_dirty); | 1663 | |
1664 | void update_io_ticks(struct hd_struct *part, unsigned long now) | ||
1665 | { | ||
1666 | unsigned long stamp; | ||
1667 | again: | ||
1668 | stamp = READ_ONCE(part->stamp); | ||
1669 | if (unlikely(stamp != now)) { | ||
1670 | if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { | ||
1671 | __part_stat_add(part, io_ticks, 1); | ||
1672 | } | ||
1673 | } | ||
1674 | if (part->partno) { | ||
1675 | part = &part_to_disk(part)->part0; | ||
1676 | goto again; | ||
1677 | } | ||
1678 | } | ||
1663 | 1679 | ||
1664 | void generic_start_io_acct(struct request_queue *q, int op, | 1680 | void generic_start_io_acct(struct request_queue *q, int op, |
1665 | unsigned long sectors, struct hd_struct *part) | 1681 | unsigned long sectors, struct hd_struct *part) |
1666 | { | 1682 | { |
1667 | const int sgrp = op_stat_group(op); | 1683 | const int sgrp = op_stat_group(op); |
1668 | int cpu = part_stat_lock(); | ||
1669 | 1684 | ||
1670 | part_round_stats(q, cpu, part); | 1685 | part_stat_lock(); |
1671 | part_stat_inc(cpu, part, ios[sgrp]); | 1686 | |
1672 | part_stat_add(cpu, part, sectors[sgrp], sectors); | 1687 | update_io_ticks(part, jiffies); |
1688 | part_stat_inc(part, ios[sgrp]); | ||
1689 | part_stat_add(part, sectors[sgrp], sectors); | ||
1673 | part_inc_in_flight(q, part, op_is_write(op)); | 1690 | part_inc_in_flight(q, part, op_is_write(op)); |
1674 | 1691 | ||
1675 | part_stat_unlock(); | 1692 | part_stat_unlock(); |
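
The update_io_ticks() helper added above bumps io_ticks at most once per jiffy by racing a cmpxchg on a per-partition timestamp instead of taking a lock. A rough userspace analogue of the same idea using C11 atomics (illustrative sketch only; the kernel version also walks up from a partition to part0):

    #include <stdatomic.h>

    struct part_stats {
        _Atomic unsigned long stamp;      /* last tick accounted */
        _Atomic unsigned long io_ticks;
    };

    static void update_io_ticks_sketch(struct part_stats *p, unsigned long now)
    {
        unsigned long stamp = atomic_load(&p->stamp);

        /* Only the one thread that wins the compare-and-swap for this tick
         * accounts it, so io_ticks grows by at most 1 per time unit. */
        if (stamp != now &&
            atomic_compare_exchange_strong(&p->stamp, &stamp, now))
            atomic_fetch_add(&p->io_ticks, 1);
    }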
@@ -1679,12 +1696,15 @@ EXPORT_SYMBOL(generic_start_io_acct); | |||
1679 | void generic_end_io_acct(struct request_queue *q, int req_op, | 1696 | void generic_end_io_acct(struct request_queue *q, int req_op, |
1680 | struct hd_struct *part, unsigned long start_time) | 1697 | struct hd_struct *part, unsigned long start_time) |
1681 | { | 1698 | { |
1682 | unsigned long duration = jiffies - start_time; | 1699 | unsigned long now = jiffies; |
1700 | unsigned long duration = now - start_time; | ||
1683 | const int sgrp = op_stat_group(req_op); | 1701 | const int sgrp = op_stat_group(req_op); |
1684 | int cpu = part_stat_lock(); | ||
1685 | 1702 | ||
1686 | part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); | 1703 | part_stat_lock(); |
1687 | part_round_stats(q, cpu, part); | 1704 | |
1705 | update_io_ticks(part, now); | ||
1706 | part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); | ||
1707 | part_stat_add(part, time_in_queue, duration); | ||
1688 | part_dec_in_flight(q, part, op_is_write(req_op)); | 1708 | part_dec_in_flight(q, part, op_is_write(req_op)); |
1689 | 1709 | ||
1690 | part_stat_unlock(); | 1710 | part_stat_unlock(); |
@@ -1954,102 +1974,133 @@ EXPORT_SYMBOL(bioset_init_from_src); | |||
1954 | 1974 | ||
1955 | #ifdef CONFIG_BLK_CGROUP | 1975 | #ifdef CONFIG_BLK_CGROUP |
1956 | 1976 | ||
1957 | #ifdef CONFIG_MEMCG | ||
1958 | /** | 1977 | /** |
1959 | * bio_associate_blkcg_from_page - associate a bio with the page's blkcg | 1978 | * bio_disassociate_blkg - puts back the blkg reference if associated |
1960 | * @bio: target bio | 1979 | * @bio: target bio |
1961 | * @page: the page to lookup the blkcg from | ||
1962 | * | 1980 | * |
1963 | * Associate @bio with the blkcg from @page's owning memcg. This works like | 1981 | * Helper to disassociate the blkg from @bio if a blkg is associated. |
1964 | * every other associate function wrt references. | ||
1965 | */ | 1982 | */ |
1966 | int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) | 1983 | void bio_disassociate_blkg(struct bio *bio) |
1967 | { | 1984 | { |
1968 | struct cgroup_subsys_state *blkcg_css; | 1985 | if (bio->bi_blkg) { |
1969 | 1986 | blkg_put(bio->bi_blkg); | |
1970 | if (unlikely(bio->bi_css)) | 1987 | bio->bi_blkg = NULL; |
1971 | return -EBUSY; | 1988 | } |
1972 | if (!page->mem_cgroup) | ||
1973 | return 0; | ||
1974 | blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, | ||
1975 | &io_cgrp_subsys); | ||
1976 | bio->bi_css = blkcg_css; | ||
1977 | return 0; | ||
1978 | } | 1989 | } |
1979 | #endif /* CONFIG_MEMCG */ | 1990 | EXPORT_SYMBOL_GPL(bio_disassociate_blkg); |
1980 | 1991 | ||
1981 | /** | 1992 | /** |
1982 | * bio_associate_blkcg - associate a bio with the specified blkcg | 1993 | * __bio_associate_blkg - associate a bio with the a blkg |
1983 | * @bio: target bio | 1994 | * @bio: target bio |
1984 | * @blkcg_css: css of the blkcg to associate | 1995 | * @blkg: the blkg to associate |
1985 | * | 1996 | * |
1986 | * Associate @bio with the blkcg specified by @blkcg_css. Block layer will | 1997 | * This tries to associate @bio with the specified @blkg. Association failure |
1987 | * treat @bio as if it were issued by a task which belongs to the blkcg. | 1998 | * is handled by walking up the blkg tree. Therefore, the blkg associated can |
1999 | * be anything between @blkg and the root_blkg. This situation only happens | ||
2000 | * when a cgroup is dying and then the remaining bios will spill to the closest | ||
2001 | * alive blkg. | ||
1988 | * | 2002 | * |
1989 | * This function takes an extra reference of @blkcg_css which will be put | 2003 | * A reference will be taken on the @blkg and will be released when @bio is |
1990 | * when @bio is released. The caller must own @bio and is responsible for | 2004 | * freed. |
1991 | * synchronizing calls to this function. | ||
1992 | */ | 2005 | */ |
1993 | int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) | 2006 | static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) |
1994 | { | 2007 | { |
1995 | if (unlikely(bio->bi_css)) | 2008 | bio_disassociate_blkg(bio); |
1996 | return -EBUSY; | 2009 | |
1997 | css_get(blkcg_css); | 2010 | bio->bi_blkg = blkg_tryget_closest(blkg); |
1998 | bio->bi_css = blkcg_css; | ||
1999 | return 0; | ||
2000 | } | 2011 | } |
2001 | EXPORT_SYMBOL_GPL(bio_associate_blkcg); | ||
2002 | 2012 | ||
2003 | /** | 2013 | /** |
2004 | * bio_associate_blkg - associate a bio with the specified blkg | 2014 | * bio_associate_blkg_from_css - associate a bio with a specified css |
2005 | * @bio: target bio | 2015 | * @bio: target bio |
2006 | * @blkg: the blkg to associate | 2016 | * @css: target css |
2007 | * | 2017 | * |
2008 | * Associate @bio with the blkg specified by @blkg. This is the queue specific | 2018 | * Associate @bio with the blkg found by combining the css's blkg and the |
2009 | * blkcg information associated with the @bio, a reference will be taken on the | 2019 | * request_queue of the @bio. This falls back to the queue's root_blkg if |
2010 | * @blkg and will be freed when the bio is freed. | 2020 | * the association fails with the css. |
2011 | */ | 2021 | */ |
2012 | int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) | 2022 | void bio_associate_blkg_from_css(struct bio *bio, |
2023 | struct cgroup_subsys_state *css) | ||
2013 | { | 2024 | { |
2014 | if (unlikely(bio->bi_blkg)) | 2025 | struct request_queue *q = bio->bi_disk->queue; |
2015 | return -EBUSY; | 2026 | struct blkcg_gq *blkg; |
2016 | if (!blkg_try_get(blkg)) | 2027 | |
2017 | return -ENODEV; | 2028 | rcu_read_lock(); |
2018 | bio->bi_blkg = blkg; | 2029 | |
2019 | return 0; | 2030 | if (!css || !css->parent) |
2031 | blkg = q->root_blkg; | ||
2032 | else | ||
2033 | blkg = blkg_lookup_create(css_to_blkcg(css), q); | ||
2034 | |||
2035 | __bio_associate_blkg(bio, blkg); | ||
2036 | |||
2037 | rcu_read_unlock(); | ||
2020 | } | 2038 | } |
2039 | EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); | ||
2021 | 2040 | ||
2041 | #ifdef CONFIG_MEMCG | ||
2022 | /** | 2042 | /** |
2023 | * bio_disassociate_task - undo bio_associate_current() | 2043 | * bio_associate_blkg_from_page - associate a bio with the page's blkg |
2024 | * @bio: target bio | 2044 | * @bio: target bio |
2045 | * @page: the page to lookup the blkcg from | ||
2046 | * | ||
2047 | * Associate @bio with the blkg from @page's owning memcg and the respective | ||
2048 | * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's | ||
2049 | * root_blkg. | ||
2025 | */ | 2050 | */ |
2026 | void bio_disassociate_task(struct bio *bio) | 2051 | void bio_associate_blkg_from_page(struct bio *bio, struct page *page) |
2027 | { | 2052 | { |
2028 | if (bio->bi_ioc) { | 2053 | struct cgroup_subsys_state *css; |
2029 | put_io_context(bio->bi_ioc); | 2054 | |
2030 | bio->bi_ioc = NULL; | 2055 | if (!page->mem_cgroup) |
2031 | } | 2056 | return; |
2032 | if (bio->bi_css) { | 2057 | |
2033 | css_put(bio->bi_css); | 2058 | rcu_read_lock(); |
2034 | bio->bi_css = NULL; | 2059 | |
2035 | } | 2060 | css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); |
2036 | if (bio->bi_blkg) { | 2061 | bio_associate_blkg_from_css(bio, css); |
2037 | blkg_put(bio->bi_blkg); | 2062 | |
2038 | bio->bi_blkg = NULL; | 2063 | rcu_read_unlock(); |
2039 | } | 2064 | } |
2065 | #endif /* CONFIG_MEMCG */ | ||
2066 | |||
2067 | /** | ||
2068 | * bio_associate_blkg - associate a bio with a blkg | ||
2069 | * @bio: target bio | ||
2070 | * | ||
2071 | * Associate @bio with the blkg found from the bio's css and request_queue. | ||
2072 | * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is | ||
2073 | * already associated, the css is reused and association redone as the | ||
2074 | * request_queue may have changed. | ||
2075 | */ | ||
2076 | void bio_associate_blkg(struct bio *bio) | ||
2077 | { | ||
2078 | struct cgroup_subsys_state *css; | ||
2079 | |||
2080 | rcu_read_lock(); | ||
2081 | |||
2082 | if (bio->bi_blkg) | ||
2083 | css = &bio_blkcg(bio)->css; | ||
2084 | else | ||
2085 | css = blkcg_css(); | ||
2086 | |||
2087 | bio_associate_blkg_from_css(bio, css); | ||
2088 | |||
2089 | rcu_read_unlock(); | ||
2040 | } | 2090 | } |
2091 | EXPORT_SYMBOL_GPL(bio_associate_blkg); | ||
2041 | 2092 | ||
2042 | /** | 2093 | /** |
2043 | * bio_clone_blkcg_association - clone blkcg association from src to dst bio | 2094 | * bio_clone_blkg_association - clone blkg association from src to dst bio |
2044 | * @dst: destination bio | 2095 | * @dst: destination bio |
2045 | * @src: source bio | 2096 | * @src: source bio |
2046 | */ | 2097 | */ |
2047 | void bio_clone_blkcg_association(struct bio *dst, struct bio *src) | 2098 | void bio_clone_blkg_association(struct bio *dst, struct bio *src) |
2048 | { | 2099 | { |
2049 | if (src->bi_css) | 2100 | if (src->bi_blkg) |
2050 | WARN_ON(bio_associate_blkcg(dst, src->bi_css)); | 2101 | __bio_associate_blkg(dst, src->bi_blkg); |
2051 | } | 2102 | } |
2052 | EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); | 2103 | EXPORT_SYMBOL_GPL(bio_clone_blkg_association); |
2053 | #endif /* CONFIG_BLK_CGROUP */ | 2104 | #endif /* CONFIG_BLK_CGROUP */ |
2054 | 2105 | ||
2055 | static void __init biovec_init_slabs(void) | 2106 | static void __init biovec_init_slabs(void) |
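
The new association helpers above lean on blkg_tryget_closest(): when the intended blkg is dying, the bio is associated with the nearest ancestor that can still be pinned, ultimately the root. A generic, non-atomic sketch of that "closest live ancestor" walk (names and the refcnt field are invented; this is not the blkg API):

    #include <stddef.h>

    struct node {
        struct node *parent;
        int refcnt;                 /* 0 means dying/dead in this sketch */
    };

    /* Try to pin n; on failure walk toward the root and pin the closest
     * ancestor that is still alive. Returns NULL only if nothing is alive. */
    static struct node *tryget_closest(struct node *n)
    {
        while (n) {
            if (n->refcnt > 0) {    /* stand-in for a tryget that can fail */
                n->refcnt++;
                return n;
            }
            n = n->parent;
        }
        return NULL;
    }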
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c630e02836a8..c8cc1cbb6370 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -76,14 +76,42 @@ static void blkg_free(struct blkcg_gq *blkg) | |||
76 | if (blkg->pd[i]) | 76 | if (blkg->pd[i]) |
77 | blkcg_policy[i]->pd_free_fn(blkg->pd[i]); | 77 | blkcg_policy[i]->pd_free_fn(blkg->pd[i]); |
78 | 78 | ||
79 | if (blkg->blkcg != &blkcg_root) | ||
80 | blk_exit_rl(blkg->q, &blkg->rl); | ||
81 | |||
82 | blkg_rwstat_exit(&blkg->stat_ios); | 79 | blkg_rwstat_exit(&blkg->stat_ios); |
83 | blkg_rwstat_exit(&blkg->stat_bytes); | 80 | blkg_rwstat_exit(&blkg->stat_bytes); |
84 | kfree(blkg); | 81 | kfree(blkg); |
85 | } | 82 | } |
86 | 83 | ||
84 | static void __blkg_release(struct rcu_head *rcu) | ||
85 | { | ||
86 | struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); | ||
87 | |||
88 | percpu_ref_exit(&blkg->refcnt); | ||
89 | |||
90 | /* release the blkcg and parent blkg refs this blkg has been holding */ | ||
91 | css_put(&blkg->blkcg->css); | ||
92 | if (blkg->parent) | ||
93 | blkg_put(blkg->parent); | ||
94 | |||
95 | wb_congested_put(blkg->wb_congested); | ||
96 | |||
97 | blkg_free(blkg); | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * A group is RCU protected, but having an rcu lock does not mean that one | ||
102 | * can access all the fields of blkg and assume these are valid. For | ||
103 | * example, don't try to follow throtl_data and request queue links. | ||
104 | * | ||
105 | * Having a reference to blkg under an rcu allows accesses to only values | ||
106 | * local to groups like group stats and group rate limits. | ||
107 | */ | ||
108 | static void blkg_release(struct percpu_ref *ref) | ||
109 | { | ||
110 | struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); | ||
111 | |||
112 | call_rcu(&blkg->rcu_head, __blkg_release); | ||
113 | } | ||
114 | |||
87 | /** | 115 | /** |
88 | * blkg_alloc - allocate a blkg | 116 | * blkg_alloc - allocate a blkg |
89 | * @blkcg: block cgroup the new blkg is associated with | 117 | * @blkcg: block cgroup the new blkg is associated with |
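
In the hunk above, blkg lifetime moves to a percpu reference count whose release callback defers the actual free through call_rcu(), and the final free drops the references the blkg held on its blkcg css and parent. A rough userspace analogue of that teardown ordering (illustrative sketch only; percpu_ref, css refs and call_rcu have no direct equivalent here, and the names are invented):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct group {
        _Atomic int refcnt;
        struct group *parent;       /* reference held since creation */
    };

    static void group_put(struct group *g)
    {
        if (atomic_fetch_sub(&g->refcnt, 1) != 1)
            return;
        /* Last reference dropped: in the kernel this point is blkg_release(),
         * and the body below runs later from the RCU callback __blkg_release()
         * so lockless readers never see a freed blkg. */
        if (g->parent)
            group_put(g->parent);   /* drop the ref taken at creation time */
        free(g);
    }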
@@ -110,14 +138,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
110 | blkg->q = q; | 138 | blkg->q = q; |
111 | INIT_LIST_HEAD(&blkg->q_node); | 139 | INIT_LIST_HEAD(&blkg->q_node); |
112 | blkg->blkcg = blkcg; | 140 | blkg->blkcg = blkcg; |
113 | atomic_set(&blkg->refcnt, 1); | ||
114 | |||
115 | /* root blkg uses @q->root_rl, init rl only for !root blkgs */ | ||
116 | if (blkcg != &blkcg_root) { | ||
117 | if (blk_init_rl(&blkg->rl, q, gfp_mask)) | ||
118 | goto err_free; | ||
119 | blkg->rl.blkg = blkg; | ||
120 | } | ||
121 | 141 | ||
122 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 142 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
123 | struct blkcg_policy *pol = blkcg_policy[i]; | 143 | struct blkcg_policy *pol = blkcg_policy[i]; |
@@ -157,7 +177,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, | |||
157 | blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); | 177 | blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); |
158 | if (blkg && blkg->q == q) { | 178 | if (blkg && blkg->q == q) { |
159 | if (update_hint) { | 179 | if (update_hint) { |
160 | lockdep_assert_held(q->queue_lock); | 180 | lockdep_assert_held(&q->queue_lock); |
161 | rcu_assign_pointer(blkcg->blkg_hint, blkg); | 181 | rcu_assign_pointer(blkcg->blkg_hint, blkg); |
162 | } | 182 | } |
163 | return blkg; | 183 | return blkg; |
@@ -180,7 +200,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
180 | int i, ret; | 200 | int i, ret; |
181 | 201 | ||
182 | WARN_ON_ONCE(!rcu_read_lock_held()); | 202 | WARN_ON_ONCE(!rcu_read_lock_held()); |
183 | lockdep_assert_held(q->queue_lock); | 203 | lockdep_assert_held(&q->queue_lock); |
204 | |||
205 | /* request_queue is dying, do not create/recreate a blkg */ | ||
206 | if (blk_queue_dying(q)) { | ||
207 | ret = -ENODEV; | ||
208 | goto err_free_blkg; | ||
209 | } | ||
184 | 210 | ||
185 | /* blkg holds a reference to blkcg */ | 211 | /* blkg holds a reference to blkcg */ |
186 | if (!css_tryget_online(&blkcg->css)) { | 212 | if (!css_tryget_online(&blkcg->css)) { |
@@ -217,6 +243,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
217 | blkg_get(blkg->parent); | 243 | blkg_get(blkg->parent); |
218 | } | 244 | } |
219 | 245 | ||
246 | ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, | ||
247 | GFP_NOWAIT | __GFP_NOWARN); | ||
248 | if (ret) | ||
249 | goto err_cancel_ref; | ||
250 | |||
220 | /* invoke per-policy init */ | 251 | /* invoke per-policy init */ |
221 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 252 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
222 | struct blkcg_policy *pol = blkcg_policy[i]; | 253 | struct blkcg_policy *pol = blkcg_policy[i]; |
@@ -249,6 +280,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
249 | blkg_put(blkg); | 280 | blkg_put(blkg); |
250 | return ERR_PTR(ret); | 281 | return ERR_PTR(ret); |
251 | 282 | ||
283 | err_cancel_ref: | ||
284 | percpu_ref_exit(&blkg->refcnt); | ||
252 | err_put_congested: | 285 | err_put_congested: |
253 | wb_congested_put(wb_congested); | 286 | wb_congested_put(wb_congested); |
254 | err_put_css: | 287 | err_put_css: |
@@ -259,7 +292,7 @@ err_free_blkg: | |||
259 | } | 292 | } |
260 | 293 | ||
261 | /** | 294 | /** |
262 | * blkg_lookup_create - lookup blkg, try to create one if not there | 295 | * __blkg_lookup_create - lookup blkg, try to create one if not there |
263 | * @blkcg: blkcg of interest | 296 | * @blkcg: blkcg of interest |
264 | * @q: request_queue of interest | 297 | * @q: request_queue of interest |
265 | * | 298 | * |
@@ -268,24 +301,16 @@ err_free_blkg: | |||
268 | * that all non-root blkg's have access to the parent blkg. This function | 301 | * that all non-root blkg's have access to the parent blkg. This function |
269 | * should be called under RCU read lock and @q->queue_lock. | 302 | * should be called under RCU read lock and @q->queue_lock. |
270 | * | 303 | * |
271 | * Returns pointer to the looked up or created blkg on success, ERR_PTR() | 304 | * Returns the blkg or the closest blkg if blkg_create() fails as it walks |
272 | * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not | 305 | * down from root. |
273 | * dead and bypassing, returns ERR_PTR(-EBUSY). | ||
274 | */ | 306 | */ |
275 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | 307 | struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, |
276 | struct request_queue *q) | 308 | struct request_queue *q) |
277 | { | 309 | { |
278 | struct blkcg_gq *blkg; | 310 | struct blkcg_gq *blkg; |
279 | 311 | ||
280 | WARN_ON_ONCE(!rcu_read_lock_held()); | 312 | WARN_ON_ONCE(!rcu_read_lock_held()); |
281 | lockdep_assert_held(q->queue_lock); | 313 | lockdep_assert_held(&q->queue_lock); |
282 | |||
283 | /* | ||
284 | * This could be the first entry point of blkcg implementation and | ||
285 | * we shouldn't allow anything to go through for a bypassing queue. | ||
286 | */ | ||
287 | if (unlikely(blk_queue_bypass(q))) | ||
288 | return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); | ||
289 | 314 | ||
290 | blkg = __blkg_lookup(blkcg, q, true); | 315 | blkg = __blkg_lookup(blkcg, q, true); |
291 | if (blkg) | 316 | if (blkg) |
@@ -293,30 +318,64 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | |||
293 | 318 | ||
294 | /* | 319 | /* |
295 | * Create blkgs walking down from blkcg_root to @blkcg, so that all | 320 | * Create blkgs walking down from blkcg_root to @blkcg, so that all |
296 | * non-root blkgs have access to their parents. | 321 | * non-root blkgs have access to their parents. Returns the closest |
322 | * blkg to the intended blkg should blkg_create() fail. | ||
297 | */ | 323 | */ |
298 | while (true) { | 324 | while (true) { |
299 | struct blkcg *pos = blkcg; | 325 | struct blkcg *pos = blkcg; |
300 | struct blkcg *parent = blkcg_parent(blkcg); | 326 | struct blkcg *parent = blkcg_parent(blkcg); |
301 | 327 | struct blkcg_gq *ret_blkg = q->root_blkg; | |
302 | while (parent && !__blkg_lookup(parent, q, false)) { | 328 | |
329 | while (parent) { | ||
330 | blkg = __blkg_lookup(parent, q, false); | ||
331 | if (blkg) { | ||
332 | /* remember closest blkg */ | ||
333 | ret_blkg = blkg; | ||
334 | break; | ||
335 | } | ||
303 | pos = parent; | 336 | pos = parent; |
304 | parent = blkcg_parent(parent); | 337 | parent = blkcg_parent(parent); |
305 | } | 338 | } |
306 | 339 | ||
307 | blkg = blkg_create(pos, q, NULL); | 340 | blkg = blkg_create(pos, q, NULL); |
308 | if (pos == blkcg || IS_ERR(blkg)) | 341 | if (IS_ERR(blkg)) |
342 | return ret_blkg; | ||
343 | if (pos == blkcg) | ||
309 | return blkg; | 344 | return blkg; |
310 | } | 345 | } |
311 | } | 346 | } |
312 | 347 | ||
348 | /** | ||
349 | * blkg_lookup_create - find or create a blkg | ||
350 | * @blkcg: target block cgroup | ||
351 | * @q: target request_queue | ||
352 | * | ||
353 | * This looks up or creates the blkg representing the unique pair | ||
354 | * of the blkcg and the request_queue. | ||
355 | */ | ||
356 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | ||
357 | struct request_queue *q) | ||
358 | { | ||
359 | struct blkcg_gq *blkg = blkg_lookup(blkcg, q); | ||
360 | |||
361 | if (unlikely(!blkg)) { | ||
362 | unsigned long flags; | ||
363 | |||
364 | spin_lock_irqsave(&q->queue_lock, flags); | ||
365 | blkg = __blkg_lookup_create(blkcg, q); | ||
366 | spin_unlock_irqrestore(&q->queue_lock, flags); | ||
367 | } | ||
368 | |||
369 | return blkg; | ||
370 | } | ||
371 | |||
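
The new blkg_lookup_create() above tries a lockless lookup first and only takes &q->queue_lock to create on a miss. The general shape of that fast-path/slow-path split, sketched with a toy fixed-size map and a pthread mutex (illustrative only; all names are invented, and the table's mutex is assumed to be initialized with PTHREAD_MUTEX_INITIALIZER):

    #include <pthread.h>

    #define NSLOTS 64

    struct table {
        pthread_mutex_t lock;
        void *slot[NSLOTS];              /* toy map: key -> object */
    };

    /* Lockless fast path: a plain read (the kernel uses RCU + a radix tree). */
    static void *lookup(struct table *t, unsigned int key)
    {
        return t->slot[key % NSLOTS];
    }

    static void *lookup_create(struct table *t, unsigned int key, void *newobj)
    {
        void *obj = lookup(t, key);

        if (!obj) {                      /* slow path: create under the lock */
            pthread_mutex_lock(&t->lock);
            obj = t->slot[key % NSLOTS]; /* re-check: someone may have raced us */
            if (!obj)
                obj = t->slot[key % NSLOTS] = newobj;
            pthread_mutex_unlock(&t->lock);
        }
        return obj;
    }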
313 | static void blkg_destroy(struct blkcg_gq *blkg) | 372 | static void blkg_destroy(struct blkcg_gq *blkg) |
314 | { | 373 | { |
315 | struct blkcg *blkcg = blkg->blkcg; | 374 | struct blkcg *blkcg = blkg->blkcg; |
316 | struct blkcg_gq *parent = blkg->parent; | 375 | struct blkcg_gq *parent = blkg->parent; |
317 | int i; | 376 | int i; |
318 | 377 | ||
319 | lockdep_assert_held(blkg->q->queue_lock); | 378 | lockdep_assert_held(&blkg->q->queue_lock); |
320 | lockdep_assert_held(&blkcg->lock); | 379 | lockdep_assert_held(&blkcg->lock); |
321 | 380 | ||
322 | /* Something wrong if we are trying to remove same group twice */ | 381 | /* Something wrong if we are trying to remove same group twice */ |
@@ -353,7 +412,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) | |||
353 | * Put the reference taken at the time of creation so that when all | 412 | * Put the reference taken at the time of creation so that when all |
354 | * queues are gone, group can be destroyed. | 413 | * queues are gone, group can be destroyed. |
355 | */ | 414 | */ |
356 | blkg_put(blkg); | 415 | percpu_ref_kill(&blkg->refcnt); |
357 | } | 416 | } |
358 | 417 | ||
359 | /** | 418 | /** |
@@ -366,8 +425,7 @@ static void blkg_destroy_all(struct request_queue *q) | |||
366 | { | 425 | { |
367 | struct blkcg_gq *blkg, *n; | 426 | struct blkcg_gq *blkg, *n; |
368 | 427 | ||
369 | lockdep_assert_held(q->queue_lock); | 428 | spin_lock_irq(&q->queue_lock); |
370 | |||
371 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { | 429 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { |
372 | struct blkcg *blkcg = blkg->blkcg; | 430 | struct blkcg *blkcg = blkg->blkcg; |
373 | 431 | ||
@@ -377,7 +435,7 @@ static void blkg_destroy_all(struct request_queue *q) | |||
377 | } | 435 | } |
378 | 436 | ||
379 | q->root_blkg = NULL; | 437 | q->root_blkg = NULL; |
380 | q->root_rl.blkg = NULL; | 438 | spin_unlock_irq(&q->queue_lock); |
381 | } | 439 | } |
382 | 440 | ||
383 | /* | 441 | /* |
@@ -403,41 +461,6 @@ void __blkg_release_rcu(struct rcu_head *rcu_head) | |||
403 | } | 461 | } |
404 | EXPORT_SYMBOL_GPL(__blkg_release_rcu); | 462 | EXPORT_SYMBOL_GPL(__blkg_release_rcu); |
405 | 463 | ||
406 | /* | ||
407 | * The next function used by blk_queue_for_each_rl(). It's a bit tricky | ||
408 | * because the root blkg uses @q->root_rl instead of its own rl. | ||
409 | */ | ||
410 | struct request_list *__blk_queue_next_rl(struct request_list *rl, | ||
411 | struct request_queue *q) | ||
412 | { | ||
413 | struct list_head *ent; | ||
414 | struct blkcg_gq *blkg; | ||
415 | |||
416 | /* | ||
417 | * Determine the current blkg list_head. The first entry is | ||
418 | * root_rl which is off @q->blkg_list and mapped to the head. | ||
419 | */ | ||
420 | if (rl == &q->root_rl) { | ||
421 | ent = &q->blkg_list; | ||
422 | /* There are no more block groups, hence no request lists */ | ||
423 | if (list_empty(ent)) | ||
424 | return NULL; | ||
425 | } else { | ||
426 | blkg = container_of(rl, struct blkcg_gq, rl); | ||
427 | ent = &blkg->q_node; | ||
428 | } | ||
429 | |||
430 | /* walk to the next list_head, skip root blkcg */ | ||
431 | ent = ent->next; | ||
432 | if (ent == &q->root_blkg->q_node) | ||
433 | ent = ent->next; | ||
434 | if (ent == &q->blkg_list) | ||
435 | return NULL; | ||
436 | |||
437 | blkg = container_of(ent, struct blkcg_gq, q_node); | ||
438 | return &blkg->rl; | ||
439 | } | ||
440 | |||
441 | static int blkcg_reset_stats(struct cgroup_subsys_state *css, | 464 | static int blkcg_reset_stats(struct cgroup_subsys_state *css, |
442 | struct cftype *cftype, u64 val) | 465 | struct cftype *cftype, u64 val) |
443 | { | 466 | { |
@@ -477,7 +500,6 @@ const char *blkg_dev_name(struct blkcg_gq *blkg) | |||
477 | return dev_name(blkg->q->backing_dev_info->dev); | 500 | return dev_name(blkg->q->backing_dev_info->dev); |
478 | return NULL; | 501 | return NULL; |
479 | } | 502 | } |
480 | EXPORT_SYMBOL_GPL(blkg_dev_name); | ||
481 | 503 | ||
482 | /** | 504 | /** |
483 | * blkcg_print_blkgs - helper for printing per-blkg data | 505 | * blkcg_print_blkgs - helper for printing per-blkg data |
@@ -508,10 +530,10 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, | |||
508 | 530 | ||
509 | rcu_read_lock(); | 531 | rcu_read_lock(); |
510 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { | 532 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { |
511 | spin_lock_irq(blkg->q->queue_lock); | 533 | spin_lock_irq(&blkg->q->queue_lock); |
512 | if (blkcg_policy_enabled(blkg->q, pol)) | 534 | if (blkcg_policy_enabled(blkg->q, pol)) |
513 | total += prfill(sf, blkg->pd[pol->plid], data); | 535 | total += prfill(sf, blkg->pd[pol->plid], data); |
514 | spin_unlock_irq(blkg->q->queue_lock); | 536 | spin_unlock_irq(&blkg->q->queue_lock); |
515 | } | 537 | } |
516 | rcu_read_unlock(); | 538 | rcu_read_unlock(); |
517 | 539 | ||
@@ -709,7 +731,7 @@ u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, | |||
709 | struct cgroup_subsys_state *pos_css; | 731 | struct cgroup_subsys_state *pos_css; |
710 | u64 sum = 0; | 732 | u64 sum = 0; |
711 | 733 | ||
712 | lockdep_assert_held(blkg->q->queue_lock); | 734 | lockdep_assert_held(&blkg->q->queue_lock); |
713 | 735 | ||
714 | rcu_read_lock(); | 736 | rcu_read_lock(); |
715 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { | 737 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { |
@@ -752,7 +774,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, | |||
752 | struct blkg_rwstat sum = { }; | 774 | struct blkg_rwstat sum = { }; |
753 | int i; | 775 | int i; |
754 | 776 | ||
755 | lockdep_assert_held(blkg->q->queue_lock); | 777 | lockdep_assert_held(&blkg->q->queue_lock); |
756 | 778 | ||
757 | rcu_read_lock(); | 779 | rcu_read_lock(); |
758 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { | 780 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { |
@@ -783,18 +805,10 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, | |||
783 | struct request_queue *q) | 805 | struct request_queue *q) |
784 | { | 806 | { |
785 | WARN_ON_ONCE(!rcu_read_lock_held()); | 807 | WARN_ON_ONCE(!rcu_read_lock_held()); |
786 | lockdep_assert_held(q->queue_lock); | 808 | lockdep_assert_held(&q->queue_lock); |
787 | 809 | ||
788 | if (!blkcg_policy_enabled(q, pol)) | 810 | if (!blkcg_policy_enabled(q, pol)) |
789 | return ERR_PTR(-EOPNOTSUPP); | 811 | return ERR_PTR(-EOPNOTSUPP); |
790 | |||
791 | /* | ||
792 | * This could be the first entry point of blkcg implementation and | ||
793 | * we shouldn't allow anything to go through for a bypassing queue. | ||
794 | */ | ||
795 | if (unlikely(blk_queue_bypass(q))) | ||
796 | return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); | ||
797 | |||
798 | return __blkg_lookup(blkcg, q, true /* update_hint */); | 812 | return __blkg_lookup(blkcg, q, true /* update_hint */); |
799 | } | 813 | } |
800 | 814 | ||
@@ -812,7 +826,7 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, | |||
812 | */ | 826 | */ |
813 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | 827 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, |
814 | char *input, struct blkg_conf_ctx *ctx) | 828 | char *input, struct blkg_conf_ctx *ctx) |
815 | __acquires(rcu) __acquires(disk->queue->queue_lock) | 829 | __acquires(rcu) __acquires(&disk->queue->queue_lock) |
816 | { | 830 | { |
817 | struct gendisk *disk; | 831 | struct gendisk *disk; |
818 | struct request_queue *q; | 832 | struct request_queue *q; |
@@ -840,7 +854,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | |||
840 | q = disk->queue; | 854 | q = disk->queue; |
841 | 855 | ||
842 | rcu_read_lock(); | 856 | rcu_read_lock(); |
843 | spin_lock_irq(q->queue_lock); | 857 | spin_lock_irq(&q->queue_lock); |
844 | 858 | ||
845 | blkg = blkg_lookup_check(blkcg, pol, q); | 859 | blkg = blkg_lookup_check(blkcg, pol, q); |
846 | if (IS_ERR(blkg)) { | 860 | if (IS_ERR(blkg)) { |
@@ -867,7 +881,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | |||
867 | } | 881 | } |
868 | 882 | ||
869 | /* Drop locks to do new blkg allocation with GFP_KERNEL. */ | 883 | /* Drop locks to do new blkg allocation with GFP_KERNEL. */ |
870 | spin_unlock_irq(q->queue_lock); | 884 | spin_unlock_irq(&q->queue_lock); |
871 | rcu_read_unlock(); | 885 | rcu_read_unlock(); |
872 | 886 | ||
873 | new_blkg = blkg_alloc(pos, q, GFP_KERNEL); | 887 | new_blkg = blkg_alloc(pos, q, GFP_KERNEL); |
@@ -877,7 +891,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | |||
877 | } | 891 | } |
878 | 892 | ||
879 | rcu_read_lock(); | 893 | rcu_read_lock(); |
880 | spin_lock_irq(q->queue_lock); | 894 | spin_lock_irq(&q->queue_lock); |
881 | 895 | ||
882 | blkg = blkg_lookup_check(pos, pol, q); | 896 | blkg = blkg_lookup_check(pos, pol, q); |
883 | if (IS_ERR(blkg)) { | 897 | if (IS_ERR(blkg)) { |
@@ -905,7 +919,7 @@ success: | |||
905 | return 0; | 919 | return 0; |
906 | 920 | ||
907 | fail_unlock: | 921 | fail_unlock: |
908 | spin_unlock_irq(q->queue_lock); | 922 | spin_unlock_irq(&q->queue_lock); |
909 | rcu_read_unlock(); | 923 | rcu_read_unlock(); |
910 | fail: | 924 | fail: |
911 | put_disk_and_module(disk); | 925 | put_disk_and_module(disk); |
@@ -921,7 +935,6 @@ fail: | |||
921 | } | 935 | } |
922 | return ret; | 936 | return ret; |
923 | } | 937 | } |
924 | EXPORT_SYMBOL_GPL(blkg_conf_prep); | ||
925 | 938 | ||
926 | /** | 939 | /** |
927 | * blkg_conf_finish - finish up per-blkg config update | 940 | * blkg_conf_finish - finish up per-blkg config update |
@@ -931,13 +944,12 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); | |||
931 | * with blkg_conf_prep(). | 944 | * with blkg_conf_prep(). |
932 | */ | 945 | */ |
933 | void blkg_conf_finish(struct blkg_conf_ctx *ctx) | 946 | void blkg_conf_finish(struct blkg_conf_ctx *ctx) |
934 | __releases(ctx->disk->queue->queue_lock) __releases(rcu) | 947 | __releases(&ctx->disk->queue->queue_lock) __releases(rcu) |
935 | { | 948 | { |
936 | spin_unlock_irq(ctx->disk->queue->queue_lock); | 949 | spin_unlock_irq(&ctx->disk->queue->queue_lock); |
937 | rcu_read_unlock(); | 950 | rcu_read_unlock(); |
938 | put_disk_and_module(ctx->disk); | 951 | put_disk_and_module(ctx->disk); |
939 | } | 952 | } |
940 | EXPORT_SYMBOL_GPL(blkg_conf_finish); | ||
941 | 953 | ||
942 | static int blkcg_print_stat(struct seq_file *sf, void *v) | 954 | static int blkcg_print_stat(struct seq_file *sf, void *v) |
943 | { | 955 | { |
@@ -967,7 +979,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) | |||
967 | */ | 979 | */ |
968 | off += scnprintf(buf+off, size-off, "%s ", dname); | 980 | off += scnprintf(buf+off, size-off, "%s ", dname); |
969 | 981 | ||
970 | spin_lock_irq(blkg->q->queue_lock); | 982 | spin_lock_irq(&blkg->q->queue_lock); |
971 | 983 | ||
972 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, | 984 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, |
973 | offsetof(struct blkcg_gq, stat_bytes)); | 985 | offsetof(struct blkcg_gq, stat_bytes)); |
@@ -981,7 +993,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) | |||
981 | wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); | 993 | wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); |
982 | dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); | 994 | dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); |
983 | 995 | ||
984 | spin_unlock_irq(blkg->q->queue_lock); | 996 | spin_unlock_irq(&blkg->q->queue_lock); |
985 | 997 | ||
986 | if (rbytes || wbytes || rios || wios) { | 998 | if (rbytes || wbytes || rios || wios) { |
987 | has_stats = true; | 999 | has_stats = true; |
@@ -1102,9 +1114,9 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) | |||
1102 | struct blkcg_gq, blkcg_node); | 1114 | struct blkcg_gq, blkcg_node); |
1103 | struct request_queue *q = blkg->q; | 1115 | struct request_queue *q = blkg->q; |
1104 | 1116 | ||
1105 | if (spin_trylock(q->queue_lock)) { | 1117 | if (spin_trylock(&q->queue_lock)) { |
1106 | blkg_destroy(blkg); | 1118 | blkg_destroy(blkg); |
1107 | spin_unlock(q->queue_lock); | 1119 | spin_unlock(&q->queue_lock); |
1108 | } else { | 1120 | } else { |
1109 | spin_unlock_irq(&blkcg->lock); | 1121 | spin_unlock_irq(&blkcg->lock); |
1110 | cpu_relax(); | 1122 | cpu_relax(); |
@@ -1225,36 +1237,31 @@ int blkcg_init_queue(struct request_queue *q) | |||
1225 | 1237 | ||
1226 | /* Make sure the root blkg exists. */ | 1238 | /* Make sure the root blkg exists. */ |
1227 | rcu_read_lock(); | 1239 | rcu_read_lock(); |
1228 | spin_lock_irq(q->queue_lock); | 1240 | spin_lock_irq(&q->queue_lock); |
1229 | blkg = blkg_create(&blkcg_root, q, new_blkg); | 1241 | blkg = blkg_create(&blkcg_root, q, new_blkg); |
1230 | if (IS_ERR(blkg)) | 1242 | if (IS_ERR(blkg)) |
1231 | goto err_unlock; | 1243 | goto err_unlock; |
1232 | q->root_blkg = blkg; | 1244 | q->root_blkg = blkg; |
1233 | q->root_rl.blkg = blkg; | 1245 | spin_unlock_irq(&q->queue_lock); |
1234 | spin_unlock_irq(q->queue_lock); | ||
1235 | rcu_read_unlock(); | 1246 | rcu_read_unlock(); |
1236 | 1247 | ||
1237 | if (preloaded) | 1248 | if (preloaded) |
1238 | radix_tree_preload_end(); | 1249 | radix_tree_preload_end(); |
1239 | 1250 | ||
1240 | ret = blk_iolatency_init(q); | 1251 | ret = blk_iolatency_init(q); |
1241 | if (ret) { | 1252 | if (ret) |
1242 | spin_lock_irq(q->queue_lock); | 1253 | goto err_destroy_all; |
1243 | blkg_destroy_all(q); | ||
1244 | spin_unlock_irq(q->queue_lock); | ||
1245 | return ret; | ||
1246 | } | ||
1247 | 1254 | ||
1248 | ret = blk_throtl_init(q); | 1255 | ret = blk_throtl_init(q); |
1249 | if (ret) { | 1256 | if (ret) |
1250 | spin_lock_irq(q->queue_lock); | 1257 | goto err_destroy_all; |
1251 | blkg_destroy_all(q); | 1258 | return 0; |
1252 | spin_unlock_irq(q->queue_lock); | ||
1253 | } | ||
1254 | return ret; | ||
1255 | 1259 | ||
1260 | err_destroy_all: | ||
1261 | blkg_destroy_all(q); | ||
1262 | return ret; | ||
1256 | err_unlock: | 1263 | err_unlock: |
1257 | spin_unlock_irq(q->queue_lock); | 1264 | spin_unlock_irq(&q->queue_lock); |
1258 | rcu_read_unlock(); | 1265 | rcu_read_unlock(); |
1259 | if (preloaded) | 1266 | if (preloaded) |
1260 | radix_tree_preload_end(); | 1267 | radix_tree_preload_end(); |
@@ -1269,7 +1276,7 @@ err_unlock: | |||
1269 | */ | 1276 | */ |
1270 | void blkcg_drain_queue(struct request_queue *q) | 1277 | void blkcg_drain_queue(struct request_queue *q) |
1271 | { | 1278 | { |
1272 | lockdep_assert_held(q->queue_lock); | 1279 | lockdep_assert_held(&q->queue_lock); |
1273 | 1280 | ||
1274 | /* | 1281 | /* |
1275 | * @q could be exiting and already have destroyed all blkgs as | 1282 | * @q could be exiting and already have destroyed all blkgs as |
@@ -1289,10 +1296,7 @@ void blkcg_drain_queue(struct request_queue *q) | |||
1289 | */ | 1296 | */ |
1290 | void blkcg_exit_queue(struct request_queue *q) | 1297 | void blkcg_exit_queue(struct request_queue *q) |
1291 | { | 1298 | { |
1292 | spin_lock_irq(q->queue_lock); | ||
1293 | blkg_destroy_all(q); | 1299 | blkg_destroy_all(q); |
1294 | spin_unlock_irq(q->queue_lock); | ||
1295 | |||
1296 | blk_throtl_exit(q); | 1300 | blk_throtl_exit(q); |
1297 | } | 1301 | } |
1298 | 1302 | ||
@@ -1396,10 +1400,8 @@ int blkcg_activate_policy(struct request_queue *q, | |||
1396 | if (blkcg_policy_enabled(q, pol)) | 1400 | if (blkcg_policy_enabled(q, pol)) |
1397 | return 0; | 1401 | return 0; |
1398 | 1402 | ||
1399 | if (q->mq_ops) | 1403 | if (queue_is_mq(q)) |
1400 | blk_mq_freeze_queue(q); | 1404 | blk_mq_freeze_queue(q); |
1401 | else | ||
1402 | blk_queue_bypass_start(q); | ||
1403 | pd_prealloc: | 1405 | pd_prealloc: |
1404 | if (!pd_prealloc) { | 1406 | if (!pd_prealloc) { |
1405 | pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); | 1407 | pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); |
@@ -1409,7 +1411,7 @@ pd_prealloc: | |||
1409 | } | 1411 | } |
1410 | } | 1412 | } |
1411 | 1413 | ||
1412 | spin_lock_irq(q->queue_lock); | 1414 | spin_lock_irq(&q->queue_lock); |
1413 | 1415 | ||
1414 | list_for_each_entry(blkg, &q->blkg_list, q_node) { | 1416 | list_for_each_entry(blkg, &q->blkg_list, q_node) { |
1415 | struct blkg_policy_data *pd; | 1417 | struct blkg_policy_data *pd; |
@@ -1421,7 +1423,7 @@ pd_prealloc: | |||
1421 | if (!pd) | 1423 | if (!pd) |
1422 | swap(pd, pd_prealloc); | 1424 | swap(pd, pd_prealloc); |
1423 | if (!pd) { | 1425 | if (!pd) { |
1424 | spin_unlock_irq(q->queue_lock); | 1426 | spin_unlock_irq(&q->queue_lock); |
1425 | goto pd_prealloc; | 1427 | goto pd_prealloc; |
1426 | } | 1428 | } |
1427 | 1429 | ||
@@ -1435,12 +1437,10 @@ pd_prealloc: | |||
1435 | __set_bit(pol->plid, q->blkcg_pols); | 1437 | __set_bit(pol->plid, q->blkcg_pols); |
1436 | ret = 0; | 1438 | ret = 0; |
1437 | 1439 | ||
1438 | spin_unlock_irq(q->queue_lock); | 1440 | spin_unlock_irq(&q->queue_lock); |
1439 | out_bypass_end: | 1441 | out_bypass_end: |
1440 | if (q->mq_ops) | 1442 | if (queue_is_mq(q)) |
1441 | blk_mq_unfreeze_queue(q); | 1443 | blk_mq_unfreeze_queue(q); |
1442 | else | ||
1443 | blk_queue_bypass_end(q); | ||
1444 | if (pd_prealloc) | 1444 | if (pd_prealloc) |
1445 | pol->pd_free_fn(pd_prealloc); | 1445 | pol->pd_free_fn(pd_prealloc); |
1446 | return ret; | 1446 | return ret; |
@@ -1463,12 +1463,10 @@ void blkcg_deactivate_policy(struct request_queue *q, | |||
1463 | if (!blkcg_policy_enabled(q, pol)) | 1463 | if (!blkcg_policy_enabled(q, pol)) |
1464 | return; | 1464 | return; |
1465 | 1465 | ||
1466 | if (q->mq_ops) | 1466 | if (queue_is_mq(q)) |
1467 | blk_mq_freeze_queue(q); | 1467 | blk_mq_freeze_queue(q); |
1468 | else | ||
1469 | blk_queue_bypass_start(q); | ||
1470 | 1468 | ||
1471 | spin_lock_irq(q->queue_lock); | 1469 | spin_lock_irq(&q->queue_lock); |
1472 | 1470 | ||
1473 | __clear_bit(pol->plid, q->blkcg_pols); | 1471 | __clear_bit(pol->plid, q->blkcg_pols); |
1474 | 1472 | ||
@@ -1481,12 +1479,10 @@ void blkcg_deactivate_policy(struct request_queue *q, | |||
1481 | } | 1479 | } |
1482 | } | 1480 | } |
1483 | 1481 | ||
1484 | spin_unlock_irq(q->queue_lock); | 1482 | spin_unlock_irq(&q->queue_lock); |
1485 | 1483 | ||
1486 | if (q->mq_ops) | 1484 | if (queue_is_mq(q)) |
1487 | blk_mq_unfreeze_queue(q); | 1485 | blk_mq_unfreeze_queue(q); |
1488 | else | ||
1489 | blk_queue_bypass_end(q); | ||
1490 | } | 1486 | } |
1491 | EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); | 1487 | EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); |
1492 | 1488 | ||
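With the legacy request path gone, blkcg_activate_policy() and blkcg_deactivate_policy() no longer need the blk_queue_bypass_start()/blk_queue_bypass_end() pair; the policy update is simply bracketed by a blk-mq queue freeze whenever the queue is blk-mq managed. The resulting shape, per the two hunks above:

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	spin_lock_irq(&q->queue_lock);
	/* ... flip pol->plid in q->blkcg_pols, set up or free per-blkg data ... */
	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);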
@@ -1748,8 +1744,7 @@ void blkcg_maybe_throttle_current(void) | |||
1748 | blkg = blkg_lookup(blkcg, q); | 1744 | blkg = blkg_lookup(blkcg, q); |
1749 | if (!blkg) | 1745 | if (!blkg) |
1750 | goto out; | 1746 | goto out; |
1751 | blkg = blkg_try_get(blkg); | 1747 | if (!blkg_tryget(blkg)) |
1752 | if (!blkg) | ||
1753 | goto out; | 1748 | goto out; |
1754 | rcu_read_unlock(); | 1749 | rcu_read_unlock(); |
1755 | 1750 | ||
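blkcg_maybe_throttle_current() also moves from blkg_try_get(), which returned the blkg (or NULL) and forced the caller to reassign the pointer, to blkg_tryget(), which simply reports as a bool whether a reference was taken on the blkg already looked up under RCU:

	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))		/* could not take a reference */
		goto out;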
@@ -1761,7 +1756,6 @@ out: | |||
1761 | rcu_read_unlock(); | 1756 | rcu_read_unlock(); |
1762 | blk_put_queue(q); | 1757 | blk_put_queue(q); |
1763 | } | 1758 | } |
1764 | EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current); | ||
1765 | 1759 | ||
1766 | /** | 1760 | /** |
1767 | * blkcg_schedule_throttle - this task needs to check for throttling | 1761 | * blkcg_schedule_throttle - this task needs to check for throttling |
@@ -1795,7 +1789,6 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) | |||
1795 | current->use_memdelay = use_memdelay; | 1789 | current->use_memdelay = use_memdelay; |
1796 | set_notify_resume(current); | 1790 | set_notify_resume(current); |
1797 | } | 1791 | } |
1798 | EXPORT_SYMBOL_GPL(blkcg_schedule_throttle); | ||
1799 | 1792 | ||
1800 | /** | 1793 | /** |
1801 | * blkcg_add_delay - add delay to this blkg | 1794 | * blkcg_add_delay - add delay to this blkg |
@@ -1810,7 +1803,6 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) | |||
1810 | blkcg_scale_delay(blkg, now); | 1803 | blkcg_scale_delay(blkg, now); |
1811 | atomic64_add(delta, &blkg->delay_nsec); | 1804 | atomic64_add(delta, &blkg->delay_nsec); |
1812 | } | 1805 | } |
1813 | EXPORT_SYMBOL_GPL(blkcg_add_delay); | ||
1814 | 1806 | ||
1815 | module_param(blkcg_debug_stats, bool, 0644); | 1807 | module_param(blkcg_debug_stats, bool, 0644); |
1816 | MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); | 1808 | MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); |
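Running through every hunk of blk-cgroup.c above is one mechanical change: struct request_queue used to expose its lock as a spinlock_t *queue_lock pointer (which, as the removed blk_alloc_queue_node() @lock parameter further down shows, legacy drivers could point at their own lock), and it now embeds the lock directly, so every call site takes &q->queue_lock. The before/after pattern, sketched:

	/* before: the lock lived behind a pointer, possibly the driver's */
	spin_lock_irq(q->queue_lock);
	/* ... critical section ... */
	spin_unlock_irq(q->queue_lock);

	/* after: struct request_queue embeds spinlock_t queue_lock */
	spin_lock_irq(&q->queue_lock);
	/* ... critical section ... */
	spin_unlock_irq(&q->queue_lock);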
diff --git a/block/blk-core.c b/block/blk-core.c index ce12515f9b9b..c78042975737 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -58,11 +58,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); | |||
58 | DEFINE_IDA(blk_queue_ida); | 58 | DEFINE_IDA(blk_queue_ida); |
59 | 59 | ||
60 | /* | 60 | /* |
61 | * For the allocated request tables | ||
62 | */ | ||
63 | struct kmem_cache *request_cachep; | ||
64 | |||
65 | /* | ||
66 | * For queue allocation | 61 | * For queue allocation |
67 | */ | 62 | */ |
68 | struct kmem_cache *blk_requestq_cachep; | 63 | struct kmem_cache *blk_requestq_cachep; |
@@ -79,11 +74,7 @@ static struct workqueue_struct *kblockd_workqueue; | |||
79 | */ | 74 | */ |
80 | void blk_queue_flag_set(unsigned int flag, struct request_queue *q) | 75 | void blk_queue_flag_set(unsigned int flag, struct request_queue *q) |
81 | { | 76 | { |
82 | unsigned long flags; | 77 | set_bit(flag, &q->queue_flags); |
83 | |||
84 | spin_lock_irqsave(q->queue_lock, flags); | ||
85 | queue_flag_set(flag, q); | ||
86 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
87 | } | 78 | } |
88 | EXPORT_SYMBOL(blk_queue_flag_set); | 79 | EXPORT_SYMBOL(blk_queue_flag_set); |
89 | 80 | ||
@@ -94,11 +85,7 @@ EXPORT_SYMBOL(blk_queue_flag_set); | |||
94 | */ | 85 | */ |
95 | void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) | 86 | void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) |
96 | { | 87 | { |
97 | unsigned long flags; | 88 | clear_bit(flag, &q->queue_flags); |
98 | |||
99 | spin_lock_irqsave(q->queue_lock, flags); | ||
100 | queue_flag_clear(flag, q); | ||
101 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
102 | } | 89 | } |
103 | EXPORT_SYMBOL(blk_queue_flag_clear); | 90 | EXPORT_SYMBOL(blk_queue_flag_clear); |
104 | 91 | ||
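blk_queue_flag_set(), blk_queue_flag_clear() and blk_queue_flag_test_and_set() no longer take queue_lock at all; q->queue_flags is treated as an atomic bitmap and each helper collapses to a single set_bit()/clear_bit()/test_and_set_bit() call. Callers therefore need no locking either, as in the blk_cleanup_queue() hunk further down:

	/* flag updates are lock-free now */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
	blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
	blk_queue_flag_set(QUEUE_FLAG_DYING, q);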
@@ -112,85 +99,15 @@ EXPORT_SYMBOL(blk_queue_flag_clear); | |||
112 | */ | 99 | */ |
113 | bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) | 100 | bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) |
114 | { | 101 | { |
115 | unsigned long flags; | 102 | return test_and_set_bit(flag, &q->queue_flags); |
116 | bool res; | ||
117 | |||
118 | spin_lock_irqsave(q->queue_lock, flags); | ||
119 | res = queue_flag_test_and_set(flag, q); | ||
120 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
121 | |||
122 | return res; | ||
123 | } | 103 | } |
124 | EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); | 104 | EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); |
125 | 105 | ||
126 | /** | ||
127 | * blk_queue_flag_test_and_clear - atomically test and clear a queue flag | ||
128 | * @flag: flag to be cleared | ||
129 | * @q: request queue | ||
130 | * | ||
131 | * Returns the previous value of @flag - 0 if the flag was not set and 1 if | ||
132 | * the flag was set. | ||
133 | */ | ||
134 | bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q) | ||
135 | { | ||
136 | unsigned long flags; | ||
137 | bool res; | ||
138 | |||
139 | spin_lock_irqsave(q->queue_lock, flags); | ||
140 | res = queue_flag_test_and_clear(flag, q); | ||
141 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
142 | |||
143 | return res; | ||
144 | } | ||
145 | EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear); | ||
146 | |||
147 | static void blk_clear_congested(struct request_list *rl, int sync) | ||
148 | { | ||
149 | #ifdef CONFIG_CGROUP_WRITEBACK | ||
150 | clear_wb_congested(rl->blkg->wb_congested, sync); | ||
151 | #else | ||
152 | /* | ||
153 | * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't | ||
154 | * flip its congestion state for events on other blkcgs. | ||
155 | */ | ||
156 | if (rl == &rl->q->root_rl) | ||
157 | clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync); | ||
158 | #endif | ||
159 | } | ||
160 | |||
161 | static void blk_set_congested(struct request_list *rl, int sync) | ||
162 | { | ||
163 | #ifdef CONFIG_CGROUP_WRITEBACK | ||
164 | set_wb_congested(rl->blkg->wb_congested, sync); | ||
165 | #else | ||
166 | /* see blk_clear_congested() */ | ||
167 | if (rl == &rl->q->root_rl) | ||
168 | set_wb_congested(rl->q->backing_dev_info->wb.congested, sync); | ||
169 | #endif | ||
170 | } | ||
171 | |||
172 | void blk_queue_congestion_threshold(struct request_queue *q) | ||
173 | { | ||
174 | int nr; | ||
175 | |||
176 | nr = q->nr_requests - (q->nr_requests / 8) + 1; | ||
177 | if (nr > q->nr_requests) | ||
178 | nr = q->nr_requests; | ||
179 | q->nr_congestion_on = nr; | ||
180 | |||
181 | nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; | ||
182 | if (nr < 1) | ||
183 | nr = 1; | ||
184 | q->nr_congestion_off = nr; | ||
185 | } | ||
186 | |||
187 | void blk_rq_init(struct request_queue *q, struct request *rq) | 106 | void blk_rq_init(struct request_queue *q, struct request *rq) |
188 | { | 107 | { |
189 | memset(rq, 0, sizeof(*rq)); | 108 | memset(rq, 0, sizeof(*rq)); |
190 | 109 | ||
191 | INIT_LIST_HEAD(&rq->queuelist); | 110 | INIT_LIST_HEAD(&rq->queuelist); |
192 | INIT_LIST_HEAD(&rq->timeout_list); | ||
193 | rq->cpu = -1; | ||
194 | rq->q = q; | 111 | rq->q = q; |
195 | rq->__sector = (sector_t) -1; | 112 | rq->__sector = (sector_t) -1; |
196 | INIT_HLIST_NODE(&rq->hash); | 113 | INIT_HLIST_NODE(&rq->hash); |
@@ -256,10 +173,11 @@ static void print_req_error(struct request *req, blk_status_t status) | |||
256 | if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) | 173 | if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) |
257 | return; | 174 | return; |
258 | 175 | ||
259 | printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", | 176 | printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n", |
260 | __func__, blk_errors[idx].name, req->rq_disk ? | 177 | __func__, blk_errors[idx].name, |
261 | req->rq_disk->disk_name : "?", | 178 | req->rq_disk ? req->rq_disk->disk_name : "?", |
262 | (unsigned long long)blk_rq_pos(req)); | 179 | (unsigned long long)blk_rq_pos(req), |
180 | req->cmd_flags); | ||
263 | } | 181 | } |
264 | 182 | ||
265 | static void req_bio_endio(struct request *rq, struct bio *bio, | 183 | static void req_bio_endio(struct request *rq, struct bio *bio, |
@@ -292,99 +210,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) | |||
292 | } | 210 | } |
293 | EXPORT_SYMBOL(blk_dump_rq_flags); | 211 | EXPORT_SYMBOL(blk_dump_rq_flags); |
294 | 212 | ||
295 | static void blk_delay_work(struct work_struct *work) | ||
296 | { | ||
297 | struct request_queue *q; | ||
298 | |||
299 | q = container_of(work, struct request_queue, delay_work.work); | ||
300 | spin_lock_irq(q->queue_lock); | ||
301 | __blk_run_queue(q); | ||
302 | spin_unlock_irq(q->queue_lock); | ||
303 | } | ||
304 | |||
305 | /** | ||
306 | * blk_delay_queue - restart queueing after defined interval | ||
307 | * @q: The &struct request_queue in question | ||
308 | * @msecs: Delay in msecs | ||
309 | * | ||
310 | * Description: | ||
311 | * Sometimes queueing needs to be postponed for a little while, to allow | ||
312 | * resources to come back. This function will make sure that queueing is | ||
313 | * restarted around the specified time. | ||
314 | */ | ||
315 | void blk_delay_queue(struct request_queue *q, unsigned long msecs) | ||
316 | { | ||
317 | lockdep_assert_held(q->queue_lock); | ||
318 | WARN_ON_ONCE(q->mq_ops); | ||
319 | |||
320 | if (likely(!blk_queue_dead(q))) | ||
321 | queue_delayed_work(kblockd_workqueue, &q->delay_work, | ||
322 | msecs_to_jiffies(msecs)); | ||
323 | } | ||
324 | EXPORT_SYMBOL(blk_delay_queue); | ||
325 | |||
326 | /** | ||
327 | * blk_start_queue_async - asynchronously restart a previously stopped queue | ||
328 | * @q: The &struct request_queue in question | ||
329 | * | ||
330 | * Description: | ||
331 | * blk_start_queue_async() will clear the stop flag on the queue, and | ||
332 | * ensure that the request_fn for the queue is run from an async | ||
333 | * context. | ||
334 | **/ | ||
335 | void blk_start_queue_async(struct request_queue *q) | ||
336 | { | ||
337 | lockdep_assert_held(q->queue_lock); | ||
338 | WARN_ON_ONCE(q->mq_ops); | ||
339 | |||
340 | queue_flag_clear(QUEUE_FLAG_STOPPED, q); | ||
341 | blk_run_queue_async(q); | ||
342 | } | ||
343 | EXPORT_SYMBOL(blk_start_queue_async); | ||
344 | |||
345 | /** | ||
346 | * blk_start_queue - restart a previously stopped queue | ||
347 | * @q: The &struct request_queue in question | ||
348 | * | ||
349 | * Description: | ||
350 | * blk_start_queue() will clear the stop flag on the queue, and call | ||
351 | * the request_fn for the queue if it was in a stopped state when | ||
352 | * entered. Also see blk_stop_queue(). | ||
353 | **/ | ||
354 | void blk_start_queue(struct request_queue *q) | ||
355 | { | ||
356 | lockdep_assert_held(q->queue_lock); | ||
357 | WARN_ON_ONCE(q->mq_ops); | ||
358 | |||
359 | queue_flag_clear(QUEUE_FLAG_STOPPED, q); | ||
360 | __blk_run_queue(q); | ||
361 | } | ||
362 | EXPORT_SYMBOL(blk_start_queue); | ||
363 | |||
364 | /** | ||
365 | * blk_stop_queue - stop a queue | ||
366 | * @q: The &struct request_queue in question | ||
367 | * | ||
368 | * Description: | ||
369 | * The Linux block layer assumes that a block driver will consume all | ||
370 | * entries on the request queue when the request_fn strategy is called. | ||
371 | * Often this will not happen, because of hardware limitations (queue | ||
372 | * depth settings). If a device driver gets a 'queue full' response, | ||
373 | * or if it simply chooses not to queue more I/O at one point, it can | ||
374 | * call this function to prevent the request_fn from being called until | ||
375 | * the driver has signalled it's ready to go again. This happens by calling | ||
376 | * blk_start_queue() to restart queue operations. | ||
377 | **/ | ||
378 | void blk_stop_queue(struct request_queue *q) | ||
379 | { | ||
380 | lockdep_assert_held(q->queue_lock); | ||
381 | WARN_ON_ONCE(q->mq_ops); | ||
382 | |||
383 | cancel_delayed_work(&q->delay_work); | ||
384 | queue_flag_set(QUEUE_FLAG_STOPPED, q); | ||
385 | } | ||
386 | EXPORT_SYMBOL(blk_stop_queue); | ||
387 | |||
388 | /** | 213 | /** |
389 | * blk_sync_queue - cancel any pending callbacks on a queue | 214 | * blk_sync_queue - cancel any pending callbacks on a queue |
390 | * @q: the queue | 215 | * @q: the queue |
@@ -408,15 +233,13 @@ void blk_sync_queue(struct request_queue *q) | |||
408 | del_timer_sync(&q->timeout); | 233 | del_timer_sync(&q->timeout); |
409 | cancel_work_sync(&q->timeout_work); | 234 | cancel_work_sync(&q->timeout_work); |
410 | 235 | ||
411 | if (q->mq_ops) { | 236 | if (queue_is_mq(q)) { |
412 | struct blk_mq_hw_ctx *hctx; | 237 | struct blk_mq_hw_ctx *hctx; |
413 | int i; | 238 | int i; |
414 | 239 | ||
415 | cancel_delayed_work_sync(&q->requeue_work); | 240 | cancel_delayed_work_sync(&q->requeue_work); |
416 | queue_for_each_hw_ctx(q, hctx, i) | 241 | queue_for_each_hw_ctx(q, hctx, i) |
417 | cancel_delayed_work_sync(&hctx->run_work); | 242 | cancel_delayed_work_sync(&hctx->run_work); |
418 | } else { | ||
419 | cancel_delayed_work_sync(&q->delay_work); | ||
420 | } | 243 | } |
421 | } | 244 | } |
422 | EXPORT_SYMBOL(blk_sync_queue); | 245 | EXPORT_SYMBOL(blk_sync_queue); |
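blk_sync_queue() and the other hunks in this file replace open-coded q->mq_ops checks with queue_is_mq(q). The diff only shows the call sites, but the helper is presumably just a readability wrapper over the same test, roughly:

	/* assumed shape of the predicate; not part of this diff */
	static inline bool queue_is_mq(struct request_queue *q)
	{
		return q->mq_ops != NULL;
	}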
@@ -442,250 +265,12 @@ void blk_clear_pm_only(struct request_queue *q) | |||
442 | } | 265 | } |
443 | EXPORT_SYMBOL_GPL(blk_clear_pm_only); | 266 | EXPORT_SYMBOL_GPL(blk_clear_pm_only); |
444 | 267 | ||
445 | /** | ||
446 | * __blk_run_queue_uncond - run a queue whether or not it has been stopped | ||
447 | * @q: The queue to run | ||
448 | * | ||
449 | * Description: | ||
450 | * Invoke request handling on a queue if there are any pending requests. | ||
451 | * May be used to restart request handling after a request has completed. | ||
452 | * This variant runs the queue whether or not the queue has been | ||
453 | * stopped. Must be called with the queue lock held and interrupts | ||
454 | * disabled. See also @blk_run_queue. | ||
455 | */ | ||
456 | inline void __blk_run_queue_uncond(struct request_queue *q) | ||
457 | { | ||
458 | lockdep_assert_held(q->queue_lock); | ||
459 | WARN_ON_ONCE(q->mq_ops); | ||
460 | |||
461 | if (unlikely(blk_queue_dead(q))) | ||
462 | return; | ||
463 | |||
464 | /* | ||
465 | * Some request_fn implementations, e.g. scsi_request_fn(), unlock | ||
466 | * the queue lock internally. As a result multiple threads may be | ||
467 | * running such a request function concurrently. Keep track of the | ||
468 | * number of active request_fn invocations such that blk_drain_queue() | ||
469 | * can wait until all these request_fn calls have finished. | ||
470 | */ | ||
471 | q->request_fn_active++; | ||
472 | q->request_fn(q); | ||
473 | q->request_fn_active--; | ||
474 | } | ||
475 | EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); | ||
476 | |||
477 | /** | ||
478 | * __blk_run_queue - run a single device queue | ||
479 | * @q: The queue to run | ||
480 | * | ||
481 | * Description: | ||
482 | * See @blk_run_queue. | ||
483 | */ | ||
484 | void __blk_run_queue(struct request_queue *q) | ||
485 | { | ||
486 | lockdep_assert_held(q->queue_lock); | ||
487 | WARN_ON_ONCE(q->mq_ops); | ||
488 | |||
489 | if (unlikely(blk_queue_stopped(q))) | ||
490 | return; | ||
491 | |||
492 | __blk_run_queue_uncond(q); | ||
493 | } | ||
494 | EXPORT_SYMBOL(__blk_run_queue); | ||
495 | |||
496 | /** | ||
497 | * blk_run_queue_async - run a single device queue in workqueue context | ||
498 | * @q: The queue to run | ||
499 | * | ||
500 | * Description: | ||
501 | * Tells kblockd to perform the equivalent of @blk_run_queue on behalf | ||
502 | * of us. | ||
503 | * | ||
504 | * Note: | ||
505 | * Since it is not allowed to run q->delay_work after blk_cleanup_queue() | ||
506 | * has canceled q->delay_work, callers must hold the queue lock to avoid | ||
507 | * race conditions between blk_cleanup_queue() and blk_run_queue_async(). | ||
508 | */ | ||
509 | void blk_run_queue_async(struct request_queue *q) | ||
510 | { | ||
511 | lockdep_assert_held(q->queue_lock); | ||
512 | WARN_ON_ONCE(q->mq_ops); | ||
513 | |||
514 | if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) | ||
515 | mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); | ||
516 | } | ||
517 | EXPORT_SYMBOL(blk_run_queue_async); | ||
518 | |||
519 | /** | ||
520 | * blk_run_queue - run a single device queue | ||
521 | * @q: The queue to run | ||
522 | * | ||
523 | * Description: | ||
524 | * Invoke request handling on this queue, if it has pending work to do. | ||
525 | * May be used to restart queueing when a request has completed. | ||
526 | */ | ||
527 | void blk_run_queue(struct request_queue *q) | ||
528 | { | ||
529 | unsigned long flags; | ||
530 | |||
531 | WARN_ON_ONCE(q->mq_ops); | ||
532 | |||
533 | spin_lock_irqsave(q->queue_lock, flags); | ||
534 | __blk_run_queue(q); | ||
535 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
536 | } | ||
537 | EXPORT_SYMBOL(blk_run_queue); | ||
538 | |||
539 | void blk_put_queue(struct request_queue *q) | 268 | void blk_put_queue(struct request_queue *q) |
540 | { | 269 | { |
541 | kobject_put(&q->kobj); | 270 | kobject_put(&q->kobj); |
542 | } | 271 | } |
543 | EXPORT_SYMBOL(blk_put_queue); | 272 | EXPORT_SYMBOL(blk_put_queue); |
544 | 273 | ||
545 | /** | ||
546 | * __blk_drain_queue - drain requests from request_queue | ||
547 | * @q: queue to drain | ||
548 | * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV | ||
549 | * | ||
550 | * Drain requests from @q. If @drain_all is set, all requests are drained. | ||
551 | * If not, only ELVPRIV requests are drained. The caller is responsible | ||
552 | * for ensuring that no new requests which need to be drained are queued. | ||
553 | */ | ||
554 | static void __blk_drain_queue(struct request_queue *q, bool drain_all) | ||
555 | __releases(q->queue_lock) | ||
556 | __acquires(q->queue_lock) | ||
557 | { | ||
558 | int i; | ||
559 | |||
560 | lockdep_assert_held(q->queue_lock); | ||
561 | WARN_ON_ONCE(q->mq_ops); | ||
562 | |||
563 | while (true) { | ||
564 | bool drain = false; | ||
565 | |||
566 | /* | ||
567 | * The caller might be trying to drain @q before its | ||
568 | * elevator is initialized. | ||
569 | */ | ||
570 | if (q->elevator) | ||
571 | elv_drain_elevator(q); | ||
572 | |||
573 | blkcg_drain_queue(q); | ||
574 | |||
575 | /* | ||
576 | * This function might be called on a queue which failed | ||
577 | * driver init after queue creation or is not yet fully | ||
578 | * active yet. Some drivers (e.g. fd and loop) get unhappy | ||
579 | * in such cases. Kick queue iff dispatch queue has | ||
580 | * something on it and @q has request_fn set. | ||
581 | */ | ||
582 | if (!list_empty(&q->queue_head) && q->request_fn) | ||
583 | __blk_run_queue(q); | ||
584 | |||
585 | drain |= q->nr_rqs_elvpriv; | ||
586 | drain |= q->request_fn_active; | ||
587 | |||
588 | /* | ||
589 | * Unfortunately, requests are queued at and tracked from | ||
590 | * multiple places and there's no single counter which can | ||
591 | * be drained. Check all the queues and counters. | ||
592 | */ | ||
593 | if (drain_all) { | ||
594 | struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); | ||
595 | drain |= !list_empty(&q->queue_head); | ||
596 | for (i = 0; i < 2; i++) { | ||
597 | drain |= q->nr_rqs[i]; | ||
598 | drain |= q->in_flight[i]; | ||
599 | if (fq) | ||
600 | drain |= !list_empty(&fq->flush_queue[i]); | ||
601 | } | ||
602 | } | ||
603 | |||
604 | if (!drain) | ||
605 | break; | ||
606 | |||
607 | spin_unlock_irq(q->queue_lock); | ||
608 | |||
609 | msleep(10); | ||
610 | |||
611 | spin_lock_irq(q->queue_lock); | ||
612 | } | ||
613 | |||
614 | /* | ||
615 | * With queue marked dead, any woken up waiter will fail the | ||
616 | * allocation path, so the wakeup chaining is lost and we're | ||
617 | * left with hung waiters. We need to wake up those waiters. | ||
618 | */ | ||
619 | if (q->request_fn) { | ||
620 | struct request_list *rl; | ||
621 | |||
622 | blk_queue_for_each_rl(rl, q) | ||
623 | for (i = 0; i < ARRAY_SIZE(rl->wait); i++) | ||
624 | wake_up_all(&rl->wait[i]); | ||
625 | } | ||
626 | } | ||
627 | |||
628 | void blk_drain_queue(struct request_queue *q) | ||
629 | { | ||
630 | spin_lock_irq(q->queue_lock); | ||
631 | __blk_drain_queue(q, true); | ||
632 | spin_unlock_irq(q->queue_lock); | ||
633 | } | ||
634 | |||
635 | /** | ||
636 | * blk_queue_bypass_start - enter queue bypass mode | ||
637 | * @q: queue of interest | ||
638 | * | ||
639 | * In bypass mode, only the dispatch FIFO queue of @q is used. This | ||
640 | * function makes @q enter bypass mode and drains all requests which were | ||
641 | * throttled or issued before. On return, it's guaranteed that no request | ||
642 | * is being throttled or has ELVPRIV set and blk_queue_bypass() %true | ||
643 | * inside queue or RCU read lock. | ||
644 | */ | ||
645 | void blk_queue_bypass_start(struct request_queue *q) | ||
646 | { | ||
647 | WARN_ON_ONCE(q->mq_ops); | ||
648 | |||
649 | spin_lock_irq(q->queue_lock); | ||
650 | q->bypass_depth++; | ||
651 | queue_flag_set(QUEUE_FLAG_BYPASS, q); | ||
652 | spin_unlock_irq(q->queue_lock); | ||
653 | |||
654 | /* | ||
655 | * Queues start drained. Skip actual draining till init is | ||
656 | * complete. This avoids lengthy delays during queue init which | ||
657 | * can happen many times during boot. | ||
658 | */ | ||
659 | if (blk_queue_init_done(q)) { | ||
660 | spin_lock_irq(q->queue_lock); | ||
661 | __blk_drain_queue(q, false); | ||
662 | spin_unlock_irq(q->queue_lock); | ||
663 | |||
664 | /* ensure blk_queue_bypass() is %true inside RCU read lock */ | ||
665 | synchronize_rcu(); | ||
666 | } | ||
667 | } | ||
668 | EXPORT_SYMBOL_GPL(blk_queue_bypass_start); | ||
669 | |||
670 | /** | ||
671 | * blk_queue_bypass_end - leave queue bypass mode | ||
672 | * @q: queue of interest | ||
673 | * | ||
674 | * Leave bypass mode and restore the normal queueing behavior. | ||
675 | * | ||
676 | * Note: although blk_queue_bypass_start() is only called for blk-sq queues, | ||
677 | * this function is called for both blk-sq and blk-mq queues. | ||
678 | */ | ||
679 | void blk_queue_bypass_end(struct request_queue *q) | ||
680 | { | ||
681 | spin_lock_irq(q->queue_lock); | ||
682 | if (!--q->bypass_depth) | ||
683 | queue_flag_clear(QUEUE_FLAG_BYPASS, q); | ||
684 | WARN_ON_ONCE(q->bypass_depth < 0); | ||
685 | spin_unlock_irq(q->queue_lock); | ||
686 | } | ||
687 | EXPORT_SYMBOL_GPL(blk_queue_bypass_end); | ||
688 | |||
689 | void blk_set_queue_dying(struct request_queue *q) | 274 | void blk_set_queue_dying(struct request_queue *q) |
690 | { | 275 | { |
691 | blk_queue_flag_set(QUEUE_FLAG_DYING, q); | 276 | blk_queue_flag_set(QUEUE_FLAG_DYING, q); |
@@ -697,20 +282,8 @@ void blk_set_queue_dying(struct request_queue *q) | |||
697 | */ | 282 | */ |
698 | blk_freeze_queue_start(q); | 283 | blk_freeze_queue_start(q); |
699 | 284 | ||
700 | if (q->mq_ops) | 285 | if (queue_is_mq(q)) |
701 | blk_mq_wake_waiters(q); | 286 | blk_mq_wake_waiters(q); |
702 | else { | ||
703 | struct request_list *rl; | ||
704 | |||
705 | spin_lock_irq(q->queue_lock); | ||
706 | blk_queue_for_each_rl(rl, q) { | ||
707 | if (rl->rq_pool) { | ||
708 | wake_up_all(&rl->wait[BLK_RW_SYNC]); | ||
709 | wake_up_all(&rl->wait[BLK_RW_ASYNC]); | ||
710 | } | ||
711 | } | ||
712 | spin_unlock_irq(q->queue_lock); | ||
713 | } | ||
714 | 287 | ||
715 | /* Make blk_queue_enter() reexamine the DYING flag. */ | 288 | /* Make blk_queue_enter() reexamine the DYING flag. */ |
716 | wake_up_all(&q->mq_freeze_wq); | 289 | wake_up_all(&q->mq_freeze_wq); |
@@ -755,29 +328,13 @@ void blk_exit_queue(struct request_queue *q) | |||
755 | */ | 328 | */ |
756 | void blk_cleanup_queue(struct request_queue *q) | 329 | void blk_cleanup_queue(struct request_queue *q) |
757 | { | 330 | { |
758 | spinlock_t *lock = q->queue_lock; | ||
759 | |||
760 | /* mark @q DYING, no new request or merges will be allowed afterwards */ | 331 | /* mark @q DYING, no new request or merges will be allowed afterwards */ |
761 | mutex_lock(&q->sysfs_lock); | 332 | mutex_lock(&q->sysfs_lock); |
762 | blk_set_queue_dying(q); | 333 | blk_set_queue_dying(q); |
763 | spin_lock_irq(lock); | ||
764 | |||
765 | /* | ||
766 | * A dying queue is permanently in bypass mode till released. Note | ||
767 | * that, unlike blk_queue_bypass_start(), we aren't performing | ||
768 | * synchronize_rcu() after entering bypass mode to avoid the delay | ||
769 | * as some drivers create and destroy a lot of queues while | ||
770 | * probing. This is still safe because blk_release_queue() will be | ||
771 | * called only after the queue refcnt drops to zero and nothing, | ||
772 | * RCU or not, would be traversing the queue by then. | ||
773 | */ | ||
774 | q->bypass_depth++; | ||
775 | queue_flag_set(QUEUE_FLAG_BYPASS, q); | ||
776 | 334 | ||
777 | queue_flag_set(QUEUE_FLAG_NOMERGES, q); | 335 | blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); |
778 | queue_flag_set(QUEUE_FLAG_NOXMERGES, q); | 336 | blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); |
779 | queue_flag_set(QUEUE_FLAG_DYING, q); | 337 | blk_queue_flag_set(QUEUE_FLAG_DYING, q); |
780 | spin_unlock_irq(lock); | ||
781 | mutex_unlock(&q->sysfs_lock); | 338 | mutex_unlock(&q->sysfs_lock); |
782 | 339 | ||
783 | /* | 340 | /* |
@@ -788,9 +345,7 @@ void blk_cleanup_queue(struct request_queue *q) | |||
788 | 345 | ||
789 | rq_qos_exit(q); | 346 | rq_qos_exit(q); |
790 | 347 | ||
791 | spin_lock_irq(lock); | 348 | blk_queue_flag_set(QUEUE_FLAG_DEAD, q); |
792 | queue_flag_set(QUEUE_FLAG_DEAD, q); | ||
793 | spin_unlock_irq(lock); | ||
794 | 349 | ||
795 | /* | 350 | /* |
796 | * make sure all in-progress dispatch are completed because | 351 | * make sure all in-progress dispatch are completed because |
@@ -798,11 +353,10 @@ void blk_cleanup_queue(struct request_queue *q) | |||
798 | * dispatch may still be in-progress since we dispatch requests | 353 | * dispatch may still be in-progress since we dispatch requests |
799 | * from more than one contexts. | 354 | * from more than one contexts. |
800 | * | 355 | * |
801 | * No need to quiesce queue if it isn't initialized yet since | 356 | * We rely on driver to deal with the race in case that queue |
802 | * blk_freeze_queue() should be enough for cases of passthrough | 357 | * initialization isn't done. |
803 | * request. | ||
804 | */ | 358 | */ |
805 | if (q->mq_ops && blk_queue_init_done(q)) | 359 | if (queue_is_mq(q) && blk_queue_init_done(q)) |
806 | blk_mq_quiesce_queue(q); | 360 | blk_mq_quiesce_queue(q); |
807 | 361 | ||
808 | /* for synchronous bio-based driver finish in-flight integrity i/o */ | 362 | /* for synchronous bio-based driver finish in-flight integrity i/o */ |
@@ -820,98 +374,19 @@ void blk_cleanup_queue(struct request_queue *q) | |||
820 | 374 | ||
821 | blk_exit_queue(q); | 375 | blk_exit_queue(q); |
822 | 376 | ||
823 | if (q->mq_ops) | 377 | if (queue_is_mq(q)) |
824 | blk_mq_free_queue(q); | 378 | blk_mq_free_queue(q); |
825 | percpu_ref_exit(&q->q_usage_counter); | ||
826 | 379 | ||
827 | spin_lock_irq(lock); | 380 | percpu_ref_exit(&q->q_usage_counter); |
828 | if (q->queue_lock != &q->__queue_lock) | ||
829 | q->queue_lock = &q->__queue_lock; | ||
830 | spin_unlock_irq(lock); | ||
831 | 381 | ||
832 | /* @q is and will stay empty, shutdown and put */ | 382 | /* @q is and will stay empty, shutdown and put */ |
833 | blk_put_queue(q); | 383 | blk_put_queue(q); |
834 | } | 384 | } |
835 | EXPORT_SYMBOL(blk_cleanup_queue); | 385 | EXPORT_SYMBOL(blk_cleanup_queue); |
836 | 386 | ||
837 | /* Allocate memory local to the request queue */ | ||
838 | static void *alloc_request_simple(gfp_t gfp_mask, void *data) | ||
839 | { | ||
840 | struct request_queue *q = data; | ||
841 | |||
842 | return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node); | ||
843 | } | ||
844 | |||
845 | static void free_request_simple(void *element, void *data) | ||
846 | { | ||
847 | kmem_cache_free(request_cachep, element); | ||
848 | } | ||
849 | |||
850 | static void *alloc_request_size(gfp_t gfp_mask, void *data) | ||
851 | { | ||
852 | struct request_queue *q = data; | ||
853 | struct request *rq; | ||
854 | |||
855 | rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask, | ||
856 | q->node); | ||
857 | if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) { | ||
858 | kfree(rq); | ||
859 | rq = NULL; | ||
860 | } | ||
861 | return rq; | ||
862 | } | ||
863 | |||
864 | static void free_request_size(void *element, void *data) | ||
865 | { | ||
866 | struct request_queue *q = data; | ||
867 | |||
868 | if (q->exit_rq_fn) | ||
869 | q->exit_rq_fn(q, element); | ||
870 | kfree(element); | ||
871 | } | ||
872 | |||
873 | int blk_init_rl(struct request_list *rl, struct request_queue *q, | ||
874 | gfp_t gfp_mask) | ||
875 | { | ||
876 | if (unlikely(rl->rq_pool) || q->mq_ops) | ||
877 | return 0; | ||
878 | |||
879 | rl->q = q; | ||
880 | rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; | ||
881 | rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; | ||
882 | init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); | ||
883 | init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); | ||
884 | |||
885 | if (q->cmd_size) { | ||
886 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, | ||
887 | alloc_request_size, free_request_size, | ||
888 | q, gfp_mask, q->node); | ||
889 | } else { | ||
890 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, | ||
891 | alloc_request_simple, free_request_simple, | ||
892 | q, gfp_mask, q->node); | ||
893 | } | ||
894 | if (!rl->rq_pool) | ||
895 | return -ENOMEM; | ||
896 | |||
897 | if (rl != &q->root_rl) | ||
898 | WARN_ON_ONCE(!blk_get_queue(q)); | ||
899 | |||
900 | return 0; | ||
901 | } | ||
902 | |||
903 | void blk_exit_rl(struct request_queue *q, struct request_list *rl) | ||
904 | { | ||
905 | if (rl->rq_pool) { | ||
906 | mempool_destroy(rl->rq_pool); | ||
907 | if (rl != &q->root_rl) | ||
908 | blk_put_queue(q); | ||
909 | } | ||
910 | } | ||
911 | |||
912 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) | 387 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) |
913 | { | 388 | { |
914 | return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL); | 389 | return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); |
915 | } | 390 | } |
916 | EXPORT_SYMBOL(blk_alloc_queue); | 391 | EXPORT_SYMBOL(blk_alloc_queue); |
917 | 392 | ||
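Because the queue now owns its spinlock, blk_alloc_queue_node() loses its third spinlock_t *lock argument and blk_alloc_queue() just forwards the gfp mask and NUMA_NO_NODE. A driver that used to pass its own lock now allocates a queue like this (error handling elided):

	struct request_queue *q;

	/* no 'lock' argument anymore; the queue initializes its own spinlock */
	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
	if (!q)
		return NULL;	/* allocation failed */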
@@ -991,17 +466,8 @@ static void blk_rq_timed_out_timer(struct timer_list *t) | |||
991 | * blk_alloc_queue_node - allocate a request queue | 466 | * blk_alloc_queue_node - allocate a request queue |
992 | * @gfp_mask: memory allocation flags | 467 | * @gfp_mask: memory allocation flags |
993 | * @node_id: NUMA node to allocate memory from | 468 | * @node_id: NUMA node to allocate memory from |
994 | * @lock: For legacy queues, pointer to a spinlock that will be used to e.g. | ||
995 | * serialize calls to the legacy .request_fn() callback. Ignored for | ||
996 | * blk-mq request queues. | ||
997 | * | ||
998 | * Note: pass the queue lock as the third argument to this function instead of | ||
999 | * setting the queue lock pointer explicitly to avoid triggering a sporadic | ||
1000 | * crash in the blkcg code. This function namely calls blkcg_init_queue() and | ||
1001 | * the queue lock pointer must be set before blkcg_init_queue() is called. | ||
1002 | */ | 469 | */ |
1003 | struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, | 470 | struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) |
1004 | spinlock_t *lock) | ||
1005 | { | 471 | { |
1006 | struct request_queue *q; | 472 | struct request_queue *q; |
1007 | int ret; | 473 | int ret; |
@@ -1013,8 +479,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, | |||
1013 | 479 | ||
1014 | INIT_LIST_HEAD(&q->queue_head); | 480 | INIT_LIST_HEAD(&q->queue_head); |
1015 | q->last_merge = NULL; | 481 | q->last_merge = NULL; |
1016 | q->end_sector = 0; | ||
1017 | q->boundary_rq = NULL; | ||
1018 | 482 | ||
1019 | q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); | 483 | q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); |
1020 | if (q->id < 0) | 484 | if (q->id < 0) |
@@ -1042,12 +506,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, | |||
1042 | laptop_mode_timer_fn, 0); | 506 | laptop_mode_timer_fn, 0); |
1043 | timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); | 507 | timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); |
1044 | INIT_WORK(&q->timeout_work, NULL); | 508 | INIT_WORK(&q->timeout_work, NULL); |
1045 | INIT_LIST_HEAD(&q->timeout_list); | ||
1046 | INIT_LIST_HEAD(&q->icq_list); | 509 | INIT_LIST_HEAD(&q->icq_list); |
1047 | #ifdef CONFIG_BLK_CGROUP | 510 | #ifdef CONFIG_BLK_CGROUP |
1048 | INIT_LIST_HEAD(&q->blkg_list); | 511 | INIT_LIST_HEAD(&q->blkg_list); |
1049 | #endif | 512 | #endif |
1050 | INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); | ||
1051 | 513 | ||
1052 | kobject_init(&q->kobj, &blk_queue_ktype); | 514 | kobject_init(&q->kobj, &blk_queue_ktype); |
1053 | 515 | ||
@@ -1055,18 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, | |||
1055 | mutex_init(&q->blk_trace_mutex); | 517 | mutex_init(&q->blk_trace_mutex); |
1056 | #endif | 518 | #endif |
1057 | mutex_init(&q->sysfs_lock); | 519 | mutex_init(&q->sysfs_lock); |
1058 | spin_lock_init(&q->__queue_lock); | 520 | spin_lock_init(&q->queue_lock); |
1059 | |||
1060 | q->queue_lock = lock ? : &q->__queue_lock; | ||
1061 | |||
1062 | /* | ||
1063 | * A queue starts its life with bypass turned on to avoid | ||
1064 | * unnecessary bypass on/off overhead and nasty surprises during | ||
1065 | * init. The initial bypass will be finished when the queue is | ||
1066 | * registered by blk_register_queue(). | ||
1067 | */ | ||
1068 | q->bypass_depth = 1; | ||
1069 | queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q); | ||
1070 | 521 | ||
1071 | init_waitqueue_head(&q->mq_freeze_wq); | 522 | init_waitqueue_head(&q->mq_freeze_wq); |
1072 | 523 | ||
@@ -1100,105 +551,6 @@ fail_q: | |||
1100 | } | 551 | } |
1101 | EXPORT_SYMBOL(blk_alloc_queue_node); | 552 | EXPORT_SYMBOL(blk_alloc_queue_node); |
1102 | 553 | ||
1103 | /** | ||
1104 | * blk_init_queue - prepare a request queue for use with a block device | ||
1105 | * @rfn: The function to be called to process requests that have been | ||
1106 | * placed on the queue. | ||
1107 | * @lock: Request queue spin lock | ||
1108 | * | ||
1109 | * Description: | ||
1110 | * If a block device wishes to use the standard request handling procedures, | ||
1111 | * which sorts requests and coalesces adjacent requests, then it must | ||
1112 | * call blk_init_queue(). The function @rfn will be called when there | ||
1113 | * are requests on the queue that need to be processed. If the device | ||
1114 | * supports plugging, then @rfn may not be called immediately when requests | ||
1115 | * are available on the queue, but may be called at some time later instead. | ||
1116 | * Plugged queues are generally unplugged when a buffer belonging to one | ||
1117 | * of the requests on the queue is needed, or due to memory pressure. | ||
1118 | * | ||
1119 | * @rfn is not required, or even expected, to remove all requests off the | ||
1120 | * queue, but only as many as it can handle at a time. If it does leave | ||
1121 | * requests on the queue, it is responsible for arranging that the requests | ||
1122 | * get dealt with eventually. | ||
1123 | * | ||
1124 | * The queue spin lock must be held while manipulating the requests on the | ||
1125 | * request queue; this lock will be taken also from interrupt context, so irq | ||
1126 | * disabling is needed for it. | ||
1127 | * | ||
1128 | * Function returns a pointer to the initialized request queue, or %NULL if | ||
1129 | * it didn't succeed. | ||
1130 | * | ||
1131 | * Note: | ||
1132 | * blk_init_queue() must be paired with a blk_cleanup_queue() call | ||
1133 | * when the block device is deactivated (such as at module unload). | ||
1134 | **/ | ||
1135 | |||
1136 | struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) | ||
1137 | { | ||
1138 | return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); | ||
1139 | } | ||
1140 | EXPORT_SYMBOL(blk_init_queue); | ||
1141 | |||
1142 | struct request_queue * | ||
1143 | blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) | ||
1144 | { | ||
1145 | struct request_queue *q; | ||
1146 | |||
1147 | q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock); | ||
1148 | if (!q) | ||
1149 | return NULL; | ||
1150 | |||
1151 | q->request_fn = rfn; | ||
1152 | if (blk_init_allocated_queue(q) < 0) { | ||
1153 | blk_cleanup_queue(q); | ||
1154 | return NULL; | ||
1155 | } | ||
1156 | |||
1157 | return q; | ||
1158 | } | ||
1159 | EXPORT_SYMBOL(blk_init_queue_node); | ||
1160 | |||
1161 | static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); | ||
1162 | |||
1163 | |||
1164 | int blk_init_allocated_queue(struct request_queue *q) | ||
1165 | { | ||
1166 | WARN_ON_ONCE(q->mq_ops); | ||
1167 | |||
1168 | q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL); | ||
1169 | if (!q->fq) | ||
1170 | return -ENOMEM; | ||
1171 | |||
1172 | if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL)) | ||
1173 | goto out_free_flush_queue; | ||
1174 | |||
1175 | if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) | ||
1176 | goto out_exit_flush_rq; | ||
1177 | |||
1178 | INIT_WORK(&q->timeout_work, blk_timeout_work); | ||
1179 | q->queue_flags |= QUEUE_FLAG_DEFAULT; | ||
1180 | |||
1181 | /* | ||
1182 | * This also sets hw/phys segments, boundary and size | ||
1183 | */ | ||
1184 | blk_queue_make_request(q, blk_queue_bio); | ||
1185 | |||
1186 | q->sg_reserved_size = INT_MAX; | ||
1187 | |||
1188 | if (elevator_init(q)) | ||
1189 | goto out_exit_flush_rq; | ||
1190 | return 0; | ||
1191 | |||
1192 | out_exit_flush_rq: | ||
1193 | if (q->exit_rq_fn) | ||
1194 | q->exit_rq_fn(q, q->fq->flush_rq); | ||
1195 | out_free_flush_queue: | ||
1196 | blk_free_flush_queue(q->fq); | ||
1197 | q->fq = NULL; | ||
1198 | return -ENOMEM; | ||
1199 | } | ||
1200 | EXPORT_SYMBOL(blk_init_allocated_queue); | ||
1201 | |||
1202 | bool blk_get_queue(struct request_queue *q) | 554 | bool blk_get_queue(struct request_queue *q) |
1203 | { | 555 | { |
1204 | if (likely(!blk_queue_dying(q))) { | 556 | if (likely(!blk_queue_dying(q))) { |
@@ -1210,406 +562,6 @@ bool blk_get_queue(struct request_queue *q) | |||
1210 | } | 562 | } |
1211 | EXPORT_SYMBOL(blk_get_queue); | 563 | EXPORT_SYMBOL(blk_get_queue); |
1212 | 564 | ||
1213 | static inline void blk_free_request(struct request_list *rl, struct request *rq) | ||
1214 | { | ||
1215 | if (rq->rq_flags & RQF_ELVPRIV) { | ||
1216 | elv_put_request(rl->q, rq); | ||
1217 | if (rq->elv.icq) | ||
1218 | put_io_context(rq->elv.icq->ioc); | ||
1219 | } | ||
1220 | |||
1221 | mempool_free(rq, rl->rq_pool); | ||
1222 | } | ||
1223 | |||
1224 | /* | ||
1225 | * ioc_batching returns true if the ioc is a valid batching request and | ||
1226 | * should be given priority access to a request. | ||
1227 | */ | ||
1228 | static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) | ||
1229 | { | ||
1230 | if (!ioc) | ||
1231 | return 0; | ||
1232 | |||
1233 | /* | ||
1234 | * Make sure the process is able to allocate at least 1 request | ||
1235 | * even if the batch times out, otherwise we could theoretically | ||
1236 | * lose wakeups. | ||
1237 | */ | ||
1238 | return ioc->nr_batch_requests == q->nr_batching || | ||
1239 | (ioc->nr_batch_requests > 0 | ||
1240 | && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); | ||
1241 | } | ||
1242 | |||
1243 | /* | ||
1244 | * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This | ||
1245 | * will cause the process to be a "batcher" on all queues in the system. This | ||
1246 | * is the behaviour we want though - once it gets a wakeup it should be given | ||
1247 | * a nice run. | ||
1248 | */ | ||
1249 | static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) | ||
1250 | { | ||
1251 | if (!ioc || ioc_batching(q, ioc)) | ||
1252 | return; | ||
1253 | |||
1254 | ioc->nr_batch_requests = q->nr_batching; | ||
1255 | ioc->last_waited = jiffies; | ||
1256 | } | ||
1257 | |||
1258 | static void __freed_request(struct request_list *rl, int sync) | ||
1259 | { | ||
1260 | struct request_queue *q = rl->q; | ||
1261 | |||
1262 | if (rl->count[sync] < queue_congestion_off_threshold(q)) | ||
1263 | blk_clear_congested(rl, sync); | ||
1264 | |||
1265 | if (rl->count[sync] + 1 <= q->nr_requests) { | ||
1266 | if (waitqueue_active(&rl->wait[sync])) | ||
1267 | wake_up(&rl->wait[sync]); | ||
1268 | |||
1269 | blk_clear_rl_full(rl, sync); | ||
1270 | } | ||
1271 | } | ||
1272 | |||
1273 | /* | ||
1274 | * A request has just been released. Account for it, update the full and | ||
1275 | * congestion status, wake up any waiters. Called under q->queue_lock. | ||
1276 | */ | ||
1277 | static void freed_request(struct request_list *rl, bool sync, | ||
1278 | req_flags_t rq_flags) | ||
1279 | { | ||
1280 | struct request_queue *q = rl->q; | ||
1281 | |||
1282 | q->nr_rqs[sync]--; | ||
1283 | rl->count[sync]--; | ||
1284 | if (rq_flags & RQF_ELVPRIV) | ||
1285 | q->nr_rqs_elvpriv--; | ||
1286 | |||
1287 | __freed_request(rl, sync); | ||
1288 | |||
1289 | if (unlikely(rl->starved[sync ^ 1])) | ||
1290 | __freed_request(rl, sync ^ 1); | ||
1291 | } | ||
1292 | |||
1293 | int blk_update_nr_requests(struct request_queue *q, unsigned int nr) | ||
1294 | { | ||
1295 | struct request_list *rl; | ||
1296 | int on_thresh, off_thresh; | ||
1297 | |||
1298 | WARN_ON_ONCE(q->mq_ops); | ||
1299 | |||
1300 | spin_lock_irq(q->queue_lock); | ||
1301 | q->nr_requests = nr; | ||
1302 | blk_queue_congestion_threshold(q); | ||
1303 | on_thresh = queue_congestion_on_threshold(q); | ||
1304 | off_thresh = queue_congestion_off_threshold(q); | ||
1305 | |||
1306 | blk_queue_for_each_rl(rl, q) { | ||
1307 | if (rl->count[BLK_RW_SYNC] >= on_thresh) | ||
1308 | blk_set_congested(rl, BLK_RW_SYNC); | ||
1309 | else if (rl->count[BLK_RW_SYNC] < off_thresh) | ||
1310 | blk_clear_congested(rl, BLK_RW_SYNC); | ||
1311 | |||
1312 | if (rl->count[BLK_RW_ASYNC] >= on_thresh) | ||
1313 | blk_set_congested(rl, BLK_RW_ASYNC); | ||
1314 | else if (rl->count[BLK_RW_ASYNC] < off_thresh) | ||
1315 | blk_clear_congested(rl, BLK_RW_ASYNC); | ||
1316 | |||
1317 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { | ||
1318 | blk_set_rl_full(rl, BLK_RW_SYNC); | ||
1319 | } else { | ||
1320 | blk_clear_rl_full(rl, BLK_RW_SYNC); | ||
1321 | wake_up(&rl->wait[BLK_RW_SYNC]); | ||
1322 | } | ||
1323 | |||
1324 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | ||
1325 | blk_set_rl_full(rl, BLK_RW_ASYNC); | ||
1326 | } else { | ||
1327 | blk_clear_rl_full(rl, BLK_RW_ASYNC); | ||
1328 | wake_up(&rl->wait[BLK_RW_ASYNC]); | ||
1329 | } | ||
1330 | } | ||
1331 | |||
1332 | spin_unlock_irq(q->queue_lock); | ||
1333 | return 0; | ||
1334 | } | ||
1335 | |||
1336 | /** | ||
1337 | * __get_request - get a free request | ||
1338 | * @rl: request list to allocate from | ||
1339 | * @op: operation and flags | ||
1340 | * @bio: bio to allocate request for (can be %NULL) | ||
1341 | * @flags: BLQ_MQ_REQ_* flags | ||
1342 | * @gfp_mask: allocator flags | ||
1343 | * | ||
1344 | * Get a free request from @q. This function may fail under memory | ||
1345 | * pressure or if @q is dead. | ||
1346 | * | ||
1347 | * Must be called with @q->queue_lock held and, | ||
1348 | * Returns ERR_PTR on failure, with @q->queue_lock held. | ||
1349 | * Returns request pointer on success, with @q->queue_lock *not held*. | ||
1350 | */ | ||
1351 | static struct request *__get_request(struct request_list *rl, unsigned int op, | ||
1352 | struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask) | ||
1353 | { | ||
1354 | struct request_queue *q = rl->q; | ||
1355 | struct request *rq; | ||
1356 | struct elevator_type *et = q->elevator->type; | ||
1357 | struct io_context *ioc = rq_ioc(bio); | ||
1358 | struct io_cq *icq = NULL; | ||
1359 | const bool is_sync = op_is_sync(op); | ||
1360 | int may_queue; | ||
1361 | req_flags_t rq_flags = RQF_ALLOCED; | ||
1362 | |||
1363 | lockdep_assert_held(q->queue_lock); | ||
1364 | |||
1365 | if (unlikely(blk_queue_dying(q))) | ||
1366 | return ERR_PTR(-ENODEV); | ||
1367 | |||
1368 | may_queue = elv_may_queue(q, op); | ||
1369 | if (may_queue == ELV_MQUEUE_NO) | ||
1370 | goto rq_starved; | ||
1371 | |||
1372 | if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { | ||
1373 | if (rl->count[is_sync]+1 >= q->nr_requests) { | ||
1374 | /* | ||
1375 | * The queue will fill after this allocation, so set | ||
1376 | * it as full, and mark this process as "batching". | ||
1377 | * This process will be allowed to complete a batch of | ||
1378 | * requests, others will be blocked. | ||
1379 | */ | ||
1380 | if (!blk_rl_full(rl, is_sync)) { | ||
1381 | ioc_set_batching(q, ioc); | ||
1382 | blk_set_rl_full(rl, is_sync); | ||
1383 | } else { | ||
1384 | if (may_queue != ELV_MQUEUE_MUST | ||
1385 | && !ioc_batching(q, ioc)) { | ||
1386 | /* | ||
1387 | * The queue is full and the allocating | ||
1388 | * process is not a "batcher", and not | ||
1389 | * exempted by the IO scheduler | ||
1390 | */ | ||
1391 | return ERR_PTR(-ENOMEM); | ||
1392 | } | ||
1393 | } | ||
1394 | } | ||
1395 | blk_set_congested(rl, is_sync); | ||
1396 | } | ||
1397 | |||
1398 | /* | ||
1399 | * Only allow batching queuers to allocate up to 50% over the defined | ||
1400 | * limit of requests, otherwise we could have thousands of requests | ||
1401 | * allocated with any setting of ->nr_requests | ||
1402 | */ | ||
1403 | if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) | ||
1404 | return ERR_PTR(-ENOMEM); | ||
1405 | |||
1406 | q->nr_rqs[is_sync]++; | ||
1407 | rl->count[is_sync]++; | ||
1408 | rl->starved[is_sync] = 0; | ||
1409 | |||
1410 | /* | ||
1411 | * Decide whether the new request will be managed by elevator. If | ||
1412 | * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will | ||
1413 | * prevent the current elevator from being destroyed until the new | ||
1414 | * request is freed. This guarantees icq's won't be destroyed and | ||
1415 | * makes creating new ones safe. | ||
1416 | * | ||
1417 | * Flush requests do not use the elevator so skip initialization. | ||
1418 | * This allows a request to share the flush and elevator data. | ||
1419 | * | ||
1420 | * Also, lookup icq while holding queue_lock. If it doesn't exist, | ||
1421 | * it will be created after releasing queue_lock. | ||
1422 | */ | ||
1423 | if (!op_is_flush(op) && !blk_queue_bypass(q)) { | ||
1424 | rq_flags |= RQF_ELVPRIV; | ||
1425 | q->nr_rqs_elvpriv++; | ||
1426 | if (et->icq_cache && ioc) | ||
1427 | icq = ioc_lookup_icq(ioc, q); | ||
1428 | } | ||
1429 | |||
1430 | if (blk_queue_io_stat(q)) | ||
1431 | rq_flags |= RQF_IO_STAT; | ||
1432 | spin_unlock_irq(q->queue_lock); | ||
1433 | |||
1434 | /* allocate and init request */ | ||
1435 | rq = mempool_alloc(rl->rq_pool, gfp_mask); | ||
1436 | if (!rq) | ||
1437 | goto fail_alloc; | ||
1438 | |||
1439 | blk_rq_init(q, rq); | ||
1440 | blk_rq_set_rl(rq, rl); | ||
1441 | rq->cmd_flags = op; | ||
1442 | rq->rq_flags = rq_flags; | ||
1443 | if (flags & BLK_MQ_REQ_PREEMPT) | ||
1444 | rq->rq_flags |= RQF_PREEMPT; | ||
1445 | |||
1446 | /* init elvpriv */ | ||
1447 | if (rq_flags & RQF_ELVPRIV) { | ||
1448 | if (unlikely(et->icq_cache && !icq)) { | ||
1449 | if (ioc) | ||
1450 | icq = ioc_create_icq(ioc, q, gfp_mask); | ||
1451 | if (!icq) | ||
1452 | goto fail_elvpriv; | ||
1453 | } | ||
1454 | |||
1455 | rq->elv.icq = icq; | ||
1456 | if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) | ||
1457 | goto fail_elvpriv; | ||
1458 | |||
1459 | /* @rq->elv.icq holds io_context until @rq is freed */ | ||
1460 | if (icq) | ||
1461 | get_io_context(icq->ioc); | ||
1462 | } | ||
1463 | out: | ||
1464 | /* | ||
1465 | * ioc may be NULL here, and ioc_batching will be false. That's | ||
1466 | * OK, if the queue is under the request limit then requests need | ||
1467 | * not count toward the nr_batch_requests limit. There will always | ||
1468 | * be some limit enforced by BLK_BATCH_TIME. | ||
1469 | */ | ||
1470 | if (ioc_batching(q, ioc)) | ||
1471 | ioc->nr_batch_requests--; | ||
1472 | |||
1473 | trace_block_getrq(q, bio, op); | ||
1474 | return rq; | ||
1475 | |||
1476 | fail_elvpriv: | ||
1477 | /* | ||
1478 | * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed | ||
1479 | * and may fail indefinitely under memory pressure and thus | ||
1480 | * shouldn't stall IO. Treat this request as !elvpriv. This will | ||
1481 | * disturb iosched and blkcg but weird is better than dead. | ||
1482 | */ | ||
1483 | printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", | ||
1484 | __func__, dev_name(q->backing_dev_info->dev)); | ||
1485 | |||
1486 | rq->rq_flags &= ~RQF_ELVPRIV; | ||
1487 | rq->elv.icq = NULL; | ||
1488 | |||
1489 | spin_lock_irq(q->queue_lock); | ||
1490 | q->nr_rqs_elvpriv--; | ||
1491 | spin_unlock_irq(q->queue_lock); | ||
1492 | goto out; | ||
1493 | |||
1494 | fail_alloc: | ||
1495 | /* | ||
1496 | * Allocation failed presumably due to memory. Undo anything we | ||
1497 | * might have messed up. | ||
1498 | * | ||
1499 | * Allocating task should really be put onto the front of the wait | ||
1500 | * queue, but this is pretty rare. | ||
1501 | */ | ||
1502 | spin_lock_irq(q->queue_lock); | ||
1503 | freed_request(rl, is_sync, rq_flags); | ||
1504 | |||
1505 | /* | ||
1506 | * in the very unlikely event that allocation failed and no | ||
1507 | * requests for this direction were pending, mark us starved so that | ||
1508 | * freeing of a request in the other direction will notice | ||
1509 | * us. Another possible fix would be to split the rq mempool into | ||
1510 | * READ and WRITE | ||
1511 | */ | ||
1512 | rq_starved: | ||
1513 | if (unlikely(rl->count[is_sync] == 0)) | ||
1514 | rl->starved[is_sync] = 1; | ||
1515 | return ERR_PTR(-ENOMEM); | ||
1516 | } | ||
1517 | |||
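The admission policy in __get_request() boils down to three rules: the allocation that fills the queue turns its caller into a "batcher", later callers are refused while the list is marked full unless they are batching or the elevator answered ELV_MQUEUE_MUST, and nobody may exceed 150% of nr_requests. A rough stand-alone model of that decision (names are illustrative, not the kernel's):

#include <stdbool.h>

enum admit_result { ADMIT, ADMIT_AS_BATCHER, REJECT };

static enum admit_result admit_request(int count, int nr_requests,
				       bool queue_marked_full,
				       bool is_batcher, bool must_queue)
{
	/* hard cap: never go more than 50% over nr_requests */
	if (count >= 3 * nr_requests / 2)
		return REJECT;

	if (count + 1 >= nr_requests) {
		/* first caller to fill the queue becomes the batcher */
		if (!queue_marked_full)
			return ADMIT_AS_BATCHER;
		/* queue already full: only batchers or ELV_MQUEUE_MUST get in */
		if (!must_queue && !is_batcher)
			return REJECT;
	}
	return ADMIT;
}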
1518 | /** | ||
1519 | * get_request - get a free request | ||
1520 | * @q: request_queue to allocate request from | ||
1521 | * @op: operation and flags | ||
1522 | * @bio: bio to allocate request for (can be %NULL) | ||
1523 | * @flags: BLK_MQ_REQ_* flags. | ||
1524 | * @gfp: allocator flags | ||
1525 | * | ||
1526 | * Get a free request from @q. Unless %BLK_MQ_REQ_NOWAIT is set in @flags, | ||
1527 | * this function keeps retrying under memory pressure and fails iff @q is dead. | ||
1528 | * | ||
1529 | * Must be called with @q->queue_lock held. | ||
1530 | * Returns ERR_PTR on failure, with @q->queue_lock held. | ||
1531 | * Returns request pointer on success, with @q->queue_lock *not held*. | ||
1532 | */ | ||
1533 | static struct request *get_request(struct request_queue *q, unsigned int op, | ||
1534 | struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp) | ||
1535 | { | ||
1536 | const bool is_sync = op_is_sync(op); | ||
1537 | DEFINE_WAIT(wait); | ||
1538 | struct request_list *rl; | ||
1539 | struct request *rq; | ||
1540 | |||
1541 | lockdep_assert_held(q->queue_lock); | ||
1542 | WARN_ON_ONCE(q->mq_ops); | ||
1543 | |||
1544 | rl = blk_get_rl(q, bio); /* transferred to @rq on success */ | ||
1545 | retry: | ||
1546 | rq = __get_request(rl, op, bio, flags, gfp); | ||
1547 | if (!IS_ERR(rq)) | ||
1548 | return rq; | ||
1549 | |||
1550 | if (op & REQ_NOWAIT) { | ||
1551 | blk_put_rl(rl); | ||
1552 | return ERR_PTR(-EAGAIN); | ||
1553 | } | ||
1554 | |||
1555 | if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { | ||
1556 | blk_put_rl(rl); | ||
1557 | return rq; | ||
1558 | } | ||
1559 | |||
1560 | /* wait on @rl and retry */ | ||
1561 | prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, | ||
1562 | TASK_UNINTERRUPTIBLE); | ||
1563 | |||
1564 | trace_block_sleeprq(q, bio, op); | ||
1565 | |||
1566 | spin_unlock_irq(q->queue_lock); | ||
1567 | io_schedule(); | ||
1568 | |||
1569 | /* | ||
1570 | * After sleeping, we become a "batching" process and will be able | ||
1571 | * to allocate at least one request, and up to a big batch of them | ||
1572 | * for a small period of time. See ioc_batching, ioc_set_batching | ||
1573 | */ | ||
1574 | ioc_set_batching(q, current->io_context); | ||
1575 | |||
1576 | spin_lock_irq(q->queue_lock); | ||
1577 | finish_wait(&rl->wait[is_sync], &wait); | ||
1578 | |||
1579 | goto retry; | ||
1580 | } | ||
1581 | |||
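get_request() wraps that allocation in a classic sleep-and-retry loop: NOWAIT callers fail immediately, everyone else parks on the request list's wait queue, sleeps in io_schedule(), is promoted to a batcher on wake-up, and tries again. The control flow, reduced to a stand-alone sketch with stub helpers:

#include <stdbool.h>
#include <stddef.h>

struct request { int id; };

/* Trivial stand-ins for __get_request(), the wait queue and io_schedule(). */
static struct request *try_alloc(void) { static struct request r; return &r; }
static void wait_for_free_request(void) { /* prepare_to_wait + io_schedule */ }
static void become_batcher(void) { /* ioc_set_batching() after waking up */ }

static struct request *get_request_like(bool nowait)
{
	for (;;) {
		struct request *rq = try_alloc();

		if (rq)
			return rq;
		if (nowait)
			return NULL;	/* NOWAIT callers fail instead of sleeping */

		wait_for_free_request();
		become_batcher();
	}
}

int main(void)
{
	return get_request_like(true) ? 0 : 1;
}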
1582 | /* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ | ||
1583 | static struct request *blk_old_get_request(struct request_queue *q, | ||
1584 | unsigned int op, blk_mq_req_flags_t flags) | ||
1585 | { | ||
1586 | struct request *rq; | ||
1587 | gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO; | ||
1588 | int ret = 0; | ||
1589 | |||
1590 | WARN_ON_ONCE(q->mq_ops); | ||
1591 | |||
1592 | /* create ioc upfront */ | ||
1593 | create_io_context(gfp_mask, q->node); | ||
1594 | |||
1595 | ret = blk_queue_enter(q, flags); | ||
1596 | if (ret) | ||
1597 | return ERR_PTR(ret); | ||
1598 | spin_lock_irq(q->queue_lock); | ||
1599 | rq = get_request(q, op, NULL, flags, gfp_mask); | ||
1600 | if (IS_ERR(rq)) { | ||
1601 | spin_unlock_irq(q->queue_lock); | ||
1602 | blk_queue_exit(q); | ||
1603 | return rq; | ||
1604 | } | ||
1605 | |||
1606 | /* q->queue_lock is unlocked at this point */ | ||
1607 | rq->__data_len = 0; | ||
1608 | rq->__sector = (sector_t) -1; | ||
1609 | rq->bio = rq->biotail = NULL; | ||
1610 | return rq; | ||
1611 | } | ||
1612 | |||
1613 | /** | 565 | /** |
1614 | * blk_get_request - allocate a request | 566 | * blk_get_request - allocate a request |
1615 | * @q: request queue to allocate a request for | 567 | * @q: request queue to allocate a request for |
@@ -1624,170 +576,17 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, | |||
1624 | WARN_ON_ONCE(op & REQ_NOWAIT); | 576 | WARN_ON_ONCE(op & REQ_NOWAIT); |
1625 | WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); | 577 | WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); |
1626 | 578 | ||
1627 | if (q->mq_ops) { | 579 | req = blk_mq_alloc_request(q, op, flags); |
1628 | req = blk_mq_alloc_request(q, op, flags); | 580 | if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) |
1629 | if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) | 581 | q->mq_ops->initialize_rq_fn(req); |
1630 | q->mq_ops->initialize_rq_fn(req); | ||
1631 | } else { | ||
1632 | req = blk_old_get_request(q, op, flags); | ||
1633 | if (!IS_ERR(req) && q->initialize_rq_fn) | ||
1634 | q->initialize_rq_fn(req); | ||
1635 | } | ||
1636 | 582 | ||
1637 | return req; | 583 | return req; |
1638 | } | 584 | } |
1639 | EXPORT_SYMBOL(blk_get_request); | 585 | EXPORT_SYMBOL(blk_get_request); |
1640 | 586 | ||
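With the legacy branch gone, blk_get_request() is a thin wrapper around blk_mq_alloc_request(). A hedged driver-side sketch of allocating and releasing a passthrough request with the signatures shown above (error handling trimmed, not taken from any real driver):

	struct request *rq;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);	/* may sleep */
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* ... fill in the driver-private payload and issue it ... */

	blk_put_request(rq);	/* now always ends up in blk_mq_free_request() */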
1641 | /** | ||
1642 | * blk_requeue_request - put a request back on queue | ||
1643 | * @q: request queue where request should be inserted | ||
1644 | * @rq: request to be inserted | ||
1645 | * | ||
1646 | * Description: | ||
1647 | * Drivers often keep queueing requests until the hardware cannot accept | ||
1648 | * more. When that condition happens, we need to put the request back | ||
1649 | * on the queue. Must be called with queue lock held. | ||
1650 | */ | ||
1651 | void blk_requeue_request(struct request_queue *q, struct request *rq) | ||
1652 | { | ||
1653 | lockdep_assert_held(q->queue_lock); | ||
1654 | WARN_ON_ONCE(q->mq_ops); | ||
1655 | |||
1656 | blk_delete_timer(rq); | ||
1657 | blk_clear_rq_complete(rq); | ||
1658 | trace_block_rq_requeue(q, rq); | ||
1659 | rq_qos_requeue(q, rq); | ||
1660 | |||
1661 | if (rq->rq_flags & RQF_QUEUED) | ||
1662 | blk_queue_end_tag(q, rq); | ||
1663 | |||
1664 | BUG_ON(blk_queued_rq(rq)); | ||
1665 | |||
1666 | elv_requeue_request(q, rq); | ||
1667 | } | ||
1668 | EXPORT_SYMBOL(blk_requeue_request); | ||
1669 | |||
1670 | static void add_acct_request(struct request_queue *q, struct request *rq, | ||
1671 | int where) | ||
1672 | { | ||
1673 | blk_account_io_start(rq, true); | ||
1674 | __elv_add_request(q, rq, where); | ||
1675 | } | ||
1676 | |||
1677 | static void part_round_stats_single(struct request_queue *q, int cpu, | ||
1678 | struct hd_struct *part, unsigned long now, | ||
1679 | unsigned int inflight) | ||
1680 | { | ||
1681 | if (inflight) { | ||
1682 | __part_stat_add(cpu, part, time_in_queue, | ||
1683 | inflight * (now - part->stamp)); | ||
1684 | __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); | ||
1685 | } | ||
1686 | part->stamp = now; | ||
1687 | } | ||
1688 | |||
1689 | /** | ||
1690 | * part_round_stats() - Round off the performance stats on a struct disk_stats. | ||
1691 | * @q: target block queue | ||
1692 | * @cpu: cpu number for stats access | ||
1693 | * @part: target partition | ||
1694 | * | ||
1695 | * The average IO queue length and utilisation statistics are maintained | ||
1696 | * by observing the current state of the queue length and the amount of | ||
1697 | * time it has been in this state for. | ||
1698 | * | ||
1699 | * Normally, that accounting is done on IO completion, but that can result | ||
1700 | * in more than a second's worth of IO being accounted for within any one | ||
1701 | * second, leading to >100% utilisation. To deal with that, we call this | ||
1702 | * function to do a round-off before returning the results when reading | ||
1703 | * /proc/diskstats. This accounts immediately for all queue usage up to | ||
1704 | * the current jiffies and restarts the counters again. | ||
1705 | */ | ||
1706 | void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) | ||
1707 | { | ||
1708 | struct hd_struct *part2 = NULL; | ||
1709 | unsigned long now = jiffies; | ||
1710 | unsigned int inflight[2]; | ||
1711 | int stats = 0; | ||
1712 | |||
1713 | if (part->stamp != now) | ||
1714 | stats |= 1; | ||
1715 | |||
1716 | if (part->partno) { | ||
1717 | part2 = &part_to_disk(part)->part0; | ||
1718 | if (part2->stamp != now) | ||
1719 | stats |= 2; | ||
1720 | } | ||
1721 | |||
1722 | if (!stats) | ||
1723 | return; | ||
1724 | |||
1725 | part_in_flight(q, part, inflight); | ||
1726 | |||
1727 | if (stats & 2) | ||
1728 | part_round_stats_single(q, cpu, part2, now, inflight[1]); | ||
1729 | if (stats & 1) | ||
1730 | part_round_stats_single(q, cpu, part, now, inflight[0]); | ||
1731 | } | ||
1732 | EXPORT_SYMBOL_GPL(part_round_stats); | ||
1733 | |||
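part_round_stats_single() charges time_in_queue as inflight * (now - stamp) and io_ticks as (now - stamp) whenever at least one request is in flight, then resets the stamp. The same arithmetic as a small stand-alone program (jiffies replaced by a plain counter):

#include <stdio.h>

struct disk_stats_model {
	unsigned long time_in_queue;	/* sum over time of the in-flight count */
	unsigned long io_ticks;		/* time with at least one request in flight */
	unsigned long stamp;		/* last time the stats were rounded off */
};

static void round_stats(struct disk_stats_model *s, unsigned long now,
			unsigned int inflight)
{
	if (inflight) {
		s->time_in_queue += inflight * (now - s->stamp);
		s->io_ticks += now - s->stamp;
	}
	s->stamp = now;
}

int main(void)
{
	struct disk_stats_model s = { .stamp = 100 };

	round_stats(&s, 110, 4);	/* 4 requests in flight for 10 ticks */
	round_stats(&s, 115, 0);	/* idle period: only the stamp moves */
	printf("time_in_queue=%lu io_ticks=%lu\n", s.time_in_queue, s.io_ticks);
	return 0;			/* prints time_in_queue=40 io_ticks=10 */
}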
1734 | void __blk_put_request(struct request_queue *q, struct request *req) | ||
1735 | { | ||
1736 | req_flags_t rq_flags = req->rq_flags; | ||
1737 | |||
1738 | if (unlikely(!q)) | ||
1739 | return; | ||
1740 | |||
1741 | if (q->mq_ops) { | ||
1742 | blk_mq_free_request(req); | ||
1743 | return; | ||
1744 | } | ||
1745 | |||
1746 | lockdep_assert_held(q->queue_lock); | ||
1747 | |||
1748 | blk_req_zone_write_unlock(req); | ||
1749 | blk_pm_put_request(req); | ||
1750 | blk_pm_mark_last_busy(req); | ||
1751 | |||
1752 | elv_completed_request(q, req); | ||
1753 | |||
1754 | /* this is a bio leak */ | ||
1755 | WARN_ON(req->bio != NULL); | ||
1756 | |||
1757 | rq_qos_done(q, req); | ||
1758 | |||
1759 | /* | ||
1760 | * Request may not have originated from ll_rw_blk. If not, | ||
1761 | * it didn't come out of our reserved rq pools | ||
1762 | */ | ||
1763 | if (rq_flags & RQF_ALLOCED) { | ||
1764 | struct request_list *rl = blk_rq_rl(req); | ||
1765 | bool sync = op_is_sync(req->cmd_flags); | ||
1766 | |||
1767 | BUG_ON(!list_empty(&req->queuelist)); | ||
1768 | BUG_ON(ELV_ON_HASH(req)); | ||
1769 | |||
1770 | blk_free_request(rl, req); | ||
1771 | freed_request(rl, sync, rq_flags); | ||
1772 | blk_put_rl(rl); | ||
1773 | blk_queue_exit(q); | ||
1774 | } | ||
1775 | } | ||
1776 | EXPORT_SYMBOL_GPL(__blk_put_request); | ||
1777 | |||
1778 | void blk_put_request(struct request *req) | 587 | void blk_put_request(struct request *req) |
1779 | { | 588 | { |
1780 | struct request_queue *q = req->q; | 589 | blk_mq_free_request(req); |
1781 | |||
1782 | if (q->mq_ops) | ||
1783 | blk_mq_free_request(req); | ||
1784 | else { | ||
1785 | unsigned long flags; | ||
1786 | |||
1787 | spin_lock_irqsave(q->queue_lock, flags); | ||
1788 | __blk_put_request(q, req); | ||
1789 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
1790 | } | ||
1791 | } | 590 | } |
1792 | EXPORT_SYMBOL(blk_put_request); | 591 | EXPORT_SYMBOL(blk_put_request); |
1793 | 592 | ||
@@ -1807,7 +606,6 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | |||
1807 | req->biotail->bi_next = bio; | 606 | req->biotail->bi_next = bio; |
1808 | req->biotail = bio; | 607 | req->biotail = bio; |
1809 | req->__data_len += bio->bi_iter.bi_size; | 608 | req->__data_len += bio->bi_iter.bi_size; |
1810 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1811 | 609 | ||
1812 | blk_account_io_start(req, false); | 610 | blk_account_io_start(req, false); |
1813 | return true; | 611 | return true; |
@@ -1831,7 +629,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, | |||
1831 | 629 | ||
1832 | req->__sector = bio->bi_iter.bi_sector; | 630 | req->__sector = bio->bi_iter.bi_sector; |
1833 | req->__data_len += bio->bi_iter.bi_size; | 631 | req->__data_len += bio->bi_iter.bi_size; |
1834 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1835 | 632 | ||
1836 | blk_account_io_start(req, false); | 633 | blk_account_io_start(req, false); |
1837 | return true; | 634 | return true; |
@@ -1851,7 +648,6 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, | |||
1851 | req->biotail->bi_next = bio; | 648 | req->biotail->bi_next = bio; |
1852 | req->biotail = bio; | 649 | req->biotail = bio; |
1853 | req->__data_len += bio->bi_iter.bi_size; | 650 | req->__data_len += bio->bi_iter.bi_size; |
1854 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1855 | req->nr_phys_segments = segments + 1; | 651 | req->nr_phys_segments = segments + 1; |
1856 | 652 | ||
1857 | blk_account_io_start(req, false); | 653 | blk_account_io_start(req, false); |
@@ -1884,7 +680,6 @@ no_merge: | |||
1884 | * Caller must ensure !blk_queue_nomerges(q) beforehand. | 680 | * Caller must ensure !blk_queue_nomerges(q) beforehand. |
1885 | */ | 681 | */ |
1886 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | 682 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, |
1887 | unsigned int *request_count, | ||
1888 | struct request **same_queue_rq) | 683 | struct request **same_queue_rq) |
1889 | { | 684 | { |
1890 | struct blk_plug *plug; | 685 | struct blk_plug *plug; |
@@ -1894,25 +689,19 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | |||
1894 | plug = current->plug; | 689 | plug = current->plug; |
1895 | if (!plug) | 690 | if (!plug) |
1896 | return false; | 691 | return false; |
1897 | *request_count = 0; | ||
1898 | 692 | ||
1899 | if (q->mq_ops) | 693 | plug_list = &plug->mq_list; |
1900 | plug_list = &plug->mq_list; | ||
1901 | else | ||
1902 | plug_list = &plug->list; | ||
1903 | 694 | ||
1904 | list_for_each_entry_reverse(rq, plug_list, queuelist) { | 695 | list_for_each_entry_reverse(rq, plug_list, queuelist) { |
1905 | bool merged = false; | 696 | bool merged = false; |
1906 | 697 | ||
1907 | if (rq->q == q) { | 698 | if (rq->q == q && same_queue_rq) { |
1908 | (*request_count)++; | ||
1909 | /* | 699 | /* |
1910 | * Only blk-mq multiple hardware queues case checks the | 700 | * Only blk-mq multiple hardware queues case checks the |
1911 | * rq in the same queue, there should be only one such | 701 | * rq in the same queue, there should be only one such |
1912 | * rq in a queue | 702 | * rq in a queue |
1913 | **/ | 703 | **/ |
1914 | if (same_queue_rq) | 704 | *same_queue_rq = rq; |
1915 | *same_queue_rq = rq; | ||
1916 | } | 705 | } |
1917 | 706 | ||
1918 | if (rq->q != q || !blk_rq_merge_ok(rq, bio)) | 707 | if (rq->q != q || !blk_rq_merge_ok(rq, bio)) |
@@ -1939,176 +728,18 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | |||
1939 | return false; | 728 | return false; |
1940 | } | 729 | } |
1941 | 730 | ||
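blk_attempt_plug_merge() walks the plugged requests newest-first and, for blk-mq, remembers the one request that belongs to the same queue. The reverse scan on its own, modelled with toy types (the bio-merge attempts are left out):

#include <stddef.h>

struct toy_rq {
	int queue_id;
	struct toy_rq *prev;	/* newest request links back to older ones */
};

/* Walk from the newest plugged request towards the oldest. */
static struct toy_rq *find_same_queue_rq(struct toy_rq *newest, int queue_id)
{
	struct toy_rq *rq;

	for (rq = newest; rq; rq = rq->prev)
		if (rq->queue_id == queue_id)
			return rq;
	return NULL;
}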
1942 | unsigned int blk_plug_queued_count(struct request_queue *q) | ||
1943 | { | ||
1944 | struct blk_plug *plug; | ||
1945 | struct request *rq; | ||
1946 | struct list_head *plug_list; | ||
1947 | unsigned int ret = 0; | ||
1948 | |||
1949 | plug = current->plug; | ||
1950 | if (!plug) | ||
1951 | goto out; | ||
1952 | |||
1953 | if (q->mq_ops) | ||
1954 | plug_list = &plug->mq_list; | ||
1955 | else | ||
1956 | plug_list = &plug->list; | ||
1957 | |||
1958 | list_for_each_entry(rq, plug_list, queuelist) { | ||
1959 | if (rq->q == q) | ||
1960 | ret++; | ||
1961 | } | ||
1962 | out: | ||
1963 | return ret; | ||
1964 | } | ||
1965 | |||
1966 | void blk_init_request_from_bio(struct request *req, struct bio *bio) | 731 | void blk_init_request_from_bio(struct request *req, struct bio *bio) |
1967 | { | 732 | { |
1968 | struct io_context *ioc = rq_ioc(bio); | ||
1969 | |||
1970 | if (bio->bi_opf & REQ_RAHEAD) | 733 | if (bio->bi_opf & REQ_RAHEAD) |
1971 | req->cmd_flags |= REQ_FAILFAST_MASK; | 734 | req->cmd_flags |= REQ_FAILFAST_MASK; |
1972 | 735 | ||
1973 | req->__sector = bio->bi_iter.bi_sector; | 736 | req->__sector = bio->bi_iter.bi_sector; |
1974 | if (ioprio_valid(bio_prio(bio))) | 737 | req->ioprio = bio_prio(bio); |
1975 | req->ioprio = bio_prio(bio); | ||
1976 | else if (ioc) | ||
1977 | req->ioprio = ioc->ioprio; | ||
1978 | else | ||
1979 | req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); | ||
1980 | req->write_hint = bio->bi_write_hint; | 738 | req->write_hint = bio->bi_write_hint; |
1981 | blk_rq_bio_prep(req->q, req, bio); | 739 | blk_rq_bio_prep(req->q, req, bio); |
1982 | } | 740 | } |
1983 | EXPORT_SYMBOL_GPL(blk_init_request_from_bio); | 741 | EXPORT_SYMBOL_GPL(blk_init_request_from_bio); |
1984 | 742 | ||
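The removed lines show what changes for I/O priority: the legacy path fell back from the bio's priority to the io_context's and finally to a "none" value, while the surviving code simply copies bio_prio(bio). The old fallback chain as a compact model (illustrative constants, not the kernel's IOPRIO macros):

#define TOY_IOPRIO_NONE	0	/* stands in for IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0) */

/* bio_prio == 0 means "no priority set"; anything else is a valid priority. */
static int legacy_pick_ioprio(int bio_prio, int ioc_prio, int have_ioc)
{
	if (bio_prio)		/* ioprio_valid(bio_prio(bio)) */
		return bio_prio;
	if (have_ioc)		/* fall back to the submitting io_context */
		return ioc_prio;
	return TOY_IOPRIO_NONE;
}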
1985 | static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) | ||
1986 | { | ||
1987 | struct blk_plug *plug; | ||
1988 | int where = ELEVATOR_INSERT_SORT; | ||
1989 | struct request *req, *free; | ||
1990 | unsigned int request_count = 0; | ||
1991 | |||
1992 | /* | ||
1993 | * low level driver can indicate that it wants pages above a | ||
1994 | * certain limit bounced to low memory (ie for highmem, or even | ||
1995 | * ISA dma in theory) | ||
1996 | */ | ||
1997 | blk_queue_bounce(q, &bio); | ||
1998 | |||
1999 | blk_queue_split(q, &bio); | ||
2000 | |||
2001 | if (!bio_integrity_prep(bio)) | ||
2002 | return BLK_QC_T_NONE; | ||
2003 | |||
2004 | if (op_is_flush(bio->bi_opf)) { | ||
2005 | spin_lock_irq(q->queue_lock); | ||
2006 | where = ELEVATOR_INSERT_FLUSH; | ||
2007 | goto get_rq; | ||
2008 | } | ||
2009 | |||
2010 | /* | ||
2011 | * Check if we can merge with the plugged list before grabbing | ||
2012 | * any locks. | ||
2013 | */ | ||
2014 | if (!blk_queue_nomerges(q)) { | ||
2015 | if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) | ||
2016 | return BLK_QC_T_NONE; | ||
2017 | } else | ||
2018 | request_count = blk_plug_queued_count(q); | ||
2019 | |||
2020 | spin_lock_irq(q->queue_lock); | ||
2021 | |||
2022 | switch (elv_merge(q, &req, bio)) { | ||
2023 | case ELEVATOR_BACK_MERGE: | ||
2024 | if (!bio_attempt_back_merge(q, req, bio)) | ||
2025 | break; | ||
2026 | elv_bio_merged(q, req, bio); | ||
2027 | free = attempt_back_merge(q, req); | ||
2028 | if (free) | ||
2029 | __blk_put_request(q, free); | ||
2030 | else | ||
2031 | elv_merged_request(q, req, ELEVATOR_BACK_MERGE); | ||
2032 | goto out_unlock; | ||
2033 | case ELEVATOR_FRONT_MERGE: | ||
2034 | if (!bio_attempt_front_merge(q, req, bio)) | ||
2035 | break; | ||
2036 | elv_bio_merged(q, req, bio); | ||
2037 | free = attempt_front_merge(q, req); | ||
2038 | if (free) | ||
2039 | __blk_put_request(q, free); | ||
2040 | else | ||
2041 | elv_merged_request(q, req, ELEVATOR_FRONT_MERGE); | ||
2042 | goto out_unlock; | ||
2043 | default: | ||
2044 | break; | ||
2045 | } | ||
2046 | |||
2047 | get_rq: | ||
2048 | rq_qos_throttle(q, bio, q->queue_lock); | ||
2049 | |||
2050 | /* | ||
2051 | * Grab a free request. This might sleep but cannot fail. | ||
2052 | * Returns with the queue unlocked. | ||
2053 | */ | ||
2054 | blk_queue_enter_live(q); | ||
2055 | req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); | ||
2056 | if (IS_ERR(req)) { | ||
2057 | blk_queue_exit(q); | ||
2058 | rq_qos_cleanup(q, bio); | ||
2059 | if (PTR_ERR(req) == -ENOMEM) | ||
2060 | bio->bi_status = BLK_STS_RESOURCE; | ||
2061 | else | ||
2062 | bio->bi_status = BLK_STS_IOERR; | ||
2063 | bio_endio(bio); | ||
2064 | goto out_unlock; | ||
2065 | } | ||
2066 | |||
2067 | rq_qos_track(q, req, bio); | ||
2068 | |||
2069 | /* | ||
2070 | * After dropping the lock and possibly sleeping here, our request | ||
2071 | * may now be mergeable after it had proven unmergeable (above). | ||
2072 | * We don't worry about that case for efficiency. It won't happen | ||
2073 | * often, and the elevators are able to handle it. | ||
2074 | */ | ||
2075 | blk_init_request_from_bio(req, bio); | ||
2076 | |||
2077 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) | ||
2078 | req->cpu = raw_smp_processor_id(); | ||
2079 | |||
2080 | plug = current->plug; | ||
2081 | if (plug) { | ||
2082 | /* | ||
2083 | * If this is the first request added after a plug, fire | ||
2084 | * of a plug trace. | ||
2085 | * | ||
2086 | * @request_count may become stale because of schedule | ||
2087 | * out, so check plug list again. | ||
2088 | */ | ||
2089 | if (!request_count || list_empty(&plug->list)) | ||
2090 | trace_block_plug(q); | ||
2091 | else { | ||
2092 | struct request *last = list_entry_rq(plug->list.prev); | ||
2093 | if (request_count >= BLK_MAX_REQUEST_COUNT || | ||
2094 | blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { | ||
2095 | blk_flush_plug_list(plug, false); | ||
2096 | trace_block_plug(q); | ||
2097 | } | ||
2098 | } | ||
2099 | list_add_tail(&req->queuelist, &plug->list); | ||
2100 | blk_account_io_start(req, true); | ||
2101 | } else { | ||
2102 | spin_lock_irq(q->queue_lock); | ||
2103 | add_acct_request(q, req, where); | ||
2104 | __blk_run_queue(q); | ||
2105 | out_unlock: | ||
2106 | spin_unlock_irq(q->queue_lock); | ||
2107 | } | ||
2108 | |||
2109 | return BLK_QC_T_NONE; | ||
2110 | } | ||
2111 | |||
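The deleted submission path also documents the plugging heuristic: requests pile up on the plug list until either BLK_MAX_REQUEST_COUNT requests are queued or the most recent request is at least BLK_PLUG_FLUSH_SIZE bytes, at which point the plug is flushed. As a stand-alone predicate (the two constants below are illustrative, not the kernel's values):

#include <stdbool.h>

#define TOY_MAX_REQUEST_COUNT	16		/* cf. BLK_MAX_REQUEST_COUNT */
#define TOY_PLUG_FLUSH_SIZE	(128 * 1024)	/* cf. BLK_PLUG_FLUSH_SIZE */

/* Should the plug be flushed before adding one more request? */
static bool should_flush_plug(unsigned int plugged_count,
			      unsigned int last_rq_bytes)
{
	return plugged_count >= TOY_MAX_REQUEST_COUNT ||
	       last_rq_bytes >= TOY_PLUG_FLUSH_SIZE;
}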
2112 | static void handle_bad_sector(struct bio *bio, sector_t maxsector) | 743 | static void handle_bad_sector(struct bio *bio, sector_t maxsector) |
2113 | { | 744 | { |
2114 | char b[BDEVNAME_SIZE]; | 745 | char b[BDEVNAME_SIZE]; |
@@ -2260,7 +891,7 @@ generic_make_request_checks(struct bio *bio) | |||
2260 | * For a REQ_NOWAIT based request, return -EOPNOTSUPP | 891 | * For a REQ_NOWAIT based request, return -EOPNOTSUPP |
2261 | * if queue is not a request based queue. | 892 | * if queue is not a request based queue. |
2262 | */ | 893 | */ |
2263 | if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) | 894 | if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) |
2264 | goto not_supported; | 895 | goto not_supported; |
2265 | 896 | ||
2266 | if (should_fail_bio(bio)) | 897 | if (should_fail_bio(bio)) |
@@ -2290,6 +921,9 @@ generic_make_request_checks(struct bio *bio) | |||
2290 | } | 921 | } |
2291 | } | 922 | } |
2292 | 923 | ||
924 | if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) | ||
925 | bio->bi_opf &= ~REQ_HIPRI; | ||
926 | |||
2293 | switch (bio_op(bio)) { | 927 | switch (bio_op(bio)) { |
2294 | case REQ_OP_DISCARD: | 928 | case REQ_OP_DISCARD: |
2295 | if (!blk_queue_discard(q)) | 929 | if (!blk_queue_discard(q)) |
@@ -2562,17 +1196,6 @@ blk_qc_t submit_bio(struct bio *bio) | |||
2562 | } | 1196 | } |
2563 | EXPORT_SYMBOL(submit_bio); | 1197 | EXPORT_SYMBOL(submit_bio); |
2564 | 1198 | ||
2565 | bool blk_poll(struct request_queue *q, blk_qc_t cookie) | ||
2566 | { | ||
2567 | if (!q->poll_fn || !blk_qc_t_valid(cookie)) | ||
2568 | return false; | ||
2569 | |||
2570 | if (current->plug) | ||
2571 | blk_flush_plug_list(current->plug, false); | ||
2572 | return q->poll_fn(q, cookie); | ||
2573 | } | ||
2574 | EXPORT_SYMBOL_GPL(blk_poll); | ||
2575 | |||
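The blk_poll() wrapper removed here flushed the caller's plug and then deferred to q->poll_fn(). A hedged sketch of the caller-side polling pattern it served, using the two-argument form shown above and a hypothetical completion wired up by the bio's end_io:

	DECLARE_COMPLETION_ONSTACK(done);	/* completed by our bi_end_io */
	blk_qc_t cookie;

	bio->bi_private = &done;		/* hypothetical wiring */
	cookie = submit_bio(bio);

	while (!completion_done(&done))
		blk_poll(q, cookie);		/* drive completions by polling */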
2576 | /** | 1199 | /** |
2577 | * blk_cloned_rq_check_limits - Helper function to check a cloned request | 1200 | * blk_cloned_rq_check_limits - Helper function to check a cloned request |
2578 | * for new the queue limits | 1201 | * for new the queue limits |
@@ -2620,8 +1243,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, | |||
2620 | */ | 1243 | */ |
2621 | blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) | 1244 | blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) |
2622 | { | 1245 | { |
2623 | unsigned long flags; | 1246 | blk_qc_t unused; |
2624 | int where = ELEVATOR_INSERT_BACK; | ||
2625 | 1247 | ||
2626 | if (blk_cloned_rq_check_limits(q, rq)) | 1248 | if (blk_cloned_rq_check_limits(q, rq)) |
2627 | return BLK_STS_IOERR; | 1249 | return BLK_STS_IOERR; |
@@ -2630,38 +1252,15 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * | |||
2630 | should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) | 1252 | should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) |
2631 | return BLK_STS_IOERR; | 1253 | return BLK_STS_IOERR; |
2632 | 1254 | ||
2633 | if (q->mq_ops) { | 1255 | if (blk_queue_io_stat(q)) |
2634 | if (blk_queue_io_stat(q)) | 1256 | blk_account_io_start(rq, true); |
2635 | blk_account_io_start(rq, true); | ||
2636 | /* | ||
2637 | * Since we have a scheduler attached on the top device, | ||
2638 | * bypass a potential scheduler on the bottom device for | ||
2639 | * insert. | ||
2640 | */ | ||
2641 | return blk_mq_request_issue_directly(rq); | ||
2642 | } | ||
2643 | |||
2644 | spin_lock_irqsave(q->queue_lock, flags); | ||
2645 | if (unlikely(blk_queue_dying(q))) { | ||
2646 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2647 | return BLK_STS_IOERR; | ||
2648 | } | ||
2649 | 1257 | ||
2650 | /* | 1258 | /* |
2651 | * Submitting request must be dequeued before calling this function | 1259 | * Since we have a scheduler attached on the top device, |
2652 | * because it will be linked to another request_queue | 1260 | * bypass a potential scheduler on the bottom device for |
1261 | * insert. | ||
2653 | */ | 1262 | */ |
2654 | BUG_ON(blk_queued_rq(rq)); | 1263 | return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true); |
2655 | |||
2656 | if (op_is_flush(rq->cmd_flags)) | ||
2657 | where = ELEVATOR_INSERT_FLUSH; | ||
2658 | |||
2659 | add_acct_request(q, rq, where); | ||
2660 | if (where == ELEVATOR_INSERT_FLUSH) | ||
2661 | __blk_run_queue(q); | ||
2662 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2663 | |||
2664 | return BLK_STS_OK; | ||
2665 | } | 1264 | } |
2666 | EXPORT_SYMBOL_GPL(blk_insert_cloned_request); | 1265 | EXPORT_SYMBOL_GPL(blk_insert_cloned_request); |
2667 | 1266 | ||
@@ -2711,11 +1310,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes) | |||
2711 | if (blk_do_io_stat(req)) { | 1310 | if (blk_do_io_stat(req)) { |
2712 | const int sgrp = op_stat_group(req_op(req)); | 1311 | const int sgrp = op_stat_group(req_op(req)); |
2713 | struct hd_struct *part; | 1312 | struct hd_struct *part; |
2714 | int cpu; | ||
2715 | 1313 | ||
2716 | cpu = part_stat_lock(); | 1314 | part_stat_lock(); |
2717 | part = req->part; | 1315 | part = req->part; |
2718 | part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); | 1316 | part_stat_add(part, sectors[sgrp], bytes >> 9); |
2719 | part_stat_unlock(); | 1317 | part_stat_unlock(); |
2720 | } | 1318 | } |
2721 | } | 1319 | } |
@@ -2730,14 +1328,14 @@ void blk_account_io_done(struct request *req, u64 now) | |||
2730 | if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { | 1328 | if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { |
2731 | const int sgrp = op_stat_group(req_op(req)); | 1329 | const int sgrp = op_stat_group(req_op(req)); |
2732 | struct hd_struct *part; | 1330 | struct hd_struct *part; |
2733 | int cpu; | ||
2734 | 1331 | ||
2735 | cpu = part_stat_lock(); | 1332 | part_stat_lock(); |
2736 | part = req->part; | 1333 | part = req->part; |
2737 | 1334 | ||
2738 | part_stat_inc(cpu, part, ios[sgrp]); | 1335 | update_io_ticks(part, jiffies); |
2739 | part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); | 1336 | part_stat_inc(part, ios[sgrp]); |
2740 | part_round_stats(req->q, cpu, part); | 1337 | part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); |
1338 | part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); | ||
2741 | part_dec_in_flight(req->q, part, rq_data_dir(req)); | 1339 | part_dec_in_flight(req->q, part, rq_data_dir(req)); |
2742 | 1340 | ||
2743 | hd_struct_put(part); | 1341 | hd_struct_put(part); |
@@ -2749,16 +1347,15 @@ void blk_account_io_start(struct request *rq, bool new_io) | |||
2749 | { | 1347 | { |
2750 | struct hd_struct *part; | 1348 | struct hd_struct *part; |
2751 | int rw = rq_data_dir(rq); | 1349 | int rw = rq_data_dir(rq); |
2752 | int cpu; | ||
2753 | 1350 | ||
2754 | if (!blk_do_io_stat(rq)) | 1351 | if (!blk_do_io_stat(rq)) |
2755 | return; | 1352 | return; |
2756 | 1353 | ||
2757 | cpu = part_stat_lock(); | 1354 | part_stat_lock(); |
2758 | 1355 | ||
2759 | if (!new_io) { | 1356 | if (!new_io) { |
2760 | part = rq->part; | 1357 | part = rq->part; |
2761 | part_stat_inc(cpu, part, merges[rw]); | 1358 | part_stat_inc(part, merges[rw]); |
2762 | } else { | 1359 | } else { |
2763 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | 1360 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); |
2764 | if (!hd_struct_try_get(part)) { | 1361 | if (!hd_struct_try_get(part)) { |
@@ -2773,232 +1370,14 @@ void blk_account_io_start(struct request *rq, bool new_io) | |||
2773 | part = &rq->rq_disk->part0; | 1370 | part = &rq->rq_disk->part0; |
2774 | hd_struct_get(part); | 1371 | hd_struct_get(part); |
2775 | } | 1372 | } |
2776 | part_round_stats(rq->q, cpu, part); | ||
2777 | part_inc_in_flight(rq->q, part, rw); | 1373 | part_inc_in_flight(rq->q, part, rw); |
2778 | rq->part = part; | 1374 | rq->part = part; |
2779 | } | 1375 | } |
2780 | 1376 | ||
2781 | part_stat_unlock(); | 1377 | update_io_ticks(part, jiffies); |
2782 | } | ||
2783 | |||
2784 | static struct request *elv_next_request(struct request_queue *q) | ||
2785 | { | ||
2786 | struct request *rq; | ||
2787 | struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); | ||
2788 | |||
2789 | WARN_ON_ONCE(q->mq_ops); | ||
2790 | |||
2791 | while (1) { | ||
2792 | list_for_each_entry(rq, &q->queue_head, queuelist) { | ||
2793 | #ifdef CONFIG_PM | ||
2794 | /* | ||
2795 | * If a request gets queued in state RPM_SUSPENDED | ||
2796 | * then that's a kernel bug. | ||
2797 | */ | ||
2798 | WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED); | ||
2799 | #endif | ||
2800 | return rq; | ||
2801 | } | ||
2802 | |||
2803 | /* | ||
2804 | * A flush request is running and flush requests aren't queueable | ||
2805 | * in the drive, so we can hold the queue till the flush request is | ||
2806 | * finished. Even if we don't do this, the driver can't dispatch the next | ||
2807 | * requests and will requeue them. And this can improve | ||
2808 | * throughput too. For example, we have requests flush1, write1, | ||
2809 | * flush2. flush1 is dispatched, then the queue is held and write1 | ||
2810 | * isn't inserted into the queue. After flush1 is finished, flush2 | ||
2811 | * will be dispatched. Since the disk cache is already clean, | ||
2812 | * flush2 will finish very soon, so it looks like flush2 is | ||
2813 | * folded into flush1. | ||
2814 | * Since the queue is held, a flag is set to indicate that the queue | ||
2815 | * should be restarted later. Please see flush_end_io() for | ||
2816 | * details. | ||
2817 | */ | ||
2818 | if (fq->flush_pending_idx != fq->flush_running_idx && | ||
2819 | !queue_flush_queueable(q)) { | ||
2820 | fq->flush_queue_delayed = 1; | ||
2821 | return NULL; | ||
2822 | } | ||
2823 | if (unlikely(blk_queue_bypass(q)) || | ||
2824 | !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) | ||
2825 | return NULL; | ||
2826 | } | ||
2827 | } | ||
2828 | |||
2829 | /** | ||
2830 | * blk_peek_request - peek at the top of a request queue | ||
2831 | * @q: request queue to peek at | ||
2832 | * | ||
2833 | * Description: | ||
2834 | * Return the request at the top of @q. The returned request | ||
2835 | * should be started using blk_start_request() before LLD starts | ||
2836 | * processing it. | ||
2837 | * | ||
2838 | * Return: | ||
2839 | * Pointer to the request at the top of @q if available. Null | ||
2840 | * otherwise. | ||
2841 | */ | ||
2842 | struct request *blk_peek_request(struct request_queue *q) | ||
2843 | { | ||
2844 | struct request *rq; | ||
2845 | int ret; | ||
2846 | |||
2847 | lockdep_assert_held(q->queue_lock); | ||
2848 | WARN_ON_ONCE(q->mq_ops); | ||
2849 | |||
2850 | while ((rq = elv_next_request(q)) != NULL) { | ||
2851 | if (!(rq->rq_flags & RQF_STARTED)) { | ||
2852 | /* | ||
2853 | * This is the first time the device driver | ||
2854 | * sees this request (possibly after | ||
2855 | * requeueing). Notify IO scheduler. | ||
2856 | */ | ||
2857 | if (rq->rq_flags & RQF_SORTED) | ||
2858 | elv_activate_rq(q, rq); | ||
2859 | |||
2860 | /* | ||
2861 | * just mark as started even if we don't start | ||
2862 | * it, a request that has been delayed should | ||
2863 | * not be passed by new incoming requests | ||
2864 | */ | ||
2865 | rq->rq_flags |= RQF_STARTED; | ||
2866 | trace_block_rq_issue(q, rq); | ||
2867 | } | ||
2868 | |||
2869 | if (!q->boundary_rq || q->boundary_rq == rq) { | ||
2870 | q->end_sector = rq_end_sector(rq); | ||
2871 | q->boundary_rq = NULL; | ||
2872 | } | ||
2873 | |||
2874 | if (rq->rq_flags & RQF_DONTPREP) | ||
2875 | break; | ||
2876 | |||
2877 | if (q->dma_drain_size && blk_rq_bytes(rq)) { | ||
2878 | /* | ||
2879 | * make sure space for the drain appears. We | ||
2880 | * know we can do this because max_hw_segments | ||
2881 | * has been adjusted to be one fewer than the | ||
2882 | * device can handle | ||
2883 | */ | ||
2884 | rq->nr_phys_segments++; | ||
2885 | } | ||
2886 | |||
2887 | if (!q->prep_rq_fn) | ||
2888 | break; | ||
2889 | |||
2890 | ret = q->prep_rq_fn(q, rq); | ||
2891 | if (ret == BLKPREP_OK) { | ||
2892 | break; | ||
2893 | } else if (ret == BLKPREP_DEFER) { | ||
2894 | /* | ||
2895 | * the request may have been (partially) prepped. | ||
2896 | * we need to keep this request in the front to | ||
2897 | * avoid resource deadlock. RQF_STARTED will | ||
2898 | * prevent other fs requests from passing this one. | ||
2899 | */ | ||
2900 | if (q->dma_drain_size && blk_rq_bytes(rq) && | ||
2901 | !(rq->rq_flags & RQF_DONTPREP)) { | ||
2902 | /* | ||
2903 | * remove the space for the drain we added | ||
2904 | * so that we don't add it again | ||
2905 | */ | ||
2906 | --rq->nr_phys_segments; | ||
2907 | } | ||
2908 | |||
2909 | rq = NULL; | ||
2910 | break; | ||
2911 | } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { | ||
2912 | rq->rq_flags |= RQF_QUIET; | ||
2913 | /* | ||
2914 | * Mark this request as started so we don't trigger | ||
2915 | * any debug logic in the end I/O path. | ||
2916 | */ | ||
2917 | blk_start_request(rq); | ||
2918 | __blk_end_request_all(rq, ret == BLKPREP_INVALID ? | ||
2919 | BLK_STS_TARGET : BLK_STS_IOERR); | ||
2920 | } else { | ||
2921 | printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); | ||
2922 | break; | ||
2923 | } | ||
2924 | } | ||
2925 | |||
2926 | return rq; | ||
2927 | } | ||
2928 | EXPORT_SYMBOL(blk_peek_request); | ||
2929 | |||
2930 | static void blk_dequeue_request(struct request *rq) | ||
2931 | { | ||
2932 | struct request_queue *q = rq->q; | ||
2933 | 1378 | ||
2934 | BUG_ON(list_empty(&rq->queuelist)); | 1379 | part_stat_unlock(); |
2935 | BUG_ON(ELV_ON_HASH(rq)); | ||
2936 | |||
2937 | list_del_init(&rq->queuelist); | ||
2938 | |||
2939 | /* | ||
2940 | * the time frame between a request being removed from the lists | ||
2941 | * and to it is freed is accounted as io that is in progress at | ||
2942 | * the driver side. | ||
2943 | */ | ||
2944 | if (blk_account_rq(rq)) | ||
2945 | q->in_flight[rq_is_sync(rq)]++; | ||
2946 | } | ||
2947 | |||
2948 | /** | ||
2949 | * blk_start_request - start request processing on the driver | ||
2950 | * @req: request to dequeue | ||
2951 | * | ||
2952 | * Description: | ||
2953 | * Dequeue @req and start timeout timer on it. This hands off the | ||
2954 | * request to the driver. | ||
2955 | */ | ||
2956 | void blk_start_request(struct request *req) | ||
2957 | { | ||
2958 | lockdep_assert_held(req->q->queue_lock); | ||
2959 | WARN_ON_ONCE(req->q->mq_ops); | ||
2960 | |||
2961 | blk_dequeue_request(req); | ||
2962 | |||
2963 | if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { | ||
2964 | req->io_start_time_ns = ktime_get_ns(); | ||
2965 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
2966 | req->throtl_size = blk_rq_sectors(req); | ||
2967 | #endif | ||
2968 | req->rq_flags |= RQF_STATS; | ||
2969 | rq_qos_issue(req->q, req); | ||
2970 | } | ||
2971 | |||
2972 | BUG_ON(blk_rq_is_complete(req)); | ||
2973 | blk_add_timer(req); | ||
2974 | } | ||
2975 | EXPORT_SYMBOL(blk_start_request); | ||
2976 | |||
2977 | /** | ||
2978 | * blk_fetch_request - fetch a request from a request queue | ||
2979 | * @q: request queue to fetch a request from | ||
2980 | * | ||
2981 | * Description: | ||
2982 | * Return the request at the top of @q. The request is started on | ||
2983 | * return and LLD can start processing it immediately. | ||
2984 | * | ||
2985 | * Return: | ||
2986 | * Pointer to the request at the top of @q if available. Null | ||
2987 | * otherwise. | ||
2988 | */ | ||
2989 | struct request *blk_fetch_request(struct request_queue *q) | ||
2990 | { | ||
2991 | struct request *rq; | ||
2992 | |||
2993 | lockdep_assert_held(q->queue_lock); | ||
2994 | WARN_ON_ONCE(q->mq_ops); | ||
2995 | |||
2996 | rq = blk_peek_request(q); | ||
2997 | if (rq) | ||
2998 | blk_start_request(rq); | ||
2999 | return rq; | ||
3000 | } | 1380 | } |
3001 | EXPORT_SYMBOL(blk_fetch_request); | ||
3002 | 1381 | ||
3003 | /* | 1382 | /* |
3004 | * Steal bios from a request and add them to a bio list. | 1383 | * Steal bios from a request and add them to a bio list. |
@@ -3125,255 +1504,6 @@ bool blk_update_request(struct request *req, blk_status_t error, | |||
3125 | } | 1504 | } |
3126 | EXPORT_SYMBOL_GPL(blk_update_request); | 1505 | EXPORT_SYMBOL_GPL(blk_update_request); |
3127 | 1506 | ||
3128 | static bool blk_update_bidi_request(struct request *rq, blk_status_t error, | ||
3129 | unsigned int nr_bytes, | ||
3130 | unsigned int bidi_bytes) | ||
3131 | { | ||
3132 | if (blk_update_request(rq, error, nr_bytes)) | ||
3133 | return true; | ||
3134 | |||
3135 | /* Bidi request must be completed as a whole */ | ||
3136 | if (unlikely(blk_bidi_rq(rq)) && | ||
3137 | blk_update_request(rq->next_rq, error, bidi_bytes)) | ||
3138 | return true; | ||
3139 | |||
3140 | if (blk_queue_add_random(rq->q)) | ||
3141 | add_disk_randomness(rq->rq_disk); | ||
3142 | |||
3143 | return false; | ||
3144 | } | ||
3145 | |||
3146 | /** | ||
3147 | * blk_unprep_request - unprepare a request | ||
3148 | * @req: the request | ||
3149 | * | ||
3150 | * This function makes a request ready for complete resubmission (or | ||
3151 | * completion). It happens only after all error handling is complete, | ||
3152 | * so represents the appropriate moment to deallocate any resources | ||
3153 | * that were allocated to the request in the prep_rq_fn. The queue | ||
3154 | * lock is held when calling this. | ||
3155 | */ | ||
3156 | void blk_unprep_request(struct request *req) | ||
3157 | { | ||
3158 | struct request_queue *q = req->q; | ||
3159 | |||
3160 | req->rq_flags &= ~RQF_DONTPREP; | ||
3161 | if (q->unprep_rq_fn) | ||
3162 | q->unprep_rq_fn(q, req); | ||
3163 | } | ||
3164 | EXPORT_SYMBOL_GPL(blk_unprep_request); | ||
3165 | |||
3166 | void blk_finish_request(struct request *req, blk_status_t error) | ||
3167 | { | ||
3168 | struct request_queue *q = req->q; | ||
3169 | u64 now = ktime_get_ns(); | ||
3170 | |||
3171 | lockdep_assert_held(req->q->queue_lock); | ||
3172 | WARN_ON_ONCE(q->mq_ops); | ||
3173 | |||
3174 | if (req->rq_flags & RQF_STATS) | ||
3175 | blk_stat_add(req, now); | ||
3176 | |||
3177 | if (req->rq_flags & RQF_QUEUED) | ||
3178 | blk_queue_end_tag(q, req); | ||
3179 | |||
3180 | BUG_ON(blk_queued_rq(req)); | ||
3181 | |||
3182 | if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req)) | ||
3183 | laptop_io_completion(req->q->backing_dev_info); | ||
3184 | |||
3185 | blk_delete_timer(req); | ||
3186 | |||
3187 | if (req->rq_flags & RQF_DONTPREP) | ||
3188 | blk_unprep_request(req); | ||
3189 | |||
3190 | blk_account_io_done(req, now); | ||
3191 | |||
3192 | if (req->end_io) { | ||
3193 | rq_qos_done(q, req); | ||
3194 | req->end_io(req, error); | ||
3195 | } else { | ||
3196 | if (blk_bidi_rq(req)) | ||
3197 | __blk_put_request(req->next_rq->q, req->next_rq); | ||
3198 | |||
3199 | __blk_put_request(q, req); | ||
3200 | } | ||
3201 | } | ||
3202 | EXPORT_SYMBOL(blk_finish_request); | ||
3203 | |||
3204 | /** | ||
3205 | * blk_end_bidi_request - Complete a bidi request | ||
3206 | * @rq: the request to complete | ||
3207 | * @error: block status code | ||
3208 | * @nr_bytes: number of bytes to complete @rq | ||
3209 | * @bidi_bytes: number of bytes to complete @rq->next_rq | ||
3210 | * | ||
3211 | * Description: | ||
3212 | * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. | ||
3213 | * Drivers that supports bidi can safely call this member for any | ||
3214 | * type of request, bidi or uni. In the later case @bidi_bytes is | ||
3215 | * just ignored. | ||
3216 | * | ||
3217 | * Return: | ||
3218 | * %false - we are done with this request | ||
3219 | * %true - still buffers pending for this request | ||
3220 | **/ | ||
3221 | static bool blk_end_bidi_request(struct request *rq, blk_status_t error, | ||
3222 | unsigned int nr_bytes, unsigned int bidi_bytes) | ||
3223 | { | ||
3224 | struct request_queue *q = rq->q; | ||
3225 | unsigned long flags; | ||
3226 | |||
3227 | WARN_ON_ONCE(q->mq_ops); | ||
3228 | |||
3229 | if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) | ||
3230 | return true; | ||
3231 | |||
3232 | spin_lock_irqsave(q->queue_lock, flags); | ||
3233 | blk_finish_request(rq, error); | ||
3234 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
3235 | |||
3236 | return false; | ||
3237 | } | ||
3238 | |||
3239 | /** | ||
3240 | * __blk_end_bidi_request - Complete a bidi request with queue lock held | ||
3241 | * @rq: the request to complete | ||
3242 | * @error: block status code | ||
3243 | * @nr_bytes: number of bytes to complete @rq | ||
3244 | * @bidi_bytes: number of bytes to complete @rq->next_rq | ||
3245 | * | ||
3246 | * Description: | ||
3247 | * Identical to blk_end_bidi_request() except that queue lock is | ||
3248 | * assumed to be locked on entry and remains so on return. | ||
3249 | * | ||
3250 | * Return: | ||
3251 | * %false - we are done with this request | ||
3252 | * %true - still buffers pending for this request | ||
3253 | **/ | ||
3254 | static bool __blk_end_bidi_request(struct request *rq, blk_status_t error, | ||
3255 | unsigned int nr_bytes, unsigned int bidi_bytes) | ||
3256 | { | ||
3257 | lockdep_assert_held(rq->q->queue_lock); | ||
3258 | WARN_ON_ONCE(rq->q->mq_ops); | ||
3259 | |||
3260 | if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) | ||
3261 | return true; | ||
3262 | |||
3263 | blk_finish_request(rq, error); | ||
3264 | |||
3265 | return false; | ||
3266 | } | ||
3267 | |||
3268 | /** | ||
3269 | * blk_end_request - Helper function for drivers to complete the request. | ||
3270 | * @rq: the request being processed | ||
3271 | * @error: block status code | ||
3272 | * @nr_bytes: number of bytes to complete | ||
3273 | * | ||
3274 | * Description: | ||
3275 | * Ends I/O on a number of bytes attached to @rq. | ||
3276 | * If @rq has leftover, sets it up for the next range of segments. | ||
3277 | * | ||
3278 | * Return: | ||
3279 | * %false - we are done with this request | ||
3280 | * %true - still buffers pending for this request | ||
3281 | **/ | ||
3282 | bool blk_end_request(struct request *rq, blk_status_t error, | ||
3283 | unsigned int nr_bytes) | ||
3284 | { | ||
3285 | WARN_ON_ONCE(rq->q->mq_ops); | ||
3286 | return blk_end_bidi_request(rq, error, nr_bytes, 0); | ||
3287 | } | ||
3288 | EXPORT_SYMBOL(blk_end_request); | ||
3289 | |||
3290 | /** | ||
3291 | * blk_end_request_all - Helper function for drivers to finish the request. | ||
3292 | * @rq: the request to finish | ||
3293 | * @error: block status code | ||
3294 | * | ||
3295 | * Description: | ||
3296 | * Completely finish @rq. | ||
3297 | */ | ||
3298 | void blk_end_request_all(struct request *rq, blk_status_t error) | ||
3299 | { | ||
3300 | bool pending; | ||
3301 | unsigned int bidi_bytes = 0; | ||
3302 | |||
3303 | if (unlikely(blk_bidi_rq(rq))) | ||
3304 | bidi_bytes = blk_rq_bytes(rq->next_rq); | ||
3305 | |||
3306 | pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); | ||
3307 | BUG_ON(pending); | ||
3308 | } | ||
3309 | EXPORT_SYMBOL(blk_end_request_all); | ||
3310 | |||
3311 | /** | ||
3312 | * __blk_end_request - Helper function for drivers to complete the request. | ||
3313 | * @rq: the request being processed | ||
3314 | * @error: block status code | ||
3315 | * @nr_bytes: number of bytes to complete | ||
3316 | * | ||
3317 | * Description: | ||
3318 | * Must be called with queue lock held unlike blk_end_request(). | ||
3319 | * | ||
3320 | * Return: | ||
3321 | * %false - we are done with this request | ||
3322 | * %true - still buffers pending for this request | ||
3323 | **/ | ||
3324 | bool __blk_end_request(struct request *rq, blk_status_t error, | ||
3325 | unsigned int nr_bytes) | ||
3326 | { | ||
3327 | lockdep_assert_held(rq->q->queue_lock); | ||
3328 | WARN_ON_ONCE(rq->q->mq_ops); | ||
3329 | |||
3330 | return __blk_end_bidi_request(rq, error, nr_bytes, 0); | ||
3331 | } | ||
3332 | EXPORT_SYMBOL(__blk_end_request); | ||
3333 | |||
3334 | /** | ||
3335 | * __blk_end_request_all - Helper function for drivers to finish the request. | ||
3336 | * @rq: the request to finish | ||
3337 | * @error: block status code | ||
3338 | * | ||
3339 | * Description: | ||
3340 | * Completely finish @rq. Must be called with queue lock held. | ||
3341 | */ | ||
3342 | void __blk_end_request_all(struct request *rq, blk_status_t error) | ||
3343 | { | ||
3344 | bool pending; | ||
3345 | unsigned int bidi_bytes = 0; | ||
3346 | |||
3347 | lockdep_assert_held(rq->q->queue_lock); | ||
3348 | WARN_ON_ONCE(rq->q->mq_ops); | ||
3349 | |||
3350 | if (unlikely(blk_bidi_rq(rq))) | ||
3351 | bidi_bytes = blk_rq_bytes(rq->next_rq); | ||
3352 | |||
3353 | pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); | ||
3354 | BUG_ON(pending); | ||
3355 | } | ||
3356 | EXPORT_SYMBOL(__blk_end_request_all); | ||
3357 | |||
3358 | /** | ||
3359 | * __blk_end_request_cur - Helper function to finish the current request chunk. | ||
3360 | * @rq: the request to finish the current chunk for | ||
3361 | * @error: block status code | ||
3362 | * | ||
3363 | * Description: | ||
3364 | * Complete the current consecutively mapped chunk from @rq. Must | ||
3365 | * be called with queue lock held. | ||
3366 | * | ||
3367 | * Return: | ||
3368 | * %false - we are done with this request | ||
3369 | * %true - still buffers pending for this request | ||
3370 | */ | ||
3371 | bool __blk_end_request_cur(struct request *rq, blk_status_t error) | ||
3372 | { | ||
3373 | return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); | ||
3374 | } | ||
3375 | EXPORT_SYMBOL(__blk_end_request_cur); | ||
3376 | |||
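blk_peek_request()/blk_fetch_request() and the blk_end_request() family were the driver-facing half of the single-queue API deleted by this patch. A schematic of the request_fn-style loop a legacy driver built on them (a hedged reconstruction, not copied from any driver):

static void toy_request_fn(struct request_queue *q)
{
	struct request *rq;

	/* the legacy block core calls request_fn with q->queue_lock held */
	while ((rq = blk_fetch_request(q)) != NULL) {
		spin_unlock_irq(q->queue_lock);

		/* ... hand the request to the hardware and wait for it ... */

		spin_lock_irq(q->queue_lock);
		__blk_end_request_all(rq, BLK_STS_OK);	/* complete every byte */
	}
}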
3377 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 1507 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
3378 | struct bio *bio) | 1508 | struct bio *bio) |
3379 | { | 1509 | { |
@@ -3429,8 +1559,8 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); | |||
3429 | */ | 1559 | */ |
3430 | int blk_lld_busy(struct request_queue *q) | 1560 | int blk_lld_busy(struct request_queue *q) |
3431 | { | 1561 | { |
3432 | if (q->lld_busy_fn) | 1562 | if (queue_is_mq(q) && q->mq_ops->busy) |
3433 | return q->lld_busy_fn(q); | 1563 | return q->mq_ops->busy(q); |
3434 | 1564 | ||
3435 | return 0; | 1565 | return 0; |
3436 | } | 1566 | } |
@@ -3461,7 +1591,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); | |||
3461 | */ | 1591 | */ |
3462 | static void __blk_rq_prep_clone(struct request *dst, struct request *src) | 1592 | static void __blk_rq_prep_clone(struct request *dst, struct request *src) |
3463 | { | 1593 | { |
3464 | dst->cpu = src->cpu; | ||
3465 | dst->__sector = blk_rq_pos(src); | 1594 | dst->__sector = blk_rq_pos(src); |
3466 | dst->__data_len = blk_rq_bytes(src); | 1595 | dst->__data_len = blk_rq_bytes(src); |
3467 | if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { | 1596 | if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { |
@@ -3573,9 +1702,11 @@ void blk_start_plug(struct blk_plug *plug) | |||
3573 | if (tsk->plug) | 1702 | if (tsk->plug) |
3574 | return; | 1703 | return; |
3575 | 1704 | ||
3576 | INIT_LIST_HEAD(&plug->list); | ||
3577 | INIT_LIST_HEAD(&plug->mq_list); | 1705 | INIT_LIST_HEAD(&plug->mq_list); |
3578 | INIT_LIST_HEAD(&plug->cb_list); | 1706 | INIT_LIST_HEAD(&plug->cb_list); |
1707 | plug->rq_count = 0; | ||
1708 | plug->multiple_queues = false; | ||
1709 | |||
3579 | /* | 1710 | /* |
3580 | * Store ordering should not be needed here, since a potential | 1711 | * Store ordering should not be needed here, since a potential |
3581 | * preempt will imply a full memory barrier | 1712 | * preempt will imply a full memory barrier |
@@ -3584,36 +1715,6 @@ void blk_start_plug(struct blk_plug *plug) | |||
3584 | } | 1715 | } |
3585 | EXPORT_SYMBOL(blk_start_plug); | 1716 | EXPORT_SYMBOL(blk_start_plug); |
3586 | 1717 | ||
3587 | static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
3588 | { | ||
3589 | struct request *rqa = container_of(a, struct request, queuelist); | ||
3590 | struct request *rqb = container_of(b, struct request, queuelist); | ||
3591 | |||
3592 | return !(rqa->q < rqb->q || | ||
3593 | (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb))); | ||
3594 | } | ||
3595 | |||
3596 | /* | ||
3597 | * If 'from_schedule' is true, then postpone the dispatch of requests | ||
3598 | * until a safe kblockd context. We do this to avoid accidental big | ||
3599 | * additional stack usage in driver dispatch, in places where the original | ||
3600 | * plugger did not intend it. | ||
3601 | */ | ||
3602 | static void queue_unplugged(struct request_queue *q, unsigned int depth, | ||
3603 | bool from_schedule) | ||
3604 | __releases(q->queue_lock) | ||
3605 | { | ||
3606 | lockdep_assert_held(q->queue_lock); | ||
3607 | |||
3608 | trace_block_unplug(q, depth, !from_schedule); | ||
3609 | |||
3610 | if (from_schedule) | ||
3611 | blk_run_queue_async(q); | ||
3612 | else | ||
3613 | __blk_run_queue(q); | ||
3614 | spin_unlock_irq(q->queue_lock); | ||
3615 | } | ||
3616 | |||
3617 | static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) | 1718 | static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) |
3618 | { | 1719 | { |
3619 | LIST_HEAD(callbacks); | 1720 | LIST_HEAD(callbacks); |
@@ -3658,65 +1759,10 @@ EXPORT_SYMBOL(blk_check_plugged); | |||
3658 | 1759 | ||
3659 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | 1760 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
3660 | { | 1761 | { |
3661 | struct request_queue *q; | ||
3662 | struct request *rq; | ||
3663 | LIST_HEAD(list); | ||
3664 | unsigned int depth; | ||
3665 | |||
3666 | flush_plug_callbacks(plug, from_schedule); | 1762 | flush_plug_callbacks(plug, from_schedule); |
3667 | 1763 | ||
3668 | if (!list_empty(&plug->mq_list)) | 1764 | if (!list_empty(&plug->mq_list)) |
3669 | blk_mq_flush_plug_list(plug, from_schedule); | 1765 | blk_mq_flush_plug_list(plug, from_schedule); |
3670 | |||
3671 | if (list_empty(&plug->list)) | ||
3672 | return; | ||
3673 | |||
3674 | list_splice_init(&plug->list, &list); | ||
3675 | |||
3676 | list_sort(NULL, &list, plug_rq_cmp); | ||
3677 | |||
3678 | q = NULL; | ||
3679 | depth = 0; | ||
3680 | |||
3681 | while (!list_empty(&list)) { | ||
3682 | rq = list_entry_rq(list.next); | ||
3683 | list_del_init(&rq->queuelist); | ||
3684 | BUG_ON(!rq->q); | ||
3685 | if (rq->q != q) { | ||
3686 | /* | ||
3687 | * This drops the queue lock | ||
3688 | */ | ||
3689 | if (q) | ||
3690 | queue_unplugged(q, depth, from_schedule); | ||
3691 | q = rq->q; | ||
3692 | depth = 0; | ||
3693 | spin_lock_irq(q->queue_lock); | ||
3694 | } | ||
3695 | |||
3696 | /* | ||
3697 | * Short-circuit if @q is dead | ||
3698 | */ | ||
3699 | if (unlikely(blk_queue_dying(q))) { | ||
3700 | __blk_end_request_all(rq, BLK_STS_IOERR); | ||
3701 | continue; | ||
3702 | } | ||
3703 | |||
3704 | /* | ||
3705 | * rq is already accounted, so use raw insert | ||
3706 | */ | ||
3707 | if (op_is_flush(rq->cmd_flags)) | ||
3708 | __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); | ||
3709 | else | ||
3710 | __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); | ||
3711 | |||
3712 | depth++; | ||
3713 | } | ||
3714 | |||
3715 | /* | ||
3716 | * This drops the queue lock | ||
3717 | */ | ||
3718 | if (q) | ||
3719 | queue_unplugged(q, depth, from_schedule); | ||
3720 | } | 1766 | } |
3721 | 1767 | ||
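blk_flush_plug_list() now only has the blk-mq list to hand off; the caller-side plugging idiom is unchanged. Roughly (a hedged sketch of a typical submitter, nr_bios and bios[] being hypothetical):

	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr_bios; i++)
		submit_bio(bios[i]);	/* bios collect on plug->mq_list */
	blk_finish_plug(&plug);		/* ends up in blk_flush_plug_list() */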
3722 | void blk_finish_plug(struct blk_plug *plug) | 1768 | void blk_finish_plug(struct blk_plug *plug) |
@@ -3743,9 +1789,6 @@ int __init blk_dev_init(void) | |||
3743 | if (!kblockd_workqueue) | 1789 | if (!kblockd_workqueue) |
3744 | panic("Failed to create kblockd\n"); | 1790 | panic("Failed to create kblockd\n"); |
3745 | 1791 | ||
3746 | request_cachep = kmem_cache_create("blkdev_requests", | ||
3747 | sizeof(struct request), 0, SLAB_PANIC, NULL); | ||
3748 | |||
3749 | blk_requestq_cachep = kmem_cache_create("request_queue", | 1792 | blk_requestq_cachep = kmem_cache_create("request_queue", |
3750 | sizeof(struct request_queue), 0, SLAB_PANIC, NULL); | 1793 | sizeof(struct request_queue), 0, SLAB_PANIC, NULL); |
3751 | 1794 | ||
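
The hunks above drop the legacy plug flush: requests queued on a task's plug were sorted by owning queue and then by sector, and handed to each queue in one batch, with the queue lock dropped at every queue switch. Below is a minimal userspace sketch of that sort-and-batch idea only; struct fake_rq and its fields are stand-ins, not the kernel's struct request.

/* Sort plugged requests by queue, then sector, and "dispatch" them in
 * per-queue batches, mirroring plug_rq_cmp() and the depth accounting
 * of the removed blk_flush_plug_list() loop. */
#include <stdio.h>
#include <stdlib.h>

struct fake_rq {
	int q;			/* stand-in for rq->q (owning queue) */
	unsigned long sector;	/* stand-in for blk_rq_pos(rq) */
};

static int plug_rq_cmp(const void *a, const void *b)
{
	const struct fake_rq *ra = a, *rb = b;

	if (ra->q != rb->q)
		return ra->q < rb->q ? -1 : 1;
	if (ra->sector != rb->sector)
		return ra->sector < rb->sector ? -1 : 1;
	return 0;
}

int main(void)
{
	struct fake_rq list[] = {
		{ 2, 100 }, { 1, 50 }, { 2, 10 }, { 1, 200 },
	};
	size_t n = sizeof(list) / sizeof(list[0]);
	int cur_q = -1;
	unsigned int depth = 0;

	qsort(list, n, sizeof(list[0]), plug_rq_cmp);

	for (size_t i = 0; i < n; i++) {
		if (list[i].q != cur_q) {
			if (cur_q != -1)
				printf("unplug queue %d, depth %u\n", cur_q, depth);
			cur_q = list[i].q;
			depth = 0;
		}
		printf("insert sector %lu into queue %d\n", list[i].sector, list[i].q);
		depth++;
	}
	if (cur_q != -1)
		printf("unplug queue %d, depth %u\n", cur_q, depth);
	return 0;
}

With the legacy path gone, blk_flush_plug_list() only runs the plug callbacks and defers to blk_mq_flush_plug_list(), which does its own per-queue batching.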
diff --git a/block/blk-exec.c b/block/blk-exec.c index f7b292f12449..a34b7d918742 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c | |||
@@ -48,8 +48,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
48 | struct request *rq, int at_head, | 48 | struct request *rq, int at_head, |
49 | rq_end_io_fn *done) | 49 | rq_end_io_fn *done) |
50 | { | 50 | { |
51 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; | ||
52 | |||
53 | WARN_ON(irqs_disabled()); | 51 | WARN_ON(irqs_disabled()); |
54 | WARN_ON(!blk_rq_is_passthrough(rq)); | 52 | WARN_ON(!blk_rq_is_passthrough(rq)); |
55 | 53 | ||
@@ -60,23 +58,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
60 | * don't check dying flag for MQ because the request won't | 58 | * don't check dying flag for MQ because the request won't |
61 | * be reused after dying flag is set | 59 | * be reused after dying flag is set |
62 | */ | 60 | */ |
63 | if (q->mq_ops) { | 61 | blk_mq_sched_insert_request(rq, at_head, true, false); |
64 | blk_mq_sched_insert_request(rq, at_head, true, false); | ||
65 | return; | ||
66 | } | ||
67 | |||
68 | spin_lock_irq(q->queue_lock); | ||
69 | |||
70 | if (unlikely(blk_queue_dying(q))) { | ||
71 | rq->rq_flags |= RQF_QUIET; | ||
72 | __blk_end_request_all(rq, BLK_STS_IOERR); | ||
73 | spin_unlock_irq(q->queue_lock); | ||
74 | return; | ||
75 | } | ||
76 | |||
77 | __elv_add_request(q, rq, where); | ||
78 | __blk_run_queue(q); | ||
79 | spin_unlock_irq(q->queue_lock); | ||
80 | } | 62 | } |
81 | EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); | 63 | EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); |
82 | 64 | ||
diff --git a/block/blk-flush.c b/block/blk-flush.c index 8b44b86779da..a3fc7191c694 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c | |||
@@ -93,7 +93,7 @@ enum { | |||
93 | FLUSH_PENDING_TIMEOUT = 5 * HZ, | 93 | FLUSH_PENDING_TIMEOUT = 5 * HZ, |
94 | }; | 94 | }; |
95 | 95 | ||
96 | static bool blk_kick_flush(struct request_queue *q, | 96 | static void blk_kick_flush(struct request_queue *q, |
97 | struct blk_flush_queue *fq, unsigned int flags); | 97 | struct blk_flush_queue *fq, unsigned int flags); |
98 | 98 | ||
99 | static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) | 99 | static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) |
@@ -132,18 +132,9 @@ static void blk_flush_restore_request(struct request *rq) | |||
132 | rq->end_io = rq->flush.saved_end_io; | 132 | rq->end_io = rq->flush.saved_end_io; |
133 | } | 133 | } |
134 | 134 | ||
135 | static bool blk_flush_queue_rq(struct request *rq, bool add_front) | 135 | static void blk_flush_queue_rq(struct request *rq, bool add_front) |
136 | { | 136 | { |
137 | if (rq->q->mq_ops) { | 137 | blk_mq_add_to_requeue_list(rq, add_front, true); |
138 | blk_mq_add_to_requeue_list(rq, add_front, true); | ||
139 | return false; | ||
140 | } else { | ||
141 | if (add_front) | ||
142 | list_add(&rq->queuelist, &rq->q->queue_head); | ||
143 | else | ||
144 | list_add_tail(&rq->queuelist, &rq->q->queue_head); | ||
145 | return true; | ||
146 | } | ||
147 | } | 138 | } |
148 | 139 | ||
149 | /** | 140 | /** |
@@ -157,18 +148,17 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) | |||
157 | * completion and trigger the next step. | 148 | * completion and trigger the next step. |
158 | * | 149 | * |
159 | * CONTEXT: | 150 | * CONTEXT: |
160 | * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) | 151 | * spin_lock_irq(fq->mq_flush_lock) |
161 | * | 152 | * |
162 | * RETURNS: | 153 | * RETURNS: |
163 | * %true if requests were added to the dispatch queue, %false otherwise. | 154 | * %true if requests were added to the dispatch queue, %false otherwise. |
164 | */ | 155 | */ |
165 | static bool blk_flush_complete_seq(struct request *rq, | 156 | static void blk_flush_complete_seq(struct request *rq, |
166 | struct blk_flush_queue *fq, | 157 | struct blk_flush_queue *fq, |
167 | unsigned int seq, blk_status_t error) | 158 | unsigned int seq, blk_status_t error) |
168 | { | 159 | { |
169 | struct request_queue *q = rq->q; | 160 | struct request_queue *q = rq->q; |
170 | struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; | 161 | struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; |
171 | bool queued = false, kicked; | ||
172 | unsigned int cmd_flags; | 162 | unsigned int cmd_flags; |
173 | 163 | ||
174 | BUG_ON(rq->flush.seq & seq); | 164 | BUG_ON(rq->flush.seq & seq); |
@@ -191,7 +181,7 @@ static bool blk_flush_complete_seq(struct request *rq, | |||
191 | 181 | ||
192 | case REQ_FSEQ_DATA: | 182 | case REQ_FSEQ_DATA: |
193 | list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); | 183 | list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); |
194 | queued = blk_flush_queue_rq(rq, true); | 184 | blk_flush_queue_rq(rq, true); |
195 | break; | 185 | break; |
196 | 186 | ||
197 | case REQ_FSEQ_DONE: | 187 | case REQ_FSEQ_DONE: |
@@ -204,42 +194,34 @@ static bool blk_flush_complete_seq(struct request *rq, | |||
204 | BUG_ON(!list_empty(&rq->queuelist)); | 194 | BUG_ON(!list_empty(&rq->queuelist)); |
205 | list_del_init(&rq->flush.list); | 195 | list_del_init(&rq->flush.list); |
206 | blk_flush_restore_request(rq); | 196 | blk_flush_restore_request(rq); |
207 | if (q->mq_ops) | 197 | blk_mq_end_request(rq, error); |
208 | blk_mq_end_request(rq, error); | ||
209 | else | ||
210 | __blk_end_request_all(rq, error); | ||
211 | break; | 198 | break; |
212 | 199 | ||
213 | default: | 200 | default: |
214 | BUG(); | 201 | BUG(); |
215 | } | 202 | } |
216 | 203 | ||
217 | kicked = blk_kick_flush(q, fq, cmd_flags); | 204 | blk_kick_flush(q, fq, cmd_flags); |
218 | return kicked | queued; | ||
219 | } | 205 | } |
220 | 206 | ||
221 | static void flush_end_io(struct request *flush_rq, blk_status_t error) | 207 | static void flush_end_io(struct request *flush_rq, blk_status_t error) |
222 | { | 208 | { |
223 | struct request_queue *q = flush_rq->q; | 209 | struct request_queue *q = flush_rq->q; |
224 | struct list_head *running; | 210 | struct list_head *running; |
225 | bool queued = false; | ||
226 | struct request *rq, *n; | 211 | struct request *rq, *n; |
227 | unsigned long flags = 0; | 212 | unsigned long flags = 0; |
228 | struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); | 213 | struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); |
214 | struct blk_mq_hw_ctx *hctx; | ||
229 | 215 | ||
230 | if (q->mq_ops) { | 216 | /* release the tag's ownership to the req cloned from */ |
231 | struct blk_mq_hw_ctx *hctx; | 217 | spin_lock_irqsave(&fq->mq_flush_lock, flags); |
232 | 218 | hctx = flush_rq->mq_hctx; | |
233 | /* release the tag's ownership to the req cloned from */ | 219 | if (!q->elevator) { |
234 | spin_lock_irqsave(&fq->mq_flush_lock, flags); | 220 | blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); |
235 | hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); | 221 | flush_rq->tag = -1; |
236 | if (!q->elevator) { | 222 | } else { |
237 | blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); | 223 | blk_mq_put_driver_tag_hctx(hctx, flush_rq); |
238 | flush_rq->tag = -1; | 224 | flush_rq->internal_tag = -1; |
239 | } else { | ||
240 | blk_mq_put_driver_tag_hctx(hctx, flush_rq); | ||
241 | flush_rq->internal_tag = -1; | ||
242 | } | ||
243 | } | 225 | } |
244 | 226 | ||
245 | running = &fq->flush_queue[fq->flush_running_idx]; | 227 | running = &fq->flush_queue[fq->flush_running_idx]; |
@@ -248,35 +230,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) | |||
248 | /* account completion of the flush request */ | 230 | /* account completion of the flush request */ |
249 | fq->flush_running_idx ^= 1; | 231 | fq->flush_running_idx ^= 1; |
250 | 232 | ||
251 | if (!q->mq_ops) | ||
252 | elv_completed_request(q, flush_rq); | ||
253 | |||
254 | /* and push the waiting requests to the next stage */ | 233 | /* and push the waiting requests to the next stage */ |
255 | list_for_each_entry_safe(rq, n, running, flush.list) { | 234 | list_for_each_entry_safe(rq, n, running, flush.list) { |
256 | unsigned int seq = blk_flush_cur_seq(rq); | 235 | unsigned int seq = blk_flush_cur_seq(rq); |
257 | 236 | ||
258 | BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); | 237 | BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); |
259 | queued |= blk_flush_complete_seq(rq, fq, seq, error); | 238 | blk_flush_complete_seq(rq, fq, seq, error); |
260 | } | 239 | } |
261 | 240 | ||
262 | /* | ||
263 | * Kick the queue to avoid stall for two cases: | ||
264 | * 1. Moving a request silently to empty queue_head may stall the | ||
265 | * queue. | ||
266 | * 2. When a flush request is running in a non-queueable queue, the | ||
267 | * queue is held. Restart the queue after the flush request is finished | ||
268 | * to avoid stall. | ||
269 | * This function is called from request completion path and calling | ||
270 | * directly into request_fn may confuse the driver. Always use | ||
271 | * kblockd. | ||
272 | */ | ||
273 | if (queued || fq->flush_queue_delayed) { | ||
274 | WARN_ON(q->mq_ops); | ||
275 | blk_run_queue_async(q); | ||
276 | } | ||
277 | fq->flush_queue_delayed = 0; | 241 | fq->flush_queue_delayed = 0; |
278 | if (q->mq_ops) | 242 | spin_unlock_irqrestore(&fq->mq_flush_lock, flags); |
279 | spin_unlock_irqrestore(&fq->mq_flush_lock, flags); | ||
280 | } | 243 | } |
281 | 244 | ||
282 | /** | 245 | /** |
@@ -289,12 +252,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) | |||
289 | * Please read the comment at the top of this file for more info. | 252 | * Please read the comment at the top of this file for more info. |
290 | * | 253 | * |
291 | * CONTEXT: | 254 | * CONTEXT: |
292 | * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) | 255 | * spin_lock_irq(fq->mq_flush_lock) |
293 | * | 256 | * |
294 | * RETURNS: | ||
295 | * %true if flush was issued, %false otherwise. | ||
296 | */ | 257 | */ |
297 | static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, | 258 | static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, |
298 | unsigned int flags) | 259 | unsigned int flags) |
299 | { | 260 | { |
300 | struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; | 261 | struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; |
@@ -304,7 +265,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, | |||
304 | 265 | ||
305 | /* C1 described at the top of this file */ | 266 | /* C1 described at the top of this file */ |
306 | if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) | 267 | if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) |
307 | return false; | 268 | return; |
308 | 269 | ||
309 | /* C2 and C3 | 270 | /* C2 and C3 |
310 | * | 271 | * |
@@ -312,11 +273,10 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, | |||
312 | * assigned to empty flushes, and we deadlock if we are expecting | 273 | * assigned to empty flushes, and we deadlock if we are expecting |
313 | * other requests to make progress. Don't defer for that case. | 274 | * other requests to make progress. Don't defer for that case. |
314 | */ | 275 | */ |
315 | if (!list_empty(&fq->flush_data_in_flight) && | 276 | if (!list_empty(&fq->flush_data_in_flight) && q->elevator && |
316 | !(q->mq_ops && q->elevator) && | ||
317 | time_before(jiffies, | 277 | time_before(jiffies, |
318 | fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) | 278 | fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) |
319 | return false; | 279 | return; |
320 | 280 | ||
321 | /* | 281 | /* |
322 | * Issue flush and toggle pending_idx. This makes pending_idx | 282 | * Issue flush and toggle pending_idx. This makes pending_idx |
@@ -334,19 +294,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, | |||
334 | * In case of IO scheduler, flush rq need to borrow scheduler tag | 294 | * In case of IO scheduler, flush rq need to borrow scheduler tag |
335 | * just for cheating put/get driver tag. | 295 | * just for cheating put/get driver tag. |
336 | */ | 296 | */ |
337 | if (q->mq_ops) { | 297 | flush_rq->mq_ctx = first_rq->mq_ctx; |
338 | struct blk_mq_hw_ctx *hctx; | 298 | flush_rq->mq_hctx = first_rq->mq_hctx; |
339 | 299 | ||
340 | flush_rq->mq_ctx = first_rq->mq_ctx; | 300 | if (!q->elevator) { |
341 | 301 | fq->orig_rq = first_rq; | |
342 | if (!q->elevator) { | 302 | flush_rq->tag = first_rq->tag; |
343 | fq->orig_rq = first_rq; | 303 | blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); |
344 | flush_rq->tag = first_rq->tag; | 304 | } else { |
345 | hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); | 305 | flush_rq->internal_tag = first_rq->internal_tag; |
346 | blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); | ||
347 | } else { | ||
348 | flush_rq->internal_tag = first_rq->internal_tag; | ||
349 | } | ||
350 | } | 306 | } |
351 | 307 | ||
352 | flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; | 308 | flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; |
@@ -355,62 +311,17 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, | |||
355 | flush_rq->rq_disk = first_rq->rq_disk; | 311 | flush_rq->rq_disk = first_rq->rq_disk; |
356 | flush_rq->end_io = flush_end_io; | 312 | flush_rq->end_io = flush_end_io; |
357 | 313 | ||
358 | return blk_flush_queue_rq(flush_rq, false); | 314 | blk_flush_queue_rq(flush_rq, false); |
359 | } | ||
360 | |||
361 | static void flush_data_end_io(struct request *rq, blk_status_t error) | ||
362 | { | ||
363 | struct request_queue *q = rq->q; | ||
364 | struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); | ||
365 | |||
366 | lockdep_assert_held(q->queue_lock); | ||
367 | |||
368 | /* | ||
369 | * Updating q->in_flight[] here for making this tag usable | ||
370 | * early. Because in blk_queue_start_tag(), | ||
371 | * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and | ||
372 | * reserve tags for sync I/O. | ||
373 | * | ||
374 | * More importantly this way can avoid the following I/O | ||
375 | * deadlock: | ||
376 | * | ||
377 | * - suppose there are 40 fua requests coming to the flush queue | ||
378 | * and queue depth is 31 | ||
379 | * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc | ||
380 | * tag for async I/O any more | ||
381 | * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT | ||
382 | * and flush_data_end_io() is called | ||
383 | * - the other rqs still can't go ahead if not updating | ||
384 | * q->in_flight[BLK_RW_ASYNC] here, meantime these rqs | ||
385 | * are held in flush data queue and make no progress of | ||
386 | * handling post flush rq | ||
387 | * - only after the post flush rq is handled, all these rqs | ||
388 | * can be completed | ||
389 | */ | ||
390 | |||
391 | elv_completed_request(q, rq); | ||
392 | |||
393 | /* for avoiding double accounting */ | ||
394 | rq->rq_flags &= ~RQF_STARTED; | ||
395 | |||
396 | /* | ||
397 | * After populating an empty queue, kick it to avoid stall. Read | ||
398 | * the comment in flush_end_io(). | ||
399 | */ | ||
400 | if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) | ||
401 | blk_run_queue_async(q); | ||
402 | } | 315 | } |
403 | 316 | ||
404 | static void mq_flush_data_end_io(struct request *rq, blk_status_t error) | 317 | static void mq_flush_data_end_io(struct request *rq, blk_status_t error) |
405 | { | 318 | { |
406 | struct request_queue *q = rq->q; | 319 | struct request_queue *q = rq->q; |
407 | struct blk_mq_hw_ctx *hctx; | 320 | struct blk_mq_hw_ctx *hctx = rq->mq_hctx; |
408 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 321 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
409 | unsigned long flags; | 322 | unsigned long flags; |
410 | struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); | 323 | struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); |
411 | 324 | ||
412 | hctx = blk_mq_map_queue(q, ctx->cpu); | ||
413 | |||
414 | if (q->elevator) { | 325 | if (q->elevator) { |
415 | WARN_ON(rq->tag < 0); | 326 | WARN_ON(rq->tag < 0); |
416 | blk_mq_put_driver_tag_hctx(hctx, rq); | 327 | blk_mq_put_driver_tag_hctx(hctx, rq); |
@@ -443,9 +354,6 @@ void blk_insert_flush(struct request *rq) | |||
443 | unsigned int policy = blk_flush_policy(fflags, rq); | 354 | unsigned int policy = blk_flush_policy(fflags, rq); |
444 | struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); | 355 | struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); |
445 | 356 | ||
446 | if (!q->mq_ops) | ||
447 | lockdep_assert_held(q->queue_lock); | ||
448 | |||
449 | /* | 357 | /* |
450 | * @policy now records what operations need to be done. Adjust | 358 | * @policy now records what operations need to be done. Adjust |
451 | * REQ_PREFLUSH and FUA for the driver. | 359 | * REQ_PREFLUSH and FUA for the driver. |
@@ -468,10 +376,7 @@ void blk_insert_flush(struct request *rq) | |||
468 | * complete the request. | 376 | * complete the request. |
469 | */ | 377 | */ |
470 | if (!policy) { | 378 | if (!policy) { |
471 | if (q->mq_ops) | 379 | blk_mq_end_request(rq, 0); |
472 | blk_mq_end_request(rq, 0); | ||
473 | else | ||
474 | __blk_end_request(rq, 0, 0); | ||
475 | return; | 380 | return; |
476 | } | 381 | } |
477 | 382 | ||
@@ -484,10 +389,7 @@ void blk_insert_flush(struct request *rq) | |||
484 | */ | 389 | */ |
485 | if ((policy & REQ_FSEQ_DATA) && | 390 | if ((policy & REQ_FSEQ_DATA) && |
486 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { | 391 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { |
487 | if (q->mq_ops) | 392 | blk_mq_request_bypass_insert(rq, false); |
488 | blk_mq_request_bypass_insert(rq, false); | ||
489 | else | ||
490 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
491 | return; | 393 | return; |
492 | } | 394 | } |
493 | 395 | ||
@@ -499,17 +401,12 @@ void blk_insert_flush(struct request *rq) | |||
499 | INIT_LIST_HEAD(&rq->flush.list); | 401 | INIT_LIST_HEAD(&rq->flush.list); |
500 | rq->rq_flags |= RQF_FLUSH_SEQ; | 402 | rq->rq_flags |= RQF_FLUSH_SEQ; |
501 | rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ | 403 | rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ |
502 | if (q->mq_ops) { | ||
503 | rq->end_io = mq_flush_data_end_io; | ||
504 | 404 | ||
505 | spin_lock_irq(&fq->mq_flush_lock); | 405 | rq->end_io = mq_flush_data_end_io; |
506 | blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); | ||
507 | spin_unlock_irq(&fq->mq_flush_lock); | ||
508 | return; | ||
509 | } | ||
510 | rq->end_io = flush_data_end_io; | ||
511 | 406 | ||
407 | spin_lock_irq(&fq->mq_flush_lock); | ||
512 | blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); | 408 | blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); |
409 | spin_unlock_irq(&fq->mq_flush_lock); | ||
513 | } | 410 | } |
514 | 411 | ||
515 | /** | 412 | /** |
@@ -575,8 +472,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, | |||
575 | if (!fq) | 472 | if (!fq) |
576 | goto fail; | 473 | goto fail; |
577 | 474 | ||
578 | if (q->mq_ops) | 475 | spin_lock_init(&fq->mq_flush_lock); |
579 | spin_lock_init(&fq->mq_flush_lock); | ||
580 | 476 | ||
581 | rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); | 477 | rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); |
582 | fq->flush_rq = kzalloc_node(rq_sz, flags, node); | 478 | fq->flush_rq = kzalloc_node(rq_sz, flags, node); |
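
Across blk-flush.c the single-queue branches collapse: the queued/kicked return values disappear because requests are always pushed through the blk-mq requeue list, and everything runs under fq->mq_flush_lock. The per-request decision about which flush steps are needed is unchanged; the userspace sketch below models it, loosely following blk_flush_policy() and the short-circuits visible in blk_insert_flush(). The flag bits and struct fields are invented stand-ins, not kernel definitions.

/* Decide which of PREFLUSH -> DATA -> POSTFLUSH a write needs, given
 * the queue's cache features, then mirror blk_insert_flush()'s
 * short-circuit paths. */
#include <stdio.h>

#define FSEQ_PREFLUSH	(1 << 0)
#define FSEQ_DATA	(1 << 1)
#define FSEQ_POSTFLUSH	(1 << 2)

struct fake_queue { int has_wb_cache; int has_fua; };
struct fake_req   { int has_data; int wants_preflush; int wants_fua; };

static unsigned int flush_policy(const struct fake_queue *q,
				 const struct fake_req *rq)
{
	unsigned int policy = 0;

	if (rq->has_data)
		policy |= FSEQ_DATA;
	if (q->has_wb_cache) {
		if (rq->wants_preflush)
			policy |= FSEQ_PREFLUSH;
		/* a device with native FUA handles the post-write flush itself */
		if (!q->has_fua && rq->wants_fua)
			policy |= FSEQ_POSTFLUSH;
	}
	return policy;
}

int main(void)
{
	struct fake_queue q = { .has_wb_cache = 1, .has_fua = 0 };
	struct fake_req rq = { .has_data = 1, .wants_preflush = 1, .wants_fua = 1 };
	unsigned int policy = flush_policy(&q, &rq);

	if (!policy)
		puts("no flush work: end the request immediately");
	else if (policy == FSEQ_DATA)
		puts("data only: bypass-insert, skip the flush state machine");
	else
		puts("run the flush sequence under fq->mq_flush_lock");
	return 0;
}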
diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 01580f88fcb3..5ed59ac6ae58 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c | |||
@@ -28,7 +28,6 @@ void get_io_context(struct io_context *ioc) | |||
28 | BUG_ON(atomic_long_read(&ioc->refcount) <= 0); | 28 | BUG_ON(atomic_long_read(&ioc->refcount) <= 0); |
29 | atomic_long_inc(&ioc->refcount); | 29 | atomic_long_inc(&ioc->refcount); |
30 | } | 30 | } |
31 | EXPORT_SYMBOL(get_io_context); | ||
32 | 31 | ||
33 | static void icq_free_icq_rcu(struct rcu_head *head) | 32 | static void icq_free_icq_rcu(struct rcu_head *head) |
34 | { | 33 | { |
@@ -48,10 +47,8 @@ static void ioc_exit_icq(struct io_cq *icq) | |||
48 | if (icq->flags & ICQ_EXITED) | 47 | if (icq->flags & ICQ_EXITED) |
49 | return; | 48 | return; |
50 | 49 | ||
51 | if (et->uses_mq && et->ops.mq.exit_icq) | 50 | if (et->ops.exit_icq) |
52 | et->ops.mq.exit_icq(icq); | 51 | et->ops.exit_icq(icq); |
53 | else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) | ||
54 | et->ops.sq.elevator_exit_icq_fn(icq); | ||
55 | 52 | ||
56 | icq->flags |= ICQ_EXITED; | 53 | icq->flags |= ICQ_EXITED; |
57 | } | 54 | } |
@@ -113,9 +110,9 @@ static void ioc_release_fn(struct work_struct *work) | |||
113 | struct io_cq, ioc_node); | 110 | struct io_cq, ioc_node); |
114 | struct request_queue *q = icq->q; | 111 | struct request_queue *q = icq->q; |
115 | 112 | ||
116 | if (spin_trylock(q->queue_lock)) { | 113 | if (spin_trylock(&q->queue_lock)) { |
117 | ioc_destroy_icq(icq); | 114 | ioc_destroy_icq(icq); |
118 | spin_unlock(q->queue_lock); | 115 | spin_unlock(&q->queue_lock); |
119 | } else { | 116 | } else { |
120 | spin_unlock_irqrestore(&ioc->lock, flags); | 117 | spin_unlock_irqrestore(&ioc->lock, flags); |
121 | cpu_relax(); | 118 | cpu_relax(); |
@@ -162,7 +159,6 @@ void put_io_context(struct io_context *ioc) | |||
162 | if (free_ioc) | 159 | if (free_ioc) |
163 | kmem_cache_free(iocontext_cachep, ioc); | 160 | kmem_cache_free(iocontext_cachep, ioc); |
164 | } | 161 | } |
165 | EXPORT_SYMBOL(put_io_context); | ||
166 | 162 | ||
167 | /** | 163 | /** |
168 | * put_io_context_active - put active reference on ioc | 164 | * put_io_context_active - put active reference on ioc |
@@ -173,7 +169,6 @@ EXPORT_SYMBOL(put_io_context); | |||
173 | */ | 169 | */ |
174 | void put_io_context_active(struct io_context *ioc) | 170 | void put_io_context_active(struct io_context *ioc) |
175 | { | 171 | { |
176 | struct elevator_type *et; | ||
177 | unsigned long flags; | 172 | unsigned long flags; |
178 | struct io_cq *icq; | 173 | struct io_cq *icq; |
179 | 174 | ||
@@ -187,25 +182,12 @@ void put_io_context_active(struct io_context *ioc) | |||
187 | * reverse double locking. Read comment in ioc_release_fn() for | 182 | * reverse double locking. Read comment in ioc_release_fn() for |
188 | * explanation on the nested locking annotation. | 183 | * explanation on the nested locking annotation. |
189 | */ | 184 | */ |
190 | retry: | ||
191 | spin_lock_irqsave_nested(&ioc->lock, flags, 1); | 185 | spin_lock_irqsave_nested(&ioc->lock, flags, 1); |
192 | hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { | 186 | hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { |
193 | if (icq->flags & ICQ_EXITED) | 187 | if (icq->flags & ICQ_EXITED) |
194 | continue; | 188 | continue; |
195 | 189 | ||
196 | et = icq->q->elevator->type; | 190 | ioc_exit_icq(icq); |
197 | if (et->uses_mq) { | ||
198 | ioc_exit_icq(icq); | ||
199 | } else { | ||
200 | if (spin_trylock(icq->q->queue_lock)) { | ||
201 | ioc_exit_icq(icq); | ||
202 | spin_unlock(icq->q->queue_lock); | ||
203 | } else { | ||
204 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
205 | cpu_relax(); | ||
206 | goto retry; | ||
207 | } | ||
208 | } | ||
209 | } | 191 | } |
210 | spin_unlock_irqrestore(&ioc->lock, flags); | 192 | spin_unlock_irqrestore(&ioc->lock, flags); |
211 | 193 | ||
@@ -232,7 +214,7 @@ static void __ioc_clear_queue(struct list_head *icq_list) | |||
232 | 214 | ||
233 | while (!list_empty(icq_list)) { | 215 | while (!list_empty(icq_list)) { |
234 | struct io_cq *icq = list_entry(icq_list->next, | 216 | struct io_cq *icq = list_entry(icq_list->next, |
235 | struct io_cq, q_node); | 217 | struct io_cq, q_node); |
236 | struct io_context *ioc = icq->ioc; | 218 | struct io_context *ioc = icq->ioc; |
237 | 219 | ||
238 | spin_lock_irqsave(&ioc->lock, flags); | 220 | spin_lock_irqsave(&ioc->lock, flags); |
@@ -251,16 +233,11 @@ void ioc_clear_queue(struct request_queue *q) | |||
251 | { | 233 | { |
252 | LIST_HEAD(icq_list); | 234 | LIST_HEAD(icq_list); |
253 | 235 | ||
254 | spin_lock_irq(q->queue_lock); | 236 | spin_lock_irq(&q->queue_lock); |
255 | list_splice_init(&q->icq_list, &icq_list); | 237 | list_splice_init(&q->icq_list, &icq_list); |
238 | spin_unlock_irq(&q->queue_lock); | ||
256 | 239 | ||
257 | if (q->mq_ops) { | 240 | __ioc_clear_queue(&icq_list); |
258 | spin_unlock_irq(q->queue_lock); | ||
259 | __ioc_clear_queue(&icq_list); | ||
260 | } else { | ||
261 | __ioc_clear_queue(&icq_list); | ||
262 | spin_unlock_irq(q->queue_lock); | ||
263 | } | ||
264 | } | 241 | } |
265 | 242 | ||
266 | int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) | 243 | int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) |
@@ -336,7 +313,6 @@ struct io_context *get_task_io_context(struct task_struct *task, | |||
336 | 313 | ||
337 | return NULL; | 314 | return NULL; |
338 | } | 315 | } |
339 | EXPORT_SYMBOL(get_task_io_context); | ||
340 | 316 | ||
341 | /** | 317 | /** |
342 | * ioc_lookup_icq - lookup io_cq from ioc | 318 | * ioc_lookup_icq - lookup io_cq from ioc |
@@ -350,7 +326,7 @@ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) | |||
350 | { | 326 | { |
351 | struct io_cq *icq; | 327 | struct io_cq *icq; |
352 | 328 | ||
353 | lockdep_assert_held(q->queue_lock); | 329 | lockdep_assert_held(&q->queue_lock); |
354 | 330 | ||
355 | /* | 331 | /* |
356 | * icq's are indexed from @ioc using radix tree and hint pointer, | 332 | * icq's are indexed from @ioc using radix tree and hint pointer, |
@@ -409,16 +385,14 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, | |||
409 | INIT_HLIST_NODE(&icq->ioc_node); | 385 | INIT_HLIST_NODE(&icq->ioc_node); |
410 | 386 | ||
411 | /* lock both q and ioc and try to link @icq */ | 387 | /* lock both q and ioc and try to link @icq */ |
412 | spin_lock_irq(q->queue_lock); | 388 | spin_lock_irq(&q->queue_lock); |
413 | spin_lock(&ioc->lock); | 389 | spin_lock(&ioc->lock); |
414 | 390 | ||
415 | if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { | 391 | if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { |
416 | hlist_add_head(&icq->ioc_node, &ioc->icq_list); | 392 | hlist_add_head(&icq->ioc_node, &ioc->icq_list); |
417 | list_add(&icq->q_node, &q->icq_list); | 393 | list_add(&icq->q_node, &q->icq_list); |
418 | if (et->uses_mq && et->ops.mq.init_icq) | 394 | if (et->ops.init_icq) |
419 | et->ops.mq.init_icq(icq); | 395 | et->ops.init_icq(icq); |
420 | else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) | ||
421 | et->ops.sq.elevator_init_icq_fn(icq); | ||
422 | } else { | 396 | } else { |
423 | kmem_cache_free(et->icq_cache, icq); | 397 | kmem_cache_free(et->icq_cache, icq); |
424 | icq = ioc_lookup_icq(ioc, q); | 398 | icq = ioc_lookup_icq(ioc, q); |
@@ -427,7 +401,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, | |||
427 | } | 401 | } |
428 | 402 | ||
429 | spin_unlock(&ioc->lock); | 403 | spin_unlock(&ioc->lock); |
430 | spin_unlock_irq(q->queue_lock); | 404 | spin_unlock_irq(&q->queue_lock); |
431 | radix_tree_preload_end(); | 405 | radix_tree_preload_end(); |
432 | return icq; | 406 | return icq; |
433 | } | 407 | } |
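
The io_context changes drop the uses_mq branches and switch to the embedded &q->queue_lock, but the locking pattern itself survives: ioc_create_icq() takes queue_lock before ioc->lock, while the release side already holds ioc->lock and may only trylock queue_lock, backing off on contention. A small pthread sketch of that pattern, with made-up names and no kernel types:

/* Forward path locks A then B; reverse path holds B and try-locks A,
 * dropping B and retrying instead of deadlocking. */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ioc_lock = PTHREAD_MUTEX_INITIALIZER;

static void create_icq(void)
{
	pthread_mutex_lock(&queue_lock);	/* A */
	pthread_mutex_lock(&ioc_lock);		/* then B */
	puts("linked icq to both the queue and the io_context");
	pthread_mutex_unlock(&ioc_lock);
	pthread_mutex_unlock(&queue_lock);
}

static void release_icq(void)
{
	for (;;) {
		pthread_mutex_lock(&ioc_lock);			/* B */
		if (pthread_mutex_trylock(&queue_lock) == 0)	/* try A */
			break;
		pthread_mutex_unlock(&ioc_lock);
		sched_yield();		/* stand-in for cpu_relax() */
	}
	puts("destroyed icq while holding both locks");
	pthread_mutex_unlock(&queue_lock);
	pthread_mutex_unlock(&ioc_lock);
}

int main(void)
{
	create_icq();
	release_icq();
	return 0;
}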
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 38c35c32aff2..fc714ef402a6 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c | |||
@@ -262,29 +262,25 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, | |||
262 | stat->rqs.mean); | 262 | stat->rqs.mean); |
263 | } | 263 | } |
264 | 264 | ||
265 | static inline bool iolatency_may_queue(struct iolatency_grp *iolat, | 265 | static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) |
266 | wait_queue_entry_t *wait, | ||
267 | bool first_block) | ||
268 | { | 266 | { |
269 | struct rq_wait *rqw = &iolat->rq_wait; | 267 | atomic_dec(&rqw->inflight); |
268 | wake_up(&rqw->wait); | ||
269 | } | ||
270 | 270 | ||
271 | if (first_block && waitqueue_active(&rqw->wait) && | 271 | static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) |
272 | rqw->wait.head.next != &wait->entry) | 272 | { |
273 | return false; | 273 | struct iolatency_grp *iolat = private_data; |
274 | return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); | 274 | return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); |
275 | } | 275 | } |
276 | 276 | ||
277 | static void __blkcg_iolatency_throttle(struct rq_qos *rqos, | 277 | static void __blkcg_iolatency_throttle(struct rq_qos *rqos, |
278 | struct iolatency_grp *iolat, | 278 | struct iolatency_grp *iolat, |
279 | spinlock_t *lock, bool issue_as_root, | 279 | bool issue_as_root, |
280 | bool use_memdelay) | 280 | bool use_memdelay) |
281 | __releases(lock) | ||
282 | __acquires(lock) | ||
283 | { | 281 | { |
284 | struct rq_wait *rqw = &iolat->rq_wait; | 282 | struct rq_wait *rqw = &iolat->rq_wait; |
285 | unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); | 283 | unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); |
286 | DEFINE_WAIT(wait); | ||
287 | bool first_block = true; | ||
288 | 284 | ||
289 | if (use_delay) | 285 | if (use_delay) |
290 | blkcg_schedule_throttle(rqos->q, use_memdelay); | 286 | blkcg_schedule_throttle(rqos->q, use_memdelay); |
@@ -301,27 +297,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, | |||
301 | return; | 297 | return; |
302 | } | 298 | } |
303 | 299 | ||
304 | if (iolatency_may_queue(iolat, &wait, first_block)) | 300 | rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); |
305 | return; | ||
306 | |||
307 | do { | ||
308 | prepare_to_wait_exclusive(&rqw->wait, &wait, | ||
309 | TASK_UNINTERRUPTIBLE); | ||
310 | |||
311 | if (iolatency_may_queue(iolat, &wait, first_block)) | ||
312 | break; | ||
313 | first_block = false; | ||
314 | |||
315 | if (lock) { | ||
316 | spin_unlock_irq(lock); | ||
317 | io_schedule(); | ||
318 | spin_lock_irq(lock); | ||
319 | } else { | ||
320 | io_schedule(); | ||
321 | } | ||
322 | } while (1); | ||
323 | |||
324 | finish_wait(&rqw->wait, &wait); | ||
325 | } | 301 | } |
326 | 302 | ||
327 | #define SCALE_DOWN_FACTOR 2 | 303 | #define SCALE_DOWN_FACTOR 2 |
@@ -478,38 +454,15 @@ static void check_scale_change(struct iolatency_grp *iolat) | |||
478 | scale_change(iolat, direction > 0); | 454 | scale_change(iolat, direction > 0); |
479 | } | 455 | } |
480 | 456 | ||
481 | static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, | 457 | static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) |
482 | spinlock_t *lock) | ||
483 | { | 458 | { |
484 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); | 459 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); |
485 | struct blkcg *blkcg; | 460 | struct blkcg_gq *blkg = bio->bi_blkg; |
486 | struct blkcg_gq *blkg; | ||
487 | struct request_queue *q = rqos->q; | ||
488 | bool issue_as_root = bio_issue_as_root_blkg(bio); | 461 | bool issue_as_root = bio_issue_as_root_blkg(bio); |
489 | 462 | ||
490 | if (!blk_iolatency_enabled(blkiolat)) | 463 | if (!blk_iolatency_enabled(blkiolat)) |
491 | return; | 464 | return; |
492 | 465 | ||
493 | rcu_read_lock(); | ||
494 | blkcg = bio_blkcg(bio); | ||
495 | bio_associate_blkcg(bio, &blkcg->css); | ||
496 | blkg = blkg_lookup(blkcg, q); | ||
497 | if (unlikely(!blkg)) { | ||
498 | if (!lock) | ||
499 | spin_lock_irq(q->queue_lock); | ||
500 | blkg = blkg_lookup_create(blkcg, q); | ||
501 | if (IS_ERR(blkg)) | ||
502 | blkg = NULL; | ||
503 | if (!lock) | ||
504 | spin_unlock_irq(q->queue_lock); | ||
505 | } | ||
506 | if (!blkg) | ||
507 | goto out; | ||
508 | |||
509 | bio_issue_init(&bio->bi_issue, bio_sectors(bio)); | ||
510 | bio_associate_blkg(bio, blkg); | ||
511 | out: | ||
512 | rcu_read_unlock(); | ||
513 | while (blkg && blkg->parent) { | 466 | while (blkg && blkg->parent) { |
514 | struct iolatency_grp *iolat = blkg_to_lat(blkg); | 467 | struct iolatency_grp *iolat = blkg_to_lat(blkg); |
515 | if (!iolat) { | 468 | if (!iolat) { |
@@ -518,7 +471,7 @@ out: | |||
518 | } | 471 | } |
519 | 472 | ||
520 | check_scale_change(iolat); | 473 | check_scale_change(iolat); |
521 | __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, | 474 | __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, |
522 | (bio->bi_opf & REQ_SWAP) == REQ_SWAP); | 475 | (bio->bi_opf & REQ_SWAP) == REQ_SWAP); |
523 | blkg = blkg->parent; | 476 | blkg = blkg->parent; |
524 | } | 477 | } |
@@ -640,7 +593,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) | |||
640 | bool enabled = false; | 593 | bool enabled = false; |
641 | 594 | ||
642 | blkg = bio->bi_blkg; | 595 | blkg = bio->bi_blkg; |
643 | if (!blkg) | 596 | if (!blkg || !bio_flagged(bio, BIO_TRACKED)) |
644 | return; | 597 | return; |
645 | 598 | ||
646 | iolat = blkg_to_lat(bio->bi_blkg); | 599 | iolat = blkg_to_lat(bio->bi_blkg); |
@@ -730,7 +683,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) | |||
730 | * We could be exiting, don't access the pd unless we have a | 683 | * We could be exiting, don't access the pd unless we have a |
731 | * ref on the blkg. | 684 | * ref on the blkg. |
732 | */ | 685 | */ |
733 | if (!blkg_try_get(blkg)) | 686 | if (!blkg_tryget(blkg)) |
734 | continue; | 687 | continue; |
735 | 688 | ||
736 | iolat = blkg_to_lat(blkg); | 689 | iolat = blkg_to_lat(blkg); |
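
blk-iolatency no longer open-codes the waitqueue loop (and no longer needs the caller's queue lock); it hands two callbacks to rq_qos_wait(): one that tries to take an inflight slot while below the group's depth, and one that releases the slot and wakes waiters. A userspace sketch of that callback pair, using C11 atomics and invented names rather than the kernel's rq_wait helpers:

/* "acquire" increments inflight only while below max_depth (compare
 * rq_wait_inc_below()); "cleanup" undoes it, where the kernel would
 * also wake_up() the rqw waitqueue. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_rqw {
	atomic_uint inflight;
	unsigned int max_depth;
};

static bool acquire_inflight(struct fake_rqw *rqw)
{
	unsigned int cur = atomic_load(&rqw->inflight);

	do {
		if (cur >= rqw->max_depth)
			return false;	/* over the limit: caller must sleep */
	} while (!atomic_compare_exchange_weak(&rqw->inflight, &cur, cur + 1));
	return true;
}

static void cleanup_inflight(struct fake_rqw *rqw)
{
	atomic_fetch_sub(&rqw->inflight, 1);
}

int main(void)
{
	struct fake_rqw rqw = { .max_depth = 2 };
	int i;

	atomic_init(&rqw.inflight, 0);
	for (i = 0; i < 3; i++)
		printf("acquire #%d -> %s\n", i,
		       acquire_inflight(&rqw) ? "ok" : "wait");
	cleanup_inflight(&rqw);
	printf("after cleanup -> %s\n", acquire_inflight(&rqw) ? "ok" : "wait");
	return 0;
}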
diff --git a/block/blk-lib.c b/block/blk-lib.c index 76f867ea9a9b..5f2c429d4378 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -51,16 +51,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
51 | if ((sector | nr_sects) & bs_mask) | 51 | if ((sector | nr_sects) & bs_mask) |
52 | return -EINVAL; | 52 | return -EINVAL; |
53 | 53 | ||
54 | while (nr_sects) { | 54 | if (!nr_sects) |
55 | unsigned int req_sects = nr_sects; | 55 | return -EINVAL; |
56 | sector_t end_sect; | ||
57 | 56 | ||
58 | if (!req_sects) | 57 | while (nr_sects) { |
59 | goto fail; | 58 | sector_t req_sects = min_t(sector_t, nr_sects, |
60 | if (req_sects > UINT_MAX >> 9) | 59 | bio_allowed_max_sectors(q)); |
61 | req_sects = UINT_MAX >> 9; | ||
62 | 60 | ||
63 | end_sect = sector + req_sects; | 61 | WARN_ON_ONCE((req_sects << 9) > UINT_MAX); |
64 | 62 | ||
65 | bio = blk_next_bio(bio, 0, gfp_mask); | 63 | bio = blk_next_bio(bio, 0, gfp_mask); |
66 | bio->bi_iter.bi_sector = sector; | 64 | bio->bi_iter.bi_sector = sector; |
@@ -68,8 +66,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
68 | bio_set_op_attrs(bio, op, 0); | 66 | bio_set_op_attrs(bio, op, 0); |
69 | 67 | ||
70 | bio->bi_iter.bi_size = req_sects << 9; | 68 | bio->bi_iter.bi_size = req_sects << 9; |
69 | sector += req_sects; | ||
71 | nr_sects -= req_sects; | 70 | nr_sects -= req_sects; |
72 | sector = end_sect; | ||
73 | 71 | ||
74 | /* | 72 | /* |
75 | * We can loop for a long time in here, if someone does | 73 | * We can loop for a long time in here, if someone does |
@@ -82,14 +80,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
82 | 80 | ||
83 | *biop = bio; | 81 | *biop = bio; |
84 | return 0; | 82 | return 0; |
85 | |||
86 | fail: | ||
87 | if (bio) { | ||
88 | submit_bio_wait(bio); | ||
89 | bio_put(bio); | ||
90 | } | ||
91 | *biop = NULL; | ||
92 | return -EOPNOTSUPP; | ||
93 | } | 83 | } |
94 | EXPORT_SYMBOL(__blkdev_issue_discard); | 84 | EXPORT_SYMBOL(__blkdev_issue_discard); |
95 | 85 | ||
@@ -161,7 +151,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, | |||
161 | return -EOPNOTSUPP; | 151 | return -EOPNOTSUPP; |
162 | 152 | ||
163 | /* Ensure that max_write_same_sectors doesn't overflow bi_size */ | 153 | /* Ensure that max_write_same_sectors doesn't overflow bi_size */ |
164 | max_write_same_sectors = UINT_MAX >> 9; | 154 | max_write_same_sectors = bio_allowed_max_sectors(q); |
165 | 155 | ||
166 | while (nr_sects) { | 156 | while (nr_sects) { |
167 | bio = blk_next_bio(bio, 1, gfp_mask); | 157 | bio = blk_next_bio(bio, 1, gfp_mask); |
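
__blkdev_issue_discard() now rejects an empty range up front and simply peels off per-bio chunks capped by bio_allowed_max_sectors(), dropping the old fail path. A userspace sketch of the chunking loop; the cap below is illustrative only (the kernel derives it from UINT_MAX >> 9, rounded to the logical block size):

/* Split a sector range into capped per-bio chunks, mirroring the
 * simplified discard loop above. */
#include <stdint.h>
#include <stdio.h>

#define MAX_BIO_SECTORS	(UINT32_MAX >> 9)	/* illustrative cap */

static int issue_discard(uint64_t sector, uint64_t nr_sects)
{
	if (!nr_sects)
		return -1;	/* models the new early -EINVAL */

	while (nr_sects) {
		uint64_t req_sects = nr_sects < MAX_BIO_SECTORS ?
				     nr_sects : MAX_BIO_SECTORS;

		printf("discard bio: sector %llu, %llu sectors\n",
		       (unsigned long long)sector,
		       (unsigned long long)req_sects);
		sector += req_sects;
		nr_sects -= req_sects;
	}
	return 0;
}

int main(void)
{
	/* 2^33 sectors (~4 TiB at 512 bytes) split into capped bios */
	return issue_discard(2048, 1ULL << 33);
}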
diff --git a/block/blk-merge.c b/block/blk-merge.c index 4478d53cc6ee..71e9ac03f621 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -46,7 +46,7 @@ static inline bool bio_will_gap(struct request_queue *q, | |||
46 | bio_get_first_bvec(prev_rq->bio, &pb); | 46 | bio_get_first_bvec(prev_rq->bio, &pb); |
47 | else | 47 | else |
48 | bio_get_first_bvec(prev, &pb); | 48 | bio_get_first_bvec(prev, &pb); |
49 | if (pb.bv_offset) | 49 | if (pb.bv_offset & queue_virt_boundary(q)) |
50 | return true; | 50 | return true; |
51 | 51 | ||
52 | /* | 52 | /* |
@@ -90,7 +90,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, | |||
90 | /* Zero-sector (unknown) and one-sector granularities are the same. */ | 90 | /* Zero-sector (unknown) and one-sector granularities are the same. */ |
91 | granularity = max(q->limits.discard_granularity >> 9, 1U); | 91 | granularity = max(q->limits.discard_granularity >> 9, 1U); |
92 | 92 | ||
93 | max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); | 93 | max_discard_sectors = min(q->limits.max_discard_sectors, |
94 | bio_allowed_max_sectors(q)); | ||
94 | max_discard_sectors -= max_discard_sectors % granularity; | 95 | max_discard_sectors -= max_discard_sectors % granularity; |
95 | 96 | ||
96 | if (unlikely(!max_discard_sectors)) { | 97 | if (unlikely(!max_discard_sectors)) { |
@@ -387,7 +388,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) | |||
387 | 388 | ||
388 | bio_set_flag(bio, BIO_SEG_VALID); | 389 | bio_set_flag(bio, BIO_SEG_VALID); |
389 | } | 390 | } |
390 | EXPORT_SYMBOL(blk_recount_segments); | ||
391 | 391 | ||
392 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | 392 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, |
393 | struct bio *nxt) | 393 | struct bio *nxt) |
@@ -591,17 +591,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, | |||
591 | return ll_new_hw_segment(q, req, bio); | 591 | return ll_new_hw_segment(q, req, bio); |
592 | } | 592 | } |
593 | 593 | ||
594 | /* | ||
595 | * blk-mq uses req->special to carry normal driver per-request payload, it | ||
596 | * does not indicate a prepared command that we cannot merge with. | ||
597 | */ | ||
598 | static bool req_no_special_merge(struct request *req) | ||
599 | { | ||
600 | struct request_queue *q = req->q; | ||
601 | |||
602 | return !q->mq_ops && req->special; | ||
603 | } | ||
604 | |||
605 | static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, | 594 | static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, |
606 | struct request *next) | 595 | struct request *next) |
607 | { | 596 | { |
@@ -627,13 +616,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | |||
627 | unsigned int seg_size = | 616 | unsigned int seg_size = |
628 | req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; | 617 | req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; |
629 | 618 | ||
630 | /* | ||
631 | * First check if the either of the requests are re-queued | ||
632 | * requests. Can't merge them if they are. | ||
633 | */ | ||
634 | if (req_no_special_merge(req) || req_no_special_merge(next)) | ||
635 | return 0; | ||
636 | |||
637 | if (req_gap_back_merge(req, next->bio)) | 619 | if (req_gap_back_merge(req, next->bio)) |
638 | return 0; | 620 | return 0; |
639 | 621 | ||
@@ -698,12 +680,10 @@ static void blk_account_io_merge(struct request *req) | |||
698 | { | 680 | { |
699 | if (blk_do_io_stat(req)) { | 681 | if (blk_do_io_stat(req)) { |
700 | struct hd_struct *part; | 682 | struct hd_struct *part; |
701 | int cpu; | ||
702 | 683 | ||
703 | cpu = part_stat_lock(); | 684 | part_stat_lock(); |
704 | part = req->part; | 685 | part = req->part; |
705 | 686 | ||
706 | part_round_stats(req->q, cpu, part); | ||
707 | part_dec_in_flight(req->q, part, rq_data_dir(req)); | 687 | part_dec_in_flight(req->q, part, rq_data_dir(req)); |
708 | 688 | ||
709 | hd_struct_put(part); | 689 | hd_struct_put(part); |
@@ -726,7 +706,8 @@ static inline bool blk_discard_mergable(struct request *req) | |||
726 | return false; | 706 | return false; |
727 | } | 707 | } |
728 | 708 | ||
729 | enum elv_merge blk_try_req_merge(struct request *req, struct request *next) | 709 | static enum elv_merge blk_try_req_merge(struct request *req, |
710 | struct request *next) | ||
730 | { | 711 | { |
731 | if (blk_discard_mergable(req)) | 712 | if (blk_discard_mergable(req)) |
732 | return ELEVATOR_DISCARD_MERGE; | 713 | return ELEVATOR_DISCARD_MERGE; |
@@ -743,9 +724,6 @@ enum elv_merge blk_try_req_merge(struct request *req, struct request *next) | |||
743 | static struct request *attempt_merge(struct request_queue *q, | 724 | static struct request *attempt_merge(struct request_queue *q, |
744 | struct request *req, struct request *next) | 725 | struct request *req, struct request *next) |
745 | { | 726 | { |
746 | if (!q->mq_ops) | ||
747 | lockdep_assert_held(q->queue_lock); | ||
748 | |||
749 | if (!rq_mergeable(req) || !rq_mergeable(next)) | 727 | if (!rq_mergeable(req) || !rq_mergeable(next)) |
750 | return NULL; | 728 | return NULL; |
751 | 729 | ||
@@ -753,8 +731,7 @@ static struct request *attempt_merge(struct request_queue *q, | |||
753 | return NULL; | 731 | return NULL; |
754 | 732 | ||
755 | if (rq_data_dir(req) != rq_data_dir(next) | 733 | if (rq_data_dir(req) != rq_data_dir(next) |
756 | || req->rq_disk != next->rq_disk | 734 | || req->rq_disk != next->rq_disk) |
757 | || req_no_special_merge(next)) | ||
758 | return NULL; | 735 | return NULL; |
759 | 736 | ||
760 | if (req_op(req) == REQ_OP_WRITE_SAME && | 737 | if (req_op(req) == REQ_OP_WRITE_SAME && |
@@ -768,6 +745,9 @@ static struct request *attempt_merge(struct request_queue *q, | |||
768 | if (req->write_hint != next->write_hint) | 745 | if (req->write_hint != next->write_hint) |
769 | return NULL; | 746 | return NULL; |
770 | 747 | ||
748 | if (req->ioprio != next->ioprio) | ||
749 | return NULL; | ||
750 | |||
771 | /* | 751 | /* |
772 | * If we are allowed to merge, then append bio list | 752 | * If we are allowed to merge, then append bio list |
773 | * from next to rq and release next. merge_requests_fn | 753 | * from next to rq and release next. merge_requests_fn |
@@ -815,7 +795,7 @@ static struct request *attempt_merge(struct request_queue *q, | |||
815 | 795 | ||
816 | req->__data_len += blk_rq_bytes(next); | 796 | req->__data_len += blk_rq_bytes(next); |
817 | 797 | ||
818 | if (req_op(req) != REQ_OP_DISCARD) | 798 | if (!blk_discard_mergable(req)) |
819 | elv_merge_requests(q, req, next); | 799 | elv_merge_requests(q, req, next); |
820 | 800 | ||
821 | /* | 801 | /* |
@@ -823,10 +803,6 @@ static struct request *attempt_merge(struct request_queue *q, | |||
823 | */ | 803 | */ |
824 | blk_account_io_merge(next); | 804 | blk_account_io_merge(next); |
825 | 805 | ||
826 | req->ioprio = ioprio_best(req->ioprio, next->ioprio); | ||
827 | if (blk_rq_cpu_valid(next)) | ||
828 | req->cpu = next->cpu; | ||
829 | |||
830 | /* | 806 | /* |
831 | * ownership of bio passed from next to req, return 'next' for | 807 | * ownership of bio passed from next to req, return 'next' for |
832 | * the caller to free | 808 | * the caller to free |
@@ -858,16 +834,11 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq) | |||
858 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | 834 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, |
859 | struct request *next) | 835 | struct request *next) |
860 | { | 836 | { |
861 | struct elevator_queue *e = q->elevator; | ||
862 | struct request *free; | 837 | struct request *free; |
863 | 838 | ||
864 | if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn) | ||
865 | if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) | ||
866 | return 0; | ||
867 | |||
868 | free = attempt_merge(q, rq, next); | 839 | free = attempt_merge(q, rq, next); |
869 | if (free) { | 840 | if (free) { |
870 | __blk_put_request(q, free); | 841 | blk_put_request(free); |
871 | return 1; | 842 | return 1; |
872 | } | 843 | } |
873 | 844 | ||
@@ -886,8 +857,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) | |||
886 | if (bio_data_dir(bio) != rq_data_dir(rq)) | 857 | if (bio_data_dir(bio) != rq_data_dir(rq)) |
887 | return false; | 858 | return false; |
888 | 859 | ||
889 | /* must be same device and not a special request */ | 860 | /* must be same device */ |
890 | if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) | 861 | if (rq->rq_disk != bio->bi_disk) |
891 | return false; | 862 | return false; |
892 | 863 | ||
893 | /* only merge integrity protected bio into ditto rq */ | 864 | /* only merge integrity protected bio into ditto rq */ |
@@ -906,6 +877,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) | |||
906 | if (rq->write_hint != bio->bi_write_hint) | 877 | if (rq->write_hint != bio->bi_write_hint) |
907 | return false; | 878 | return false; |
908 | 879 | ||
880 | if (rq->ioprio != bio_prio(bio)) | ||
881 | return false; | ||
882 | |||
909 | return true; | 883 | return true; |
910 | } | 884 | } |
911 | 885 | ||
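
With req->special gone from the merge path, the remaining gate is a set of symmetric equality checks that now includes ioprio: rather than blending priorities with ioprio_best() after a merge, requests with different priorities simply never merge. A userspace sketch of that predicate; struct fake_rq is a stand-in, not struct request:

/* Mirror the attempt_merge()/blk_rq_merge_ok() checks visible above. */
#include <stdbool.h>
#include <stdio.h>

struct fake_rq {
	int dir;	/* 0 = read, 1 = write */
	int disk_id;	/* stand-in for rq->rq_disk */
	int write_hint;
	int ioprio;
};

static bool can_merge(const struct fake_rq *req, const struct fake_rq *next)
{
	if (req->dir != next->dir)
		return false;
	if (req->disk_id != next->disk_id)
		return false;
	if (req->write_hint != next->write_hint)
		return false;
	if (req->ioprio != next->ioprio)	/* new in this patch */
		return false;
	return true;
}

int main(void)
{
	struct fake_rq a = { 1, 0, 0, 4 }, b = { 1, 0, 0, 4 }, c = { 1, 0, 0, 0 };

	printf("a+b mergeable: %d\n", can_merge(&a, &b));	/* 1 */
	printf("a+c mergeable: %d\n", can_merge(&a, &c));	/* 0: ioprio differs */
	return 0;
}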
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 3eb169f15842..03a534820271 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c | |||
@@ -14,9 +14,10 @@ | |||
14 | #include "blk.h" | 14 | #include "blk.h" |
15 | #include "blk-mq.h" | 15 | #include "blk-mq.h" |
16 | 16 | ||
17 | static int cpu_to_queue_index(unsigned int nr_queues, const int cpu) | 17 | static int cpu_to_queue_index(struct blk_mq_queue_map *qmap, |
18 | unsigned int nr_queues, const int cpu) | ||
18 | { | 19 | { |
19 | return cpu % nr_queues; | 20 | return qmap->queue_offset + (cpu % nr_queues); |
20 | } | 21 | } |
21 | 22 | ||
22 | static int get_first_sibling(unsigned int cpu) | 23 | static int get_first_sibling(unsigned int cpu) |
@@ -30,10 +31,10 @@ static int get_first_sibling(unsigned int cpu) | |||
30 | return cpu; | 31 | return cpu; |
31 | } | 32 | } |
32 | 33 | ||
33 | int blk_mq_map_queues(struct blk_mq_tag_set *set) | 34 | int blk_mq_map_queues(struct blk_mq_queue_map *qmap) |
34 | { | 35 | { |
35 | unsigned int *map = set->mq_map; | 36 | unsigned int *map = qmap->mq_map; |
36 | unsigned int nr_queues = set->nr_hw_queues; | 37 | unsigned int nr_queues = qmap->nr_queues; |
37 | unsigned int cpu, first_sibling; | 38 | unsigned int cpu, first_sibling; |
38 | 39 | ||
39 | for_each_possible_cpu(cpu) { | 40 | for_each_possible_cpu(cpu) { |
@@ -44,11 +45,11 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set) | |||
44 | * performace optimizations. | 45 | * performace optimizations. |
45 | */ | 46 | */ |
46 | if (cpu < nr_queues) { | 47 | if (cpu < nr_queues) { |
47 | map[cpu] = cpu_to_queue_index(nr_queues, cpu); | 48 | map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); |
48 | } else { | 49 | } else { |
49 | first_sibling = get_first_sibling(cpu); | 50 | first_sibling = get_first_sibling(cpu); |
50 | if (first_sibling == cpu) | 51 | if (first_sibling == cpu) |
51 | map[cpu] = cpu_to_queue_index(nr_queues, cpu); | 52 | map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); |
52 | else | 53 | else |
53 | map[cpu] = map[first_sibling]; | 54 | map[cpu] = map[first_sibling]; |
54 | } | 55 | } |
@@ -62,12 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_map_queues); | |||
62 | * We have no quick way of doing reverse lookups. This is only used at | 63 | * We have no quick way of doing reverse lookups. This is only used at |
63 | * queue init time, so runtime isn't important. | 64 | * queue init time, so runtime isn't important. |
64 | */ | 65 | */ |
65 | int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) | 66 | int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) |
66 | { | 67 | { |
67 | int i; | 68 | int i; |
68 | 69 | ||
69 | for_each_possible_cpu(i) { | 70 | for_each_possible_cpu(i) { |
70 | if (index == mq_map[i]) | 71 | if (index == qmap->mq_map[i]) |
71 | return local_memory_node(cpu_to_node(i)); | 72 | return local_memory_node(cpu_to_node(i)); |
72 | } | 73 | } |
73 | 74 | ||
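
blk_mq_map_queues() now works on a struct blk_mq_queue_map, so each CPU lands on queue_offset + (cpu % nr_queues); CPUs beyond nr_queues reuse the mapping of their first hyperthread sibling so the threads of one core share a hardware queue. A userspace sketch with an invented sibling table:

/* Build a cpu -> hw queue map the way the hunks above do, for a
 * hypothetical 8-CPU, 4-queue setup with queue_offset = 2. */
#include <stdio.h>

#define NR_CPUS 8

struct fake_qmap {
	unsigned int mq_map[NR_CPUS];
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/* first_sibling[cpu]: lowest-numbered CPU sharing a core with cpu */
static const int first_sibling[NR_CPUS] = { 0, 1, 2, 3, 0, 1, 2, 3 };

static unsigned int cpu_to_queue_index(const struct fake_qmap *qmap, int cpu)
{
	return qmap->queue_offset + (cpu % qmap->nr_queues);
}

static void map_queues(struct fake_qmap *qmap)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if ((unsigned int)cpu < qmap->nr_queues)
			qmap->mq_map[cpu] = cpu_to_queue_index(qmap, cpu);
		else if (first_sibling[cpu] == cpu)
			qmap->mq_map[cpu] = cpu_to_queue_index(qmap, cpu);
		else
			qmap->mq_map[cpu] = qmap->mq_map[first_sibling[cpu]];
	}
}

int main(void)
{
	struct fake_qmap qmap = { .nr_queues = 4, .queue_offset = 2 };
	int cpu;

	map_queues(&qmap);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d -> hw queue %u\n", cpu, qmap.mq_map[cpu]);
	return 0;
}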
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 10b284a1f18d..90d68760af08 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include "blk-mq.h" | 23 | #include "blk-mq.h" |
24 | #include "blk-mq-debugfs.h" | 24 | #include "blk-mq-debugfs.h" |
25 | #include "blk-mq-tag.h" | 25 | #include "blk-mq-tag.h" |
26 | #include "blk-rq-qos.h" | ||
26 | 27 | ||
27 | static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) | 28 | static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) |
28 | { | 29 | { |
@@ -112,10 +113,8 @@ static int queue_pm_only_show(void *data, struct seq_file *m) | |||
112 | 113 | ||
113 | #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name | 114 | #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name |
114 | static const char *const blk_queue_flag_name[] = { | 115 | static const char *const blk_queue_flag_name[] = { |
115 | QUEUE_FLAG_NAME(QUEUED), | ||
116 | QUEUE_FLAG_NAME(STOPPED), | 116 | QUEUE_FLAG_NAME(STOPPED), |
117 | QUEUE_FLAG_NAME(DYING), | 117 | QUEUE_FLAG_NAME(DYING), |
118 | QUEUE_FLAG_NAME(BYPASS), | ||
119 | QUEUE_FLAG_NAME(BIDI), | 118 | QUEUE_FLAG_NAME(BIDI), |
120 | QUEUE_FLAG_NAME(NOMERGES), | 119 | QUEUE_FLAG_NAME(NOMERGES), |
121 | QUEUE_FLAG_NAME(SAME_COMP), | 120 | QUEUE_FLAG_NAME(SAME_COMP), |
@@ -318,7 +317,6 @@ static const char *const cmd_flag_name[] = { | |||
318 | static const char *const rqf_name[] = { | 317 | static const char *const rqf_name[] = { |
319 | RQF_NAME(SORTED), | 318 | RQF_NAME(SORTED), |
320 | RQF_NAME(STARTED), | 319 | RQF_NAME(STARTED), |
321 | RQF_NAME(QUEUED), | ||
322 | RQF_NAME(SOFTBARRIER), | 320 | RQF_NAME(SOFTBARRIER), |
323 | RQF_NAME(FLUSH_SEQ), | 321 | RQF_NAME(FLUSH_SEQ), |
324 | RQF_NAME(MIXED_MERGE), | 322 | RQF_NAME(MIXED_MERGE), |
@@ -424,15 +422,18 @@ struct show_busy_params { | |||
424 | 422 | ||
425 | /* | 423 | /* |
426 | * Note: the state of a request may change while this function is in progress, | 424 | * Note: the state of a request may change while this function is in progress, |
427 | * e.g. due to a concurrent blk_mq_finish_request() call. | 425 | * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to |
426 | * keep iterating requests. | ||
428 | */ | 427 | */ |
429 | static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) | 428 | static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) |
430 | { | 429 | { |
431 | const struct show_busy_params *params = data; | 430 | const struct show_busy_params *params = data; |
432 | 431 | ||
433 | if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx) | 432 | if (rq->mq_hctx == params->hctx) |
434 | __blk_mq_debugfs_rq_show(params->m, | 433 | __blk_mq_debugfs_rq_show(params->m, |
435 | list_entry_rq(&rq->queuelist)); | 434 | list_entry_rq(&rq->queuelist)); |
435 | |||
436 | return true; | ||
436 | } | 437 | } |
437 | 438 | ||
438 | static int hctx_busy_show(void *data, struct seq_file *m) | 439 | static int hctx_busy_show(void *data, struct seq_file *m) |
@@ -446,6 +447,21 @@ static int hctx_busy_show(void *data, struct seq_file *m) | |||
446 | return 0; | 447 | return 0; |
447 | } | 448 | } |
448 | 449 | ||
450 | static const char *const hctx_types[] = { | ||
451 | [HCTX_TYPE_DEFAULT] = "default", | ||
452 | [HCTX_TYPE_READ] = "read", | ||
453 | [HCTX_TYPE_POLL] = "poll", | ||
454 | }; | ||
455 | |||
456 | static int hctx_type_show(void *data, struct seq_file *m) | ||
457 | { | ||
458 | struct blk_mq_hw_ctx *hctx = data; | ||
459 | |||
460 | BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES); | ||
461 | seq_printf(m, "%s\n", hctx_types[hctx->type]); | ||
462 | return 0; | ||
463 | } | ||
464 | |||
449 | static int hctx_ctx_map_show(void *data, struct seq_file *m) | 465 | static int hctx_ctx_map_show(void *data, struct seq_file *m) |
450 | { | 466 | { |
451 | struct blk_mq_hw_ctx *hctx = data; | 467 | struct blk_mq_hw_ctx *hctx = data; |
@@ -636,36 +652,43 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m) | |||
636 | return 0; | 652 | return 0; |
637 | } | 653 | } |
638 | 654 | ||
639 | static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) | 655 | #define CTX_RQ_SEQ_OPS(name, type) \ |
640 | __acquires(&ctx->lock) | 656 | static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ |
641 | { | 657 | __acquires(&ctx->lock) \ |
642 | struct blk_mq_ctx *ctx = m->private; | 658 | { \ |
643 | 659 | struct blk_mq_ctx *ctx = m->private; \ | |
644 | spin_lock(&ctx->lock); | 660 | \ |
645 | return seq_list_start(&ctx->rq_list, *pos); | 661 | spin_lock(&ctx->lock); \ |
646 | } | 662 | return seq_list_start(&ctx->rq_lists[type], *pos); \ |
647 | 663 | } \ | |
648 | static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) | 664 | \ |
649 | { | 665 | static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ |
650 | struct blk_mq_ctx *ctx = m->private; | 666 | loff_t *pos) \ |
651 | 667 | { \ | |
652 | return seq_list_next(v, &ctx->rq_list, pos); | 668 | struct blk_mq_ctx *ctx = m->private; \ |
669 | \ | ||
670 | return seq_list_next(v, &ctx->rq_lists[type], pos); \ | ||
671 | } \ | ||
672 | \ | ||
673 | static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \ | ||
674 | __releases(&ctx->lock) \ | ||
675 | { \ | ||
676 | struct blk_mq_ctx *ctx = m->private; \ | ||
677 | \ | ||
678 | spin_unlock(&ctx->lock); \ | ||
679 | } \ | ||
680 | \ | ||
681 | static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ | ||
682 | .start = ctx_##name##_rq_list_start, \ | ||
683 | .next = ctx_##name##_rq_list_next, \ | ||
684 | .stop = ctx_##name##_rq_list_stop, \ | ||
685 | .show = blk_mq_debugfs_rq_show, \ | ||
653 | } | 686 | } |
654 | 687 | ||
655 | static void ctx_rq_list_stop(struct seq_file *m, void *v) | 688 | CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); |
656 | __releases(&ctx->lock) | 689 | CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); |
657 | { | 690 | CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); |
658 | struct blk_mq_ctx *ctx = m->private; | ||
659 | |||
660 | spin_unlock(&ctx->lock); | ||
661 | } | ||
662 | 691 | ||
663 | static const struct seq_operations ctx_rq_list_seq_ops = { | ||
664 | .start = ctx_rq_list_start, | ||
665 | .next = ctx_rq_list_next, | ||
666 | .stop = ctx_rq_list_stop, | ||
667 | .show = blk_mq_debugfs_rq_show, | ||
668 | }; | ||
669 | static int ctx_dispatched_show(void *data, struct seq_file *m) | 692 | static int ctx_dispatched_show(void *data, struct seq_file *m) |
670 | { | 693 | { |
671 | struct blk_mq_ctx *ctx = data; | 694 | struct blk_mq_ctx *ctx = data; |
@@ -798,11 +821,14 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { | |||
798 | {"run", 0600, hctx_run_show, hctx_run_write}, | 821 | {"run", 0600, hctx_run_show, hctx_run_write}, |
799 | {"active", 0400, hctx_active_show}, | 822 | {"active", 0400, hctx_active_show}, |
800 | {"dispatch_busy", 0400, hctx_dispatch_busy_show}, | 823 | {"dispatch_busy", 0400, hctx_dispatch_busy_show}, |
824 | {"type", 0400, hctx_type_show}, | ||
801 | {}, | 825 | {}, |
802 | }; | 826 | }; |
803 | 827 | ||
804 | static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { | 828 | static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { |
805 | {"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops}, | 829 | {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, |
830 | {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, | ||
831 | {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, | ||
806 | {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, | 832 | {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, |
807 | {"merged", 0600, ctx_merged_show, ctx_merged_write}, | 833 | {"merged", 0600, ctx_merged_show, ctx_merged_write}, |
808 | {"completed", 0600, ctx_completed_show, ctx_completed_write}, | 834 | {"completed", 0600, ctx_completed_show, ctx_completed_write}, |
@@ -856,6 +882,15 @@ int blk_mq_debugfs_register(struct request_queue *q) | |||
856 | goto err; | 882 | goto err; |
857 | } | 883 | } |
858 | 884 | ||
885 | if (q->rq_qos) { | ||
886 | struct rq_qos *rqos = q->rq_qos; | ||
887 | |||
888 | while (rqos) { | ||
889 | blk_mq_debugfs_register_rqos(rqos); | ||
890 | rqos = rqos->next; | ||
891 | } | ||
892 | } | ||
893 | |||
859 | return 0; | 894 | return 0; |
860 | 895 | ||
861 | err: | 896 | err: |
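The registration hunk above walks the queue's rq_qos policies, which, as the ->next dereference suggests, form a singly linked chain headed at q->rq_qos. A minimal sketch of the same walk as a plain for loop, assuming only that linkage:

    static void blk_mq_debugfs_register_all_rqos(struct request_queue *q)
    {
            struct rq_qos *rqos;

            /* every policy on the chain gets its own debugfs directory */
            for (rqos = q->rq_qos; rqos; rqos = rqos->next)
                    blk_mq_debugfs_register_rqos(rqos);
    }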
@@ -978,6 +1013,50 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q) | |||
978 | q->sched_debugfs_dir = NULL; | 1013 | q->sched_debugfs_dir = NULL; |
979 | } | 1014 | } |
980 | 1015 | ||
1016 | void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) | ||
1017 | { | ||
1018 | debugfs_remove_recursive(rqos->debugfs_dir); | ||
1019 | rqos->debugfs_dir = NULL; | ||
1020 | } | ||
1021 | |||
1022 | int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) | ||
1023 | { | ||
1024 | struct request_queue *q = rqos->q; | ||
1025 | const char *dir_name = rq_qos_id_to_name(rqos->id); | ||
1026 | |||
1027 | if (!q->debugfs_dir) | ||
1028 | return -ENOENT; | ||
1029 | |||
1030 | if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) | ||
1031 | return 0; | ||
1032 | |||
1033 | if (!q->rqos_debugfs_dir) { | ||
1034 | q->rqos_debugfs_dir = debugfs_create_dir("rqos", | ||
1035 | q->debugfs_dir); | ||
1036 | if (!q->rqos_debugfs_dir) | ||
1037 | return -ENOMEM; | ||
1038 | } | ||
1039 | |||
1040 | rqos->debugfs_dir = debugfs_create_dir(dir_name, | ||
1041 | rqos->q->rqos_debugfs_dir); | ||
1042 | if (!rqos->debugfs_dir) | ||
1043 | return -ENOMEM; | ||
1044 | |||
1045 | if (!debugfs_create_files(rqos->debugfs_dir, rqos, | ||
1046 | rqos->ops->debugfs_attrs)) | ||
1047 | goto err; | ||
1048 | return 0; | ||
1049 | err: | ||
1050 | blk_mq_debugfs_unregister_rqos(rqos); | ||
1051 | return -ENOMEM; | ||
1052 | } | ||
1053 | |||
1054 | void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) | ||
1055 | { | ||
1056 | debugfs_remove_recursive(q->rqos_debugfs_dir); | ||
1057 | q->rqos_debugfs_dir = NULL; | ||
1058 | } | ||
1059 | |||
981 | int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, | 1060 | int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, |
982 | struct blk_mq_hw_ctx *hctx) | 1061 | struct blk_mq_hw_ctx *hctx) |
983 | { | 1062 | { |
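blk_mq_debugfs_register_rqos() above only creates files when rqos->ops->debugfs_attrs is set, so a policy opts in by pointing that field at a {name, mode, show} table like the ctx/hctx tables earlier in this file. A hedged sketch of such an opt-in follows; example_rqos_ops and example_id_show are hypothetical names, and only the ->debugfs_attrs field and the show-callback signature are taken from the code shown here.

    static int example_id_show(void *data, struct seq_file *m)
    {
            struct rq_qos *rqos = data;     /* data pointer passed to debugfs_create_files() */

            seq_printf(m, "%d\n", rqos->id);
            return 0;
    }

    static const struct blk_mq_debugfs_attr example_rqos_debugfs_attrs[] = {
            {"id", 0400, example_id_show},
            {},
    };

    static struct rq_qos_ops example_rqos_ops = {
            /* the policy's other callbacks elided */
            .debugfs_attrs  = example_rqos_debugfs_attrs,
    };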
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a9160be12be0..8c9012a578c1 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h | |||
@@ -31,6 +31,10 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q); | |||
31 | int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, | 31 | int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, |
32 | struct blk_mq_hw_ctx *hctx); | 32 | struct blk_mq_hw_ctx *hctx); |
33 | void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); | 33 | void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); |
34 | |||
35 | int blk_mq_debugfs_register_rqos(struct rq_qos *rqos); | ||
36 | void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); | ||
37 | void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q); | ||
34 | #else | 38 | #else |
35 | static inline int blk_mq_debugfs_register(struct request_queue *q) | 39 | static inline int blk_mq_debugfs_register(struct request_queue *q) |
36 | { | 40 | { |
@@ -78,6 +82,19 @@ static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, | |||
78 | static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) | 82 | static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) |
79 | { | 83 | { |
80 | } | 84 | } |
85 | |||
86 | static inline int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) | ||
87 | { | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) | ||
92 | { | ||
93 | } | ||
94 | |||
95 | static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) | ||
96 | { | ||
97 | } | ||
81 | #endif | 98 | #endif |
82 | 99 | ||
83 | #ifdef CONFIG_BLK_DEBUG_FS_ZONED | 100 | #ifdef CONFIG_BLK_DEBUG_FS_ZONED |
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index db644ec624f5..1dce18553984 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c | |||
@@ -31,26 +31,26 @@ | |||
31 | * that maps a queue to the CPUs that have irq affinity for the corresponding | 31 | * that maps a queue to the CPUs that have irq affinity for the corresponding |
32 | * vector. | 32 | * vector. |
33 | */ | 33 | */ |
34 | int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, | 34 | int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, |
35 | int offset) | 35 | int offset) |
36 | { | 36 | { |
37 | const struct cpumask *mask; | 37 | const struct cpumask *mask; |
38 | unsigned int queue, cpu; | 38 | unsigned int queue, cpu; |
39 | 39 | ||
40 | for (queue = 0; queue < set->nr_hw_queues; queue++) { | 40 | for (queue = 0; queue < qmap->nr_queues; queue++) { |
41 | mask = pci_irq_get_affinity(pdev, queue + offset); | 41 | mask = pci_irq_get_affinity(pdev, queue + offset); |
42 | if (!mask) | 42 | if (!mask) |
43 | goto fallback; | 43 | goto fallback; |
44 | 44 | ||
45 | for_each_cpu(cpu, mask) | 45 | for_each_cpu(cpu, mask) |
46 | set->mq_map[cpu] = queue; | 46 | qmap->mq_map[cpu] = qmap->queue_offset + queue; |
47 | } | 47 | } |
48 | 48 | ||
49 | return 0; | 49 | return 0; |
50 | 50 | ||
51 | fallback: | 51 | fallback: |
52 | WARN_ON_ONCE(set->nr_hw_queues > 1); | 52 | WARN_ON_ONCE(qmap->nr_queues > 1); |
53 | blk_mq_clear_mq_map(set); | 53 | blk_mq_clear_mq_map(qmap); |
54 | return 0; | 54 | return 0; |
55 | } | 55 | } |
56 | EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); | 56 | EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); |
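With the qmap-based signature above, a driver's ->map_queues() callback hands over one struct blk_mq_queue_map at a time rather than the whole tag set. A hedged sketch of a single-map caller; the set->map[HCTX_TYPE_DEFAULT] indexing and the queue_offset field are assumed from the rest of this series, and the driver stashing its pci_dev in set->driver_data is an illustrative assumption.

    static int example_map_queues(struct blk_mq_tag_set *set)
    {
            struct pci_dev *pdev = set->driver_data;        /* hypothetical: driver's own pdev */
            struct blk_mq_queue_map *qmap = &set->map[HCTX_TYPE_DEFAULT];

            qmap->queue_offset = 0; /* this map's hw queues start at hctx index 0 */
            return blk_mq_pci_map_queues(qmap, pdev, 0 /* pre-vector offset */);
    }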
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c index 996167f1de18..45030a81a1ed 100644 --- a/block/blk-mq-rdma.c +++ b/block/blk-mq-rdma.c | |||
@@ -29,24 +29,24 @@ | |||
29 | * @set->nr_hw_queues, or @dev does not provide an affinity mask for a | 29 | * @set->nr_hw_queues, or @dev does not provide an affinity mask for a |
30 | * vector, we fallback to the naive mapping. | 30 | * vector, we fallback to the naive mapping. |
31 | */ | 31 | */ |
32 | int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, | 32 | int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, |
33 | struct ib_device *dev, int first_vec) | 33 | struct ib_device *dev, int first_vec) |
34 | { | 34 | { |
35 | const struct cpumask *mask; | 35 | const struct cpumask *mask; |
36 | unsigned int queue, cpu; | 36 | unsigned int queue, cpu; |
37 | 37 | ||
38 | for (queue = 0; queue < set->nr_hw_queues; queue++) { | 38 | for (queue = 0; queue < map->nr_queues; queue++) { |
39 | mask = ib_get_vector_affinity(dev, first_vec + queue); | 39 | mask = ib_get_vector_affinity(dev, first_vec + queue); |
40 | if (!mask) | 40 | if (!mask) |
41 | goto fallback; | 41 | goto fallback; |
42 | 42 | ||
43 | for_each_cpu(cpu, mask) | 43 | for_each_cpu(cpu, mask) |
44 | set->mq_map[cpu] = queue; | 44 | map->mq_map[cpu] = map->queue_offset + queue; |
45 | } | 45 | } |
46 | 46 | ||
47 | return 0; | 47 | return 0; |
48 | 48 | ||
49 | fallback: | 49 | fallback: |
50 | return blk_mq_map_queues(set); | 50 | return blk_mq_map_queues(map); |
51 | } | 51 | } |
52 | EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); | 52 | EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); |
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 29bfe8017a2d..140933e4a7d1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c | |||
@@ -31,15 +31,22 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, | |||
31 | } | 31 | } |
32 | EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); | 32 | EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); |
33 | 33 | ||
34 | void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) | 34 | void blk_mq_sched_assign_ioc(struct request *rq) |
35 | { | 35 | { |
36 | struct request_queue *q = rq->q; | 36 | struct request_queue *q = rq->q; |
37 | struct io_context *ioc = rq_ioc(bio); | 37 | struct io_context *ioc; |
38 | struct io_cq *icq; | 38 | struct io_cq *icq; |
39 | 39 | ||
40 | spin_lock_irq(q->queue_lock); | 40 | /* |
41 | * May not have an IO context if it's a passthrough request | ||
42 | */ | ||
43 | ioc = current->io_context; | ||
44 | if (!ioc) | ||
45 | return; | ||
46 | |||
47 | spin_lock_irq(&q->queue_lock); | ||
41 | icq = ioc_lookup_icq(ioc, q); | 48 | icq = ioc_lookup_icq(ioc, q); |
42 | spin_unlock_irq(q->queue_lock); | 49 | spin_unlock_irq(&q->queue_lock); |
43 | 50 | ||
44 | if (!icq) { | 51 | if (!icq) { |
45 | icq = ioc_create_icq(ioc, q, GFP_ATOMIC); | 52 | icq = ioc_create_icq(ioc, q, GFP_ATOMIC); |
@@ -54,13 +61,14 @@ void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) | |||
54 | * Mark a hardware queue as needing a restart. For shared queues, maintain | 61 | * Mark a hardware queue as needing a restart. For shared queues, maintain |
55 | * a count of how many hardware queues are marked for restart. | 62 | * a count of how many hardware queues are marked for restart. |
56 | */ | 63 | */ |
57 | static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) | 64 | void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) |
58 | { | 65 | { |
59 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | 66 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) |
60 | return; | 67 | return; |
61 | 68 | ||
62 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | 69 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); |
63 | } | 70 | } |
71 | EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); | ||
64 | 72 | ||
65 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) | 73 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) |
66 | { | 74 | { |
@@ -85,14 +93,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) | |||
85 | do { | 93 | do { |
86 | struct request *rq; | 94 | struct request *rq; |
87 | 95 | ||
88 | if (e->type->ops.mq.has_work && | 96 | if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) |
89 | !e->type->ops.mq.has_work(hctx)) | ||
90 | break; | 97 | break; |
91 | 98 | ||
92 | if (!blk_mq_get_dispatch_budget(hctx)) | 99 | if (!blk_mq_get_dispatch_budget(hctx)) |
93 | break; | 100 | break; |
94 | 101 | ||
95 | rq = e->type->ops.mq.dispatch_request(hctx); | 102 | rq = e->type->ops.dispatch_request(hctx); |
96 | if (!rq) { | 103 | if (!rq) { |
97 | blk_mq_put_dispatch_budget(hctx); | 104 | blk_mq_put_dispatch_budget(hctx); |
98 | break; | 105 | break; |
@@ -110,7 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) | |||
110 | static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, | 117 | static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, |
111 | struct blk_mq_ctx *ctx) | 118 | struct blk_mq_ctx *ctx) |
112 | { | 119 | { |
113 | unsigned idx = ctx->index_hw; | 120 | unsigned short idx = ctx->index_hw[hctx->type]; |
114 | 121 | ||
115 | if (++idx == hctx->nr_ctx) | 122 | if (++idx == hctx->nr_ctx) |
116 | idx = 0; | 123 | idx = 0; |
@@ -163,7 +170,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | |||
163 | { | 170 | { |
164 | struct request_queue *q = hctx->queue; | 171 | struct request_queue *q = hctx->queue; |
165 | struct elevator_queue *e = q->elevator; | 172 | struct elevator_queue *e = q->elevator; |
166 | const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; | 173 | const bool has_sched_dispatch = e && e->type->ops.dispatch_request; |
167 | LIST_HEAD(rq_list); | 174 | LIST_HEAD(rq_list); |
168 | 175 | ||
169 | /* RCU or SRCU read lock is needed before checking quiesced flag */ | 176 | /* RCU or SRCU read lock is needed before checking quiesced flag */ |
@@ -295,11 +302,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); | |||
295 | * too much time checking for merges. | 302 | * too much time checking for merges. |
296 | */ | 303 | */ |
297 | static bool blk_mq_attempt_merge(struct request_queue *q, | 304 | static bool blk_mq_attempt_merge(struct request_queue *q, |
305 | struct blk_mq_hw_ctx *hctx, | ||
298 | struct blk_mq_ctx *ctx, struct bio *bio) | 306 | struct blk_mq_ctx *ctx, struct bio *bio) |
299 | { | 307 | { |
308 | enum hctx_type type = hctx->type; | ||
309 | |||
300 | lockdep_assert_held(&ctx->lock); | 310 | lockdep_assert_held(&ctx->lock); |
301 | 311 | ||
302 | if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { | 312 | if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) { |
303 | ctx->rq_merged++; | 313 | ctx->rq_merged++; |
304 | return true; | 314 | return true; |
305 | } | 315 | } |
@@ -311,19 +321,21 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) | |||
311 | { | 321 | { |
312 | struct elevator_queue *e = q->elevator; | 322 | struct elevator_queue *e = q->elevator; |
313 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); | 323 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); |
314 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | 324 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu); |
315 | bool ret = false; | 325 | bool ret = false; |
326 | enum hctx_type type; | ||
316 | 327 | ||
317 | if (e && e->type->ops.mq.bio_merge) { | 328 | if (e && e->type->ops.bio_merge) { |
318 | blk_mq_put_ctx(ctx); | 329 | blk_mq_put_ctx(ctx); |
319 | return e->type->ops.mq.bio_merge(hctx, bio); | 330 | return e->type->ops.bio_merge(hctx, bio); |
320 | } | 331 | } |
321 | 332 | ||
333 | type = hctx->type; | ||
322 | if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && | 334 | if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && |
323 | !list_empty_careful(&ctx->rq_list)) { | 335 | !list_empty_careful(&ctx->rq_lists[type])) { |
324 | /* default per sw-queue merge */ | 336 | /* default per sw-queue merge */ |
325 | spin_lock(&ctx->lock); | 337 | spin_lock(&ctx->lock); |
326 | ret = blk_mq_attempt_merge(q, ctx, bio); | 338 | ret = blk_mq_attempt_merge(q, hctx, ctx, bio); |
327 | spin_unlock(&ctx->lock); | 339 | spin_unlock(&ctx->lock); |
328 | } | 340 | } |
329 | 341 | ||
@@ -367,7 +379,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, | |||
367 | struct request_queue *q = rq->q; | 379 | struct request_queue *q = rq->q; |
368 | struct elevator_queue *e = q->elevator; | 380 | struct elevator_queue *e = q->elevator; |
369 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 381 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
370 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | 382 | struct blk_mq_hw_ctx *hctx = rq->mq_hctx; |
371 | 383 | ||
372 | /* flush rq in flush machinery need to be dispatched directly */ | 384 | /* flush rq in flush machinery need to be dispatched directly */ |
373 | if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { | 385 | if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { |
@@ -380,11 +392,11 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, | |||
380 | if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) | 392 | if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) |
381 | goto run; | 393 | goto run; |
382 | 394 | ||
383 | if (e && e->type->ops.mq.insert_requests) { | 395 | if (e && e->type->ops.insert_requests) { |
384 | LIST_HEAD(list); | 396 | LIST_HEAD(list); |
385 | 397 | ||
386 | list_add(&rq->queuelist, &list); | 398 | list_add(&rq->queuelist, &list); |
387 | e->type->ops.mq.insert_requests(hctx, &list, at_head); | 399 | e->type->ops.insert_requests(hctx, &list, at_head); |
388 | } else { | 400 | } else { |
389 | spin_lock(&ctx->lock); | 401 | spin_lock(&ctx->lock); |
390 | __blk_mq_insert_request(hctx, rq, at_head); | 402 | __blk_mq_insert_request(hctx, rq, at_head); |
@@ -396,27 +408,25 @@ run: | |||
396 | blk_mq_run_hw_queue(hctx, async); | 408 | blk_mq_run_hw_queue(hctx, async); |
397 | } | 409 | } |
398 | 410 | ||
399 | void blk_mq_sched_insert_requests(struct request_queue *q, | 411 | void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, |
400 | struct blk_mq_ctx *ctx, | 412 | struct blk_mq_ctx *ctx, |
401 | struct list_head *list, bool run_queue_async) | 413 | struct list_head *list, bool run_queue_async) |
402 | { | 414 | { |
403 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | 415 | struct elevator_queue *e; |
404 | struct elevator_queue *e = hctx->queue->elevator; | ||
405 | 416 | ||
406 | if (e && e->type->ops.mq.insert_requests) | 417 | e = hctx->queue->elevator; |
407 | e->type->ops.mq.insert_requests(hctx, list, false); | 418 | if (e && e->type->ops.insert_requests) |
419 | e->type->ops.insert_requests(hctx, list, false); | ||
408 | else { | 420 | else { |
409 | /* | 421 | /* |
410 | * try to issue requests directly if the hw queue isn't | 422 | * try to issue requests directly if the hw queue isn't |
411 | * busy in case of 'none' scheduler, and this way may save | 423 | * busy in case of 'none' scheduler, and this way may save |
412 | * us one extra enqueue & dequeue to sw queue. | 424 | * us one extra enqueue & dequeue to sw queue. |
413 | */ | 425 | */ |
414 | if (!hctx->dispatch_busy && !e && !run_queue_async) { | 426 | if (!hctx->dispatch_busy && !e && !run_queue_async) |
415 | blk_mq_try_issue_list_directly(hctx, list); | 427 | blk_mq_try_issue_list_directly(hctx, list); |
416 | if (list_empty(list)) | 428 | else |
417 | return; | 429 | blk_mq_insert_requests(hctx, ctx, list); |
418 | } | ||
419 | blk_mq_insert_requests(hctx, ctx, list); | ||
420 | } | 430 | } |
421 | 431 | ||
422 | blk_mq_run_hw_queue(hctx, run_queue_async); | 432 | blk_mq_run_hw_queue(hctx, run_queue_async); |
@@ -489,15 +499,15 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) | |||
489 | goto err; | 499 | goto err; |
490 | } | 500 | } |
491 | 501 | ||
492 | ret = e->ops.mq.init_sched(q, e); | 502 | ret = e->ops.init_sched(q, e); |
493 | if (ret) | 503 | if (ret) |
494 | goto err; | 504 | goto err; |
495 | 505 | ||
496 | blk_mq_debugfs_register_sched(q); | 506 | blk_mq_debugfs_register_sched(q); |
497 | 507 | ||
498 | queue_for_each_hw_ctx(q, hctx, i) { | 508 | queue_for_each_hw_ctx(q, hctx, i) { |
499 | if (e->ops.mq.init_hctx) { | 509 | if (e->ops.init_hctx) { |
500 | ret = e->ops.mq.init_hctx(hctx, i); | 510 | ret = e->ops.init_hctx(hctx, i); |
501 | if (ret) { | 511 | if (ret) { |
502 | eq = q->elevator; | 512 | eq = q->elevator; |
503 | blk_mq_exit_sched(q, eq); | 513 | blk_mq_exit_sched(q, eq); |
@@ -523,14 +533,14 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) | |||
523 | 533 | ||
524 | queue_for_each_hw_ctx(q, hctx, i) { | 534 | queue_for_each_hw_ctx(q, hctx, i) { |
525 | blk_mq_debugfs_unregister_sched_hctx(hctx); | 535 | blk_mq_debugfs_unregister_sched_hctx(hctx); |
526 | if (e->type->ops.mq.exit_hctx && hctx->sched_data) { | 536 | if (e->type->ops.exit_hctx && hctx->sched_data) { |
527 | e->type->ops.mq.exit_hctx(hctx, i); | 537 | e->type->ops.exit_hctx(hctx, i); |
528 | hctx->sched_data = NULL; | 538 | hctx->sched_data = NULL; |
529 | } | 539 | } |
530 | } | 540 | } |
531 | blk_mq_debugfs_unregister_sched(q); | 541 | blk_mq_debugfs_unregister_sched(q); |
532 | if (e->type->ops.mq.exit_sched) | 542 | if (e->type->ops.exit_sched) |
533 | e->type->ops.mq.exit_sched(e); | 543 | e->type->ops.exit_sched(e); |
534 | blk_mq_sched_tags_teardown(q); | 544 | blk_mq_sched_tags_teardown(q); |
535 | q->elevator = NULL; | 545 | q->elevator = NULL; |
536 | } | 546 | } |
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 8a9544203173..c7bdb52367ac 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h | |||
@@ -8,18 +8,19 @@ | |||
8 | void blk_mq_sched_free_hctx_data(struct request_queue *q, | 8 | void blk_mq_sched_free_hctx_data(struct request_queue *q, |
9 | void (*exit)(struct blk_mq_hw_ctx *)); | 9 | void (*exit)(struct blk_mq_hw_ctx *)); |
10 | 10 | ||
11 | void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio); | 11 | void blk_mq_sched_assign_ioc(struct request *rq); |
12 | 12 | ||
13 | void blk_mq_sched_request_inserted(struct request *rq); | 13 | void blk_mq_sched_request_inserted(struct request *rq); |
14 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, | 14 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, |
15 | struct request **merged_request); | 15 | struct request **merged_request); |
16 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); | 16 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); |
17 | bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); | 17 | bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); |
18 | void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); | ||
18 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); | 19 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); |
19 | 20 | ||
20 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, | 21 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, |
21 | bool run_queue, bool async); | 22 | bool run_queue, bool async); |
22 | void blk_mq_sched_insert_requests(struct request_queue *q, | 23 | void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, |
23 | struct blk_mq_ctx *ctx, | 24 | struct blk_mq_ctx *ctx, |
24 | struct list_head *list, bool run_queue_async); | 25 | struct list_head *list, bool run_queue_async); |
25 | 26 | ||
@@ -43,8 +44,8 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, | |||
43 | { | 44 | { |
44 | struct elevator_queue *e = q->elevator; | 45 | struct elevator_queue *e = q->elevator; |
45 | 46 | ||
46 | if (e && e->type->ops.mq.allow_merge) | 47 | if (e && e->type->ops.allow_merge) |
47 | return e->type->ops.mq.allow_merge(q, rq, bio); | 48 | return e->type->ops.allow_merge(q, rq, bio); |
48 | 49 | ||
49 | return true; | 50 | return true; |
50 | } | 51 | } |
@@ -53,8 +54,8 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) | |||
53 | { | 54 | { |
54 | struct elevator_queue *e = rq->q->elevator; | 55 | struct elevator_queue *e = rq->q->elevator; |
55 | 56 | ||
56 | if (e && e->type->ops.mq.completed_request) | 57 | if (e && e->type->ops.completed_request) |
57 | e->type->ops.mq.completed_request(rq, now); | 58 | e->type->ops.completed_request(rq, now); |
58 | } | 59 | } |
59 | 60 | ||
60 | static inline void blk_mq_sched_started_request(struct request *rq) | 61 | static inline void blk_mq_sched_started_request(struct request *rq) |
@@ -62,8 +63,8 @@ static inline void blk_mq_sched_started_request(struct request *rq) | |||
62 | struct request_queue *q = rq->q; | 63 | struct request_queue *q = rq->q; |
63 | struct elevator_queue *e = q->elevator; | 64 | struct elevator_queue *e = q->elevator; |
64 | 65 | ||
65 | if (e && e->type->ops.mq.started_request) | 66 | if (e && e->type->ops.started_request) |
66 | e->type->ops.mq.started_request(rq); | 67 | e->type->ops.started_request(rq); |
67 | } | 68 | } |
68 | 69 | ||
69 | static inline void blk_mq_sched_requeue_request(struct request *rq) | 70 | static inline void blk_mq_sched_requeue_request(struct request *rq) |
@@ -71,16 +72,16 @@ static inline void blk_mq_sched_requeue_request(struct request *rq) | |||
71 | struct request_queue *q = rq->q; | 72 | struct request_queue *q = rq->q; |
72 | struct elevator_queue *e = q->elevator; | 73 | struct elevator_queue *e = q->elevator; |
73 | 74 | ||
74 | if (e && e->type->ops.mq.requeue_request) | 75 | if (e && e->type->ops.requeue_request) |
75 | e->type->ops.mq.requeue_request(rq); | 76 | e->type->ops.requeue_request(rq); |
76 | } | 77 | } |
77 | 78 | ||
78 | static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) | 79 | static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) |
79 | { | 80 | { |
80 | struct elevator_queue *e = hctx->queue->elevator; | 81 | struct elevator_queue *e = hctx->queue->elevator; |
81 | 82 | ||
82 | if (e && e->type->ops.mq.has_work) | 83 | if (e && e->type->ops.has_work) |
83 | return e->type->ops.mq.has_work(hctx); | 84 | return e->type->ops.has_work(hctx); |
84 | 85 | ||
85 | return false; | 86 | return false; |
86 | } | 87 | } |
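Every call site in these two files now reaches the scheduler hooks through e->type->ops.X instead of e->type->ops.mq.X, i.e. elevator_type carries a single mq ops table. A hedged sketch of how a scheduler declares itself under that layout; the example_* callbacks and the exact set of fields a real scheduler fills in are assumptions, only the hook names come from the call sites above.

    static struct elevator_type example_mq_sched = {
            .ops = {
                    .init_sched             = example_init_sched,
                    .exit_sched             = example_exit_sched,
                    .insert_requests        = example_insert_requests,
                    .dispatch_request       = example_dispatch_request,
                    .has_work               = example_has_work,
            },
            .elevator_name  = "example",
            .elevator_owner = THIS_MODULE,
    };

Such a type would then be registered the usual way (via elv_register()) at module init.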
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index aafb44224c89..3f9c3f4ac44c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c | |||
@@ -15,6 +15,18 @@ | |||
15 | 15 | ||
16 | static void blk_mq_sysfs_release(struct kobject *kobj) | 16 | static void blk_mq_sysfs_release(struct kobject *kobj) |
17 | { | 17 | { |
18 | struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj); | ||
19 | |||
20 | free_percpu(ctxs->queue_ctx); | ||
21 | kfree(ctxs); | ||
22 | } | ||
23 | |||
24 | static void blk_mq_ctx_sysfs_release(struct kobject *kobj) | ||
25 | { | ||
26 | struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj); | ||
27 | |||
28 | /* ctx->ctxs won't be released until all ctx are freed */ | ||
29 | kobject_put(&ctx->ctxs->kobj); | ||
18 | } | 30 | } |
19 | 31 | ||
20 | static void blk_mq_hw_sysfs_release(struct kobject *kobj) | 32 | static void blk_mq_hw_sysfs_release(struct kobject *kobj) |
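The two release handlers above split what used to be a single no-op: blk_mq_ctx_sysfs_release() drops each per-cpu ctx's reference on the shared container, and only when the last reference goes does blk_mq_sysfs_release() free the percpu area. The container they operate on is roughly the structure sketched below; the actual struct blk_mq_ctxs definition lives in blk-mq.h and is assumed here.

    struct blk_mq_ctxs {
            struct kobject          kobj;           /* q->mq_kobj points at this */
            struct blk_mq_ctx __percpu *queue_ctx;  /* freed once every ctx kobject is gone */
    };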
@@ -203,7 +215,7 @@ static struct kobj_type blk_mq_ktype = { | |||
203 | static struct kobj_type blk_mq_ctx_ktype = { | 215 | static struct kobj_type blk_mq_ctx_ktype = { |
204 | .sysfs_ops = &blk_mq_sysfs_ops, | 216 | .sysfs_ops = &blk_mq_sysfs_ops, |
205 | .default_attrs = default_ctx_attrs, | 217 | .default_attrs = default_ctx_attrs, |
206 | .release = blk_mq_sysfs_release, | 218 | .release = blk_mq_ctx_sysfs_release, |
207 | }; | 219 | }; |
208 | 220 | ||
209 | static struct kobj_type blk_mq_hw_ktype = { | 221 | static struct kobj_type blk_mq_hw_ktype = { |
@@ -235,7 +247,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) | |||
235 | if (!hctx->nr_ctx) | 247 | if (!hctx->nr_ctx) |
236 | return 0; | 248 | return 0; |
237 | 249 | ||
238 | ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); | 250 | ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num); |
239 | if (ret) | 251 | if (ret) |
240 | return ret; | 252 | return ret; |
241 | 253 | ||
@@ -258,8 +270,8 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) | |||
258 | queue_for_each_hw_ctx(q, hctx, i) | 270 | queue_for_each_hw_ctx(q, hctx, i) |
259 | blk_mq_unregister_hctx(hctx); | 271 | blk_mq_unregister_hctx(hctx); |
260 | 272 | ||
261 | kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); | 273 | kobject_uevent(q->mq_kobj, KOBJ_REMOVE); |
262 | kobject_del(&q->mq_kobj); | 274 | kobject_del(q->mq_kobj); |
263 | kobject_put(&dev->kobj); | 275 | kobject_put(&dev->kobj); |
264 | 276 | ||
265 | q->mq_sysfs_init_done = false; | 277 | q->mq_sysfs_init_done = false; |
@@ -279,7 +291,7 @@ void blk_mq_sysfs_deinit(struct request_queue *q) | |||
279 | ctx = per_cpu_ptr(q->queue_ctx, cpu); | 291 | ctx = per_cpu_ptr(q->queue_ctx, cpu); |
280 | kobject_put(&ctx->kobj); | 292 | kobject_put(&ctx->kobj); |
281 | } | 293 | } |
282 | kobject_put(&q->mq_kobj); | 294 | kobject_put(q->mq_kobj); |
283 | } | 295 | } |
284 | 296 | ||
285 | void blk_mq_sysfs_init(struct request_queue *q) | 297 | void blk_mq_sysfs_init(struct request_queue *q) |
@@ -287,10 +299,12 @@ void blk_mq_sysfs_init(struct request_queue *q) | |||
287 | struct blk_mq_ctx *ctx; | 299 | struct blk_mq_ctx *ctx; |
288 | int cpu; | 300 | int cpu; |
289 | 301 | ||
290 | kobject_init(&q->mq_kobj, &blk_mq_ktype); | 302 | kobject_init(q->mq_kobj, &blk_mq_ktype); |
291 | 303 | ||
292 | for_each_possible_cpu(cpu) { | 304 | for_each_possible_cpu(cpu) { |
293 | ctx = per_cpu_ptr(q->queue_ctx, cpu); | 305 | ctx = per_cpu_ptr(q->queue_ctx, cpu); |
306 | |||
307 | kobject_get(q->mq_kobj); | ||
294 | kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); | 308 | kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); |
295 | } | 309 | } |
296 | } | 310 | } |
@@ -303,11 +317,11 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) | |||
303 | WARN_ON_ONCE(!q->kobj.parent); | 317 | WARN_ON_ONCE(!q->kobj.parent); |
304 | lockdep_assert_held(&q->sysfs_lock); | 318 | lockdep_assert_held(&q->sysfs_lock); |
305 | 319 | ||
306 | ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); | 320 | ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); |
307 | if (ret < 0) | 321 | if (ret < 0) |
308 | goto out; | 322 | goto out; |
309 | 323 | ||
310 | kobject_uevent(&q->mq_kobj, KOBJ_ADD); | 324 | kobject_uevent(q->mq_kobj, KOBJ_ADD); |
311 | 325 | ||
312 | queue_for_each_hw_ctx(q, hctx, i) { | 326 | queue_for_each_hw_ctx(q, hctx, i) { |
313 | ret = blk_mq_register_hctx(hctx); | 327 | ret = blk_mq_register_hctx(hctx); |
@@ -324,8 +338,8 @@ unreg: | |||
324 | while (--i >= 0) | 338 | while (--i >= 0) |
325 | blk_mq_unregister_hctx(q->queue_hw_ctx[i]); | 339 | blk_mq_unregister_hctx(q->queue_hw_ctx[i]); |
326 | 340 | ||
327 | kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); | 341 | kobject_uevent(q->mq_kobj, KOBJ_REMOVE); |
328 | kobject_del(&q->mq_kobj); | 342 | kobject_del(q->mq_kobj); |
329 | kobject_put(&dev->kobj); | 343 | kobject_put(&dev->kobj); |
330 | return ret; | 344 | return ret; |
331 | } | 345 | } |
@@ -340,7 +354,6 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q) | |||
340 | 354 | ||
341 | return ret; | 355 | return ret; |
342 | } | 356 | } |
343 | EXPORT_SYMBOL_GPL(blk_mq_register_dev); | ||
344 | 357 | ||
345 | void blk_mq_sysfs_unregister(struct request_queue *q) | 358 | void blk_mq_sysfs_unregister(struct request_queue *q) |
346 | { | 359 | { |
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index cfda95b85d34..2089c6c62f44 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c | |||
@@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
110 | struct blk_mq_tags *tags = blk_mq_tags_from_data(data); | 110 | struct blk_mq_tags *tags = blk_mq_tags_from_data(data); |
111 | struct sbitmap_queue *bt; | 111 | struct sbitmap_queue *bt; |
112 | struct sbq_wait_state *ws; | 112 | struct sbq_wait_state *ws; |
113 | DEFINE_WAIT(wait); | 113 | DEFINE_SBQ_WAIT(wait); |
114 | unsigned int tag_offset; | 114 | unsigned int tag_offset; |
115 | bool drop_ctx; | 115 | bool drop_ctx; |
116 | int tag; | 116 | int tag; |
@@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
154 | if (tag != -1) | 154 | if (tag != -1) |
155 | break; | 155 | break; |
156 | 156 | ||
157 | prepare_to_wait_exclusive(&ws->wait, &wait, | 157 | sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); |
158 | TASK_UNINTERRUPTIBLE); | ||
159 | 158 | ||
160 | tag = __blk_mq_get_tag(data, bt); | 159 | tag = __blk_mq_get_tag(data, bt); |
161 | if (tag != -1) | 160 | if (tag != -1) |
@@ -167,16 +166,17 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
167 | bt_prev = bt; | 166 | bt_prev = bt; |
168 | io_schedule(); | 167 | io_schedule(); |
169 | 168 | ||
169 | sbitmap_finish_wait(bt, ws, &wait); | ||
170 | |||
170 | data->ctx = blk_mq_get_ctx(data->q); | 171 | data->ctx = blk_mq_get_ctx(data->q); |
171 | data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); | 172 | data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, |
173 | data->ctx->cpu); | ||
172 | tags = blk_mq_tags_from_data(data); | 174 | tags = blk_mq_tags_from_data(data); |
173 | if (data->flags & BLK_MQ_REQ_RESERVED) | 175 | if (data->flags & BLK_MQ_REQ_RESERVED) |
174 | bt = &tags->breserved_tags; | 176 | bt = &tags->breserved_tags; |
175 | else | 177 | else |
176 | bt = &tags->bitmap_tags; | 178 | bt = &tags->bitmap_tags; |
177 | 179 | ||
178 | finish_wait(&ws->wait, &wait); | ||
179 | |||
180 | /* | 180 | /* |
181 | * If destination hw queue is changed, fake wake up on | 181 | * If destination hw queue is changed, fake wake up on |
182 | * previous queue for compensating the wake up miss, so | 182 | * previous queue for compensating the wake up miss, so |
@@ -191,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | |||
191 | if (drop_ctx && data->ctx) | 191 | if (drop_ctx && data->ctx) |
192 | blk_mq_put_ctx(data->ctx); | 192 | blk_mq_put_ctx(data->ctx); |
193 | 193 | ||
194 | finish_wait(&ws->wait, &wait); | 194 | sbitmap_finish_wait(bt, ws, &wait); |
195 | 195 | ||
196 | found_tag: | 196 | found_tag: |
197 | return tag + tag_offset; | 197 | return tag + tag_offset; |
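The tag allocator now parks on the sbitmap_queue with the dedicated sbq helpers instead of raw prepare_to_wait_exclusive()/finish_wait(). Stripped of the blk-mq specifics, the waiting pattern used above boils down to the sketch below; example_try_get() is a hypothetical stand-in for __blk_mq_get_tag(), returning a free bit or -1.

    static int example_try_get(struct sbitmap_queue *bt);

    static int example_wait_for_bit(struct sbitmap_queue *bt,
                                    struct sbq_wait_state *ws)
    {
            DEFINE_SBQ_WAIT(wait);
            int nr;

            do {
                    nr = example_try_get(bt);
                    if (nr != -1)
                            break;

                    sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

                    /* recheck after queueing the waiter to close the race */
                    nr = example_try_get(bt);
                    if (nr != -1)
                            break;

                    io_schedule();
                    sbitmap_finish_wait(bt, ws, &wait);
            } while (1);

            sbitmap_finish_wait(bt, ws, &wait);
            return nr;
    }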
@@ -235,7 +235,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) | |||
235 | * test and set the bit before assigning ->rqs[]. | 235 | * test and set the bit before assigning ->rqs[]. |
236 | */ | 236 | */ |
237 | if (rq && rq->q == hctx->queue) | 237 | if (rq && rq->q == hctx->queue) |
238 | iter_data->fn(hctx, rq, iter_data->data, reserved); | 238 | return iter_data->fn(hctx, rq, iter_data->data, reserved); |
239 | return true; | 239 | return true; |
240 | } | 240 | } |
241 | 241 | ||
@@ -247,7 +247,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) | |||
247 | * @fn: Pointer to the function that will be called for each request | 247 | * @fn: Pointer to the function that will be called for each request |
248 | * associated with @hctx that has been assigned a driver tag. | 248 | * associated with @hctx that has been assigned a driver tag. |
249 | * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) | 249 | * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) |
250 | * where rq is a pointer to a request. | 250 | * where rq is a pointer to a request. Return true to continue |
251 | * iterating tags, false to stop. | ||
251 | * @data: Will be passed as third argument to @fn. | 252 | * @data: Will be passed as third argument to @fn. |
252 | * @reserved: Indicates whether @bt is the breserved_tags member or the | 253 | * @reserved: Indicates whether @bt is the breserved_tags member or the |
253 | * bitmap_tags member of struct blk_mq_tags. | 254 | * bitmap_tags member of struct blk_mq_tags. |
@@ -288,7 +289,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) | |||
288 | */ | 289 | */ |
289 | rq = tags->rqs[bitnr]; | 290 | rq = tags->rqs[bitnr]; |
290 | if (rq && blk_mq_request_started(rq)) | 291 | if (rq && blk_mq_request_started(rq)) |
291 | iter_data->fn(rq, iter_data->data, reserved); | 292 | return iter_data->fn(rq, iter_data->data, reserved); |
292 | 293 | ||
293 | return true; | 294 | return true; |
294 | } | 295 | } |
@@ -300,7 +301,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) | |||
300 | * or the bitmap_tags member of struct blk_mq_tags. | 301 | * or the bitmap_tags member of struct blk_mq_tags. |
301 | * @fn: Pointer to the function that will be called for each started | 302 | * @fn: Pointer to the function that will be called for each started |
302 | * request. @fn will be called as follows: @fn(rq, @data, | 303 | * request. @fn will be called as follows: @fn(rq, @data, |
303 | * @reserved) where rq is a pointer to a request. | 304 | * @reserved) where rq is a pointer to a request. Return true |
305 | * to continue iterating tags, false to stop. | ||
304 | * @data: Will be passed as second argument to @fn. | 306 | * @data: Will be passed as second argument to @fn. |
305 | * @reserved: Indicates whether @bt is the breserved_tags member or the | 307 | * @reserved: Indicates whether @bt is the breserved_tags member or the |
306 | * bitmap_tags member of struct blk_mq_tags. | 308 | * bitmap_tags member of struct blk_mq_tags. |
@@ -325,7 +327,8 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, | |||
325 | * @fn: Pointer to the function that will be called for each started | 327 | * @fn: Pointer to the function that will be called for each started |
326 | * request. @fn will be called as follows: @fn(rq, @priv, | 328 | * request. @fn will be called as follows: @fn(rq, @priv, |
327 | * reserved) where rq is a pointer to a request. 'reserved' | 329 | * reserved) where rq is a pointer to a request. 'reserved' |
328 | * indicates whether or not @rq is a reserved request. | 330 | * indicates whether or not @rq is a reserved request. Return |
331 | * true to continue iterating tags, false to stop. | ||
329 | * @priv: Will be passed as second argument to @fn. | 332 | * @priv: Will be passed as second argument to @fn. |
330 | */ | 333 | */ |
331 | static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, | 334 | static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, |
@@ -342,7 +345,8 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, | |||
342 | * @fn: Pointer to the function that will be called for each started | 345 | * @fn: Pointer to the function that will be called for each started |
343 | * request. @fn will be called as follows: @fn(rq, @priv, | 346 | * request. @fn will be called as follows: @fn(rq, @priv, |
344 | * reserved) where rq is a pointer to a request. 'reserved' | 347 | * reserved) where rq is a pointer to a request. 'reserved' |
345 | * indicates whether or not @rq is a reserved request. | 348 | * indicates whether or not @rq is a reserved request. Return |
349 | * true to continue iterating tags, false to stop. | ||
346 | * @priv: Will be passed as second argument to @fn. | 350 | * @priv: Will be passed as second argument to @fn. |
347 | */ | 351 | */ |
348 | void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, | 352 | void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, |
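As the updated comments above document, the iteration callbacks now return bool and bt_iter()/bt_tags_iter() propagate that value, so a caller can terminate the walk early. A hedged sketch of a callback under the new contract; struct example_lookup and example_find_rq() are hypothetical, only the callback signature and blk_mq_tagset_busy_iter() come from this file.

    struct example_lookup {
            int             wanted_tag;
            struct request  *found;
    };

    static bool example_find_rq(struct request *rq, void *priv, bool reserved)
    {
            struct example_lookup *lookup = priv;

            if (rq->tag != lookup->wanted_tag)
                    return true;            /* keep iterating */

            lookup->found = rq;
            return false;                   /* stop: we have what we need */
    }

A driver would then call blk_mq_tagset_busy_iter(set, example_find_rq, &lookup) and stop scanning the tag space as soon as the request is found.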
@@ -526,16 +530,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, | |||
526 | */ | 530 | */ |
527 | u32 blk_mq_unique_tag(struct request *rq) | 531 | u32 blk_mq_unique_tag(struct request *rq) |
528 | { | 532 | { |
529 | struct request_queue *q = rq->q; | 533 | return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | |
530 | struct blk_mq_hw_ctx *hctx; | ||
531 | int hwq = 0; | ||
532 | |||
533 | if (q->mq_ops) { | ||
534 | hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); | ||
535 | hwq = hctx->queue_num; | ||
536 | } | ||
537 | |||
538 | return (hwq << BLK_MQ_UNIQUE_TAG_BITS) | | ||
539 | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); | 534 | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); |
540 | } | 535 | } |
541 | EXPORT_SYMBOL(blk_mq_unique_tag); | 536 | EXPORT_SYMBOL(blk_mq_unique_tag); |
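blk_mq_unique_tag() now packs rq->mq_hctx->queue_num and rq->tag directly, without re-deriving the hctx from the ctx cpu. Splitting the value back apart is just the inverse shift and mask, sketched below; the dedicated blk_mq_unique_tag_to_hwq()/_to_tag() helpers in blk-mq.h do the same thing and are assumed here.

    static void example_decode_unique_tag(struct request *rq)
    {
            u32 unique = blk_mq_unique_tag(rq);
            u16 hwq = unique >> BLK_MQ_UNIQUE_TAG_BITS;     /* == rq->mq_hctx->queue_num */
            u16 tag = unique & BLK_MQ_UNIQUE_TAG_MASK;      /* == rq->tag */

            pr_debug("hwq %u tag %u\n", hwq, tag);
    }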
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index c3afbca11299..370827163835 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c | |||
@@ -29,7 +29,7 @@ | |||
29 | * that maps a queue to the CPUs that have irq affinity for the corresponding | 29 | * that maps a queue to the CPUs that have irq affinity for the corresponding |
30 | * vector. | 30 | * vector. |
31 | */ | 31 | */ |
32 | int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, | 32 | int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, |
33 | struct virtio_device *vdev, int first_vec) | 33 | struct virtio_device *vdev, int first_vec) |
34 | { | 34 | { |
35 | const struct cpumask *mask; | 35 | const struct cpumask *mask; |
@@ -38,17 +38,17 @@ int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, | |||
38 | if (!vdev->config->get_vq_affinity) | 38 | if (!vdev->config->get_vq_affinity) |
39 | goto fallback; | 39 | goto fallback; |
40 | 40 | ||
41 | for (queue = 0; queue < set->nr_hw_queues; queue++) { | 41 | for (queue = 0; queue < qmap->nr_queues; queue++) { |
42 | mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); | 42 | mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); |
43 | if (!mask) | 43 | if (!mask) |
44 | goto fallback; | 44 | goto fallback; |
45 | 45 | ||
46 | for_each_cpu(cpu, mask) | 46 | for_each_cpu(cpu, mask) |
47 | set->mq_map[cpu] = queue; | 47 | qmap->mq_map[cpu] = qmap->queue_offset + queue; |
48 | } | 48 | } |
49 | 49 | ||
50 | return 0; | 50 | return 0; |
51 | fallback: | 51 | fallback: |
52 | return blk_mq_map_queues(set); | 52 | return blk_mq_map_queues(qmap); |
53 | } | 53 | } |
54 | EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); | 54 | EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); |
diff --git a/block/blk-mq.c b/block/blk-mq.c index 3f91c6e5b17a..3ba37b9e15e9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -38,7 +38,6 @@ | |||
38 | #include "blk-mq-sched.h" | 38 | #include "blk-mq-sched.h" |
39 | #include "blk-rq-qos.h" | 39 | #include "blk-rq-qos.h" |
40 | 40 | ||
41 | static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); | ||
42 | static void blk_mq_poll_stats_start(struct request_queue *q); | 41 | static void blk_mq_poll_stats_start(struct request_queue *q); |
43 | static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); | 42 | static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); |
44 | 43 | ||
@@ -75,14 +74,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) | |||
75 | static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, | 74 | static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, |
76 | struct blk_mq_ctx *ctx) | 75 | struct blk_mq_ctx *ctx) |
77 | { | 76 | { |
78 | if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) | 77 | const int bit = ctx->index_hw[hctx->type]; |
79 | sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); | 78 | |
79 | if (!sbitmap_test_bit(&hctx->ctx_map, bit)) | ||
80 | sbitmap_set_bit(&hctx->ctx_map, bit); | ||
80 | } | 81 | } |
81 | 82 | ||
82 | static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, | 83 | static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, |
83 | struct blk_mq_ctx *ctx) | 84 | struct blk_mq_ctx *ctx) |
84 | { | 85 | { |
85 | sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); | 86 | const int bit = ctx->index_hw[hctx->type]; |
87 | |||
88 | sbitmap_clear_bit(&hctx->ctx_map, bit); | ||
86 | } | 89 | } |
87 | 90 | ||
88 | struct mq_inflight { | 91 | struct mq_inflight { |
@@ -90,33 +93,33 @@ struct mq_inflight { | |||
90 | unsigned int *inflight; | 93 | unsigned int *inflight; |
91 | }; | 94 | }; |
92 | 95 | ||
93 | static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, | 96 | static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, |
94 | struct request *rq, void *priv, | 97 | struct request *rq, void *priv, |
95 | bool reserved) | 98 | bool reserved) |
96 | { | 99 | { |
97 | struct mq_inflight *mi = priv; | 100 | struct mq_inflight *mi = priv; |
98 | 101 | ||
99 | /* | 102 | /* |
100 | * index[0] counts the specific partition that was asked for. index[1] | 103 | * index[0] counts the specific partition that was asked for. |
101 | * counts the ones that are active on the whole device, so increment | ||
102 | * that if mi->part is indeed a partition, and not a whole device. | ||
103 | */ | 104 | */ |
104 | if (rq->part == mi->part) | 105 | if (rq->part == mi->part) |
105 | mi->inflight[0]++; | 106 | mi->inflight[0]++; |
106 | if (mi->part->partno) | 107 | |
107 | mi->inflight[1]++; | 108 | return true; |
108 | } | 109 | } |
109 | 110 | ||
110 | void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, | 111 | unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) |
111 | unsigned int inflight[2]) | ||
112 | { | 112 | { |
113 | unsigned inflight[2]; | ||
113 | struct mq_inflight mi = { .part = part, .inflight = inflight, }; | 114 | struct mq_inflight mi = { .part = part, .inflight = inflight, }; |
114 | 115 | ||
115 | inflight[0] = inflight[1] = 0; | 116 | inflight[0] = inflight[1] = 0; |
116 | blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); | 117 | blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); |
118 | |||
119 | return inflight[0]; | ||
117 | } | 120 | } |
118 | 121 | ||
119 | static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, | 122 | static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, |
120 | struct request *rq, void *priv, | 123 | struct request *rq, void *priv, |
121 | bool reserved) | 124 | bool reserved) |
122 | { | 125 | { |
@@ -124,6 +127,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, | |||
124 | 127 | ||
125 | if (rq->part == mi->part) | 128 | if (rq->part == mi->part) |
126 | mi->inflight[rq_data_dir(rq)]++; | 129 | mi->inflight[rq_data_dir(rq)]++; |
130 | |||
131 | return true; | ||
127 | } | 132 | } |
128 | 133 | ||
129 | void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, | 134 | void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, |
@@ -142,7 +147,7 @@ void blk_freeze_queue_start(struct request_queue *q) | |||
142 | freeze_depth = atomic_inc_return(&q->mq_freeze_depth); | 147 | freeze_depth = atomic_inc_return(&q->mq_freeze_depth); |
143 | if (freeze_depth == 1) { | 148 | if (freeze_depth == 1) { |
144 | percpu_ref_kill(&q->q_usage_counter); | 149 | percpu_ref_kill(&q->q_usage_counter); |
145 | if (q->mq_ops) | 150 | if (queue_is_mq(q)) |
146 | blk_mq_run_hw_queues(q, false); | 151 | blk_mq_run_hw_queues(q, false); |
147 | } | 152 | } |
148 | } | 153 | } |
@@ -177,8 +182,6 @@ void blk_freeze_queue(struct request_queue *q) | |||
177 | * exported to drivers as the only user for unfreeze is blk_mq. | 182 | * exported to drivers as the only user for unfreeze is blk_mq. |
178 | */ | 183 | */ |
179 | blk_freeze_queue_start(q); | 184 | blk_freeze_queue_start(q); |
180 | if (!q->mq_ops) | ||
181 | blk_drain_queue(q); | ||
182 | blk_mq_freeze_queue_wait(q); | 185 | blk_mq_freeze_queue_wait(q); |
183 | } | 186 | } |
184 | 187 | ||
@@ -275,6 +278,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) | |||
275 | } | 278 | } |
276 | EXPORT_SYMBOL(blk_mq_can_queue); | 279 | EXPORT_SYMBOL(blk_mq_can_queue); |
277 | 280 | ||
281 | /* | ||
282 | * Only need start/end time stamping if we have stats enabled, or using | ||
283 | * an IO scheduler. | ||
284 | */ | ||
285 | static inline bool blk_mq_need_time_stamp(struct request *rq) | ||
286 | { | ||
287 | return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; | ||
288 | } | ||
289 | |||
278 | static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | 290 | static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, |
279 | unsigned int tag, unsigned int op) | 291 | unsigned int tag, unsigned int op) |
280 | { | 292 | { |
@@ -298,8 +310,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
298 | /* csd/requeue_work/fifo_time is initialized before use */ | 310 | /* csd/requeue_work/fifo_time is initialized before use */ |
299 | rq->q = data->q; | 311 | rq->q = data->q; |
300 | rq->mq_ctx = data->ctx; | 312 | rq->mq_ctx = data->ctx; |
313 | rq->mq_hctx = data->hctx; | ||
301 | rq->rq_flags = rq_flags; | 314 | rq->rq_flags = rq_flags; |
302 | rq->cpu = -1; | ||
303 | rq->cmd_flags = op; | 315 | rq->cmd_flags = op; |
304 | if (data->flags & BLK_MQ_REQ_PREEMPT) | 316 | if (data->flags & BLK_MQ_REQ_PREEMPT) |
305 | rq->rq_flags |= RQF_PREEMPT; | 317 | rq->rq_flags |= RQF_PREEMPT; |
@@ -310,7 +322,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
310 | RB_CLEAR_NODE(&rq->rb_node); | 322 | RB_CLEAR_NODE(&rq->rb_node); |
311 | rq->rq_disk = NULL; | 323 | rq->rq_disk = NULL; |
312 | rq->part = NULL; | 324 | rq->part = NULL; |
313 | rq->start_time_ns = ktime_get_ns(); | 325 | if (blk_mq_need_time_stamp(rq)) |
326 | rq->start_time_ns = ktime_get_ns(); | ||
327 | else | ||
328 | rq->start_time_ns = 0; | ||
314 | rq->io_start_time_ns = 0; | 329 | rq->io_start_time_ns = 0; |
315 | rq->nr_phys_segments = 0; | 330 | rq->nr_phys_segments = 0; |
316 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 331 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
@@ -319,27 +334,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
319 | rq->special = NULL; | 334 | rq->special = NULL; |
320 | /* tag was already set */ | 335 | /* tag was already set */ |
321 | rq->extra_len = 0; | 336 | rq->extra_len = 0; |
322 | rq->__deadline = 0; | 337 | WRITE_ONCE(rq->deadline, 0); |
323 | 338 | ||
324 | INIT_LIST_HEAD(&rq->timeout_list); | ||
325 | rq->timeout = 0; | 339 | rq->timeout = 0; |
326 | 340 | ||
327 | rq->end_io = NULL; | 341 | rq->end_io = NULL; |
328 | rq->end_io_data = NULL; | 342 | rq->end_io_data = NULL; |
329 | rq->next_rq = NULL; | 343 | rq->next_rq = NULL; |
330 | 344 | ||
331 | #ifdef CONFIG_BLK_CGROUP | ||
332 | rq->rl = NULL; | ||
333 | #endif | ||
334 | |||
335 | data->ctx->rq_dispatched[op_is_sync(op)]++; | 345 | data->ctx->rq_dispatched[op_is_sync(op)]++; |
336 | refcount_set(&rq->ref, 1); | 346 | refcount_set(&rq->ref, 1); |
337 | return rq; | 347 | return rq; |
338 | } | 348 | } |
339 | 349 | ||
340 | static struct request *blk_mq_get_request(struct request_queue *q, | 350 | static struct request *blk_mq_get_request(struct request_queue *q, |
341 | struct bio *bio, unsigned int op, | 351 | struct bio *bio, |
342 | struct blk_mq_alloc_data *data) | 352 | struct blk_mq_alloc_data *data) |
343 | { | 353 | { |
344 | struct elevator_queue *e = q->elevator; | 354 | struct elevator_queue *e = q->elevator; |
345 | struct request *rq; | 355 | struct request *rq; |
@@ -353,8 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
353 | put_ctx_on_error = true; | 363 | put_ctx_on_error = true; |
354 | } | 364 | } |
355 | if (likely(!data->hctx)) | 365 | if (likely(!data->hctx)) |
356 | data->hctx = blk_mq_map_queue(q, data->ctx->cpu); | 366 | data->hctx = blk_mq_map_queue(q, data->cmd_flags, |
357 | if (op & REQ_NOWAIT) | 367 | data->ctx->cpu); |
368 | if (data->cmd_flags & REQ_NOWAIT) | ||
358 | data->flags |= BLK_MQ_REQ_NOWAIT; | 369 | data->flags |= BLK_MQ_REQ_NOWAIT; |
359 | 370 | ||
360 | if (e) { | 371 | if (e) { |
@@ -365,9 +376,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
365 | * dispatch list. Don't include reserved tags in the | 376 | * dispatch list. Don't include reserved tags in the |
366 | * limiting, as it isn't useful. | 377 | * limiting, as it isn't useful. |
367 | */ | 378 | */ |
368 | if (!op_is_flush(op) && e->type->ops.mq.limit_depth && | 379 | if (!op_is_flush(data->cmd_flags) && |
380 | e->type->ops.limit_depth && | ||
369 | !(data->flags & BLK_MQ_REQ_RESERVED)) | 381 | !(data->flags & BLK_MQ_REQ_RESERVED)) |
370 | e->type->ops.mq.limit_depth(op, data); | 382 | e->type->ops.limit_depth(data->cmd_flags, data); |
371 | } else { | 383 | } else { |
372 | blk_mq_tag_busy(data->hctx); | 384 | blk_mq_tag_busy(data->hctx); |
373 | } | 385 | } |
@@ -382,14 +394,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
382 | return NULL; | 394 | return NULL; |
383 | } | 395 | } |
384 | 396 | ||
385 | rq = blk_mq_rq_ctx_init(data, tag, op); | 397 | rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); |
386 | if (!op_is_flush(op)) { | 398 | if (!op_is_flush(data->cmd_flags)) { |
387 | rq->elv.icq = NULL; | 399 | rq->elv.icq = NULL; |
388 | if (e && e->type->ops.mq.prepare_request) { | 400 | if (e && e->type->ops.prepare_request) { |
389 | if (e->type->icq_cache && rq_ioc(bio)) | 401 | if (e->type->icq_cache) |
390 | blk_mq_sched_assign_ioc(rq, bio); | 402 | blk_mq_sched_assign_ioc(rq); |
391 | 403 | ||
392 | e->type->ops.mq.prepare_request(rq, bio); | 404 | e->type->ops.prepare_request(rq, bio); |
393 | rq->rq_flags |= RQF_ELVPRIV; | 405 | rq->rq_flags |= RQF_ELVPRIV; |
394 | } | 406 | } |
395 | } | 407 | } |
@@ -400,7 +412,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
400 | struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, | 412 | struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, |
401 | blk_mq_req_flags_t flags) | 413 | blk_mq_req_flags_t flags) |
402 | { | 414 | { |
403 | struct blk_mq_alloc_data alloc_data = { .flags = flags }; | 415 | struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; |
404 | struct request *rq; | 416 | struct request *rq; |
405 | int ret; | 417 | int ret; |
406 | 418 | ||
@@ -408,7 +420,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, | |||
408 | if (ret) | 420 | if (ret) |
409 | return ERR_PTR(ret); | 421 | return ERR_PTR(ret); |
410 | 422 | ||
411 | rq = blk_mq_get_request(q, NULL, op, &alloc_data); | 423 | rq = blk_mq_get_request(q, NULL, &alloc_data); |
412 | blk_queue_exit(q); | 424 | blk_queue_exit(q); |
413 | 425 | ||
414 | if (!rq) | 426 | if (!rq) |
@@ -426,7 +438,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request); | |||
426 | struct request *blk_mq_alloc_request_hctx(struct request_queue *q, | 438 | struct request *blk_mq_alloc_request_hctx(struct request_queue *q, |
427 | unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) | 439 | unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) |
428 | { | 440 | { |
429 | struct blk_mq_alloc_data alloc_data = { .flags = flags }; | 441 | struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; |
430 | struct request *rq; | 442 | struct request *rq; |
431 | unsigned int cpu; | 443 | unsigned int cpu; |
432 | int ret; | 444 | int ret; |
@@ -459,7 +471,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, | |||
459 | cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); | 471 | cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); |
460 | alloc_data.ctx = __blk_mq_get_ctx(q, cpu); | 472 | alloc_data.ctx = __blk_mq_get_ctx(q, cpu); |
461 | 473 | ||
462 | rq = blk_mq_get_request(q, NULL, op, &alloc_data); | 474 | rq = blk_mq_get_request(q, NULL, &alloc_data); |
463 | blk_queue_exit(q); | 475 | blk_queue_exit(q); |
464 | 476 | ||
465 | if (!rq) | 477 | if (!rq) |
@@ -473,10 +485,11 @@ static void __blk_mq_free_request(struct request *rq) | |||
473 | { | 485 | { |
474 | struct request_queue *q = rq->q; | 486 | struct request_queue *q = rq->q; |
475 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 487 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
476 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | 488 | struct blk_mq_hw_ctx *hctx = rq->mq_hctx; |
477 | const int sched_tag = rq->internal_tag; | 489 | const int sched_tag = rq->internal_tag; |
478 | 490 | ||
479 | blk_pm_mark_last_busy(rq); | 491 | blk_pm_mark_last_busy(rq); |
492 | rq->mq_hctx = NULL; | ||
480 | if (rq->tag != -1) | 493 | if (rq->tag != -1) |
481 | blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); | 494 | blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); |
482 | if (sched_tag != -1) | 495 | if (sched_tag != -1) |
@@ -490,11 +503,11 @@ void blk_mq_free_request(struct request *rq) | |||
490 | struct request_queue *q = rq->q; | 503 | struct request_queue *q = rq->q; |
491 | struct elevator_queue *e = q->elevator; | 504 | struct elevator_queue *e = q->elevator; |
492 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 505 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
493 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | 506 | struct blk_mq_hw_ctx *hctx = rq->mq_hctx; |
494 | 507 | ||
495 | if (rq->rq_flags & RQF_ELVPRIV) { | 508 | if (rq->rq_flags & RQF_ELVPRIV) { |
496 | if (e && e->type->ops.mq.finish_request) | 509 | if (e && e->type->ops.finish_request) |
497 | e->type->ops.mq.finish_request(rq); | 510 | e->type->ops.finish_request(rq); |
498 | if (rq->elv.icq) { | 511 | if (rq->elv.icq) { |
499 | put_io_context(rq->elv.icq->ioc); | 512 | put_io_context(rq->elv.icq->ioc); |
500 | rq->elv.icq = NULL; | 513 | rq->elv.icq = NULL; |
@@ -510,9 +523,6 @@ void blk_mq_free_request(struct request *rq) | |||
510 | 523 | ||
511 | rq_qos_done(q, rq); | 524 | rq_qos_done(q, rq); |
512 | 525 | ||
513 | if (blk_rq_rl(rq)) | ||
514 | blk_put_rl(blk_rq_rl(rq)); | ||
515 | |||
516 | WRITE_ONCE(rq->state, MQ_RQ_IDLE); | 526 | WRITE_ONCE(rq->state, MQ_RQ_IDLE); |
517 | if (refcount_dec_and_test(&rq->ref)) | 527 | if (refcount_dec_and_test(&rq->ref)) |
518 | __blk_mq_free_request(rq); | 528 | __blk_mq_free_request(rq); |
@@ -521,7 +531,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request); | |||
521 | 531 | ||
522 | inline void __blk_mq_end_request(struct request *rq, blk_status_t error) | 532 | inline void __blk_mq_end_request(struct request *rq, blk_status_t error) |
523 | { | 533 | { |
524 | u64 now = ktime_get_ns(); | 534 | u64 now = 0; |
535 | |||
536 | if (blk_mq_need_time_stamp(rq)) | ||
537 | now = ktime_get_ns(); | ||
525 | 538 | ||
526 | if (rq->rq_flags & RQF_STATS) { | 539 | if (rq->rq_flags & RQF_STATS) { |
527 | blk_mq_poll_stats_start(rq->q); | 540 | blk_mq_poll_stats_start(rq->q); |
@@ -555,19 +568,19 @@ EXPORT_SYMBOL(blk_mq_end_request); | |||
555 | static void __blk_mq_complete_request_remote(void *data) | 568 | static void __blk_mq_complete_request_remote(void *data) |
556 | { | 569 | { |
557 | struct request *rq = data; | 570 | struct request *rq = data; |
571 | struct request_queue *q = rq->q; | ||
558 | 572 | ||
559 | rq->q->softirq_done_fn(rq); | 573 | q->mq_ops->complete(rq); |
560 | } | 574 | } |
561 | 575 | ||
562 | static void __blk_mq_complete_request(struct request *rq) | 576 | static void __blk_mq_complete_request(struct request *rq) |
563 | { | 577 | { |
564 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 578 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
579 | struct request_queue *q = rq->q; | ||
565 | bool shared = false; | 580 | bool shared = false; |
566 | int cpu; | 581 | int cpu; |
567 | 582 | ||
568 | if (!blk_mq_mark_complete(rq)) | 583 | WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); |
569 | return; | ||
570 | |||
571 | /* | 584 | /* |
572 | * Most of single queue controllers, there is only one irq vector | 585 | * Most of single queue controllers, there is only one irq vector |
573 | * for handling IO completion, and the only irq's affinity is set | 586 | * for handling IO completion, and the only irq's affinity is set |
@@ -577,18 +590,23 @@ static void __blk_mq_complete_request(struct request *rq) | |||
577 | * So complete IO request in softirq context in case of single queue | 590 | * So complete IO request in softirq context in case of single queue |
578 | * for not degrading IO performance by irqsoff latency. | 591 | * for not degrading IO performance by irqsoff latency. |
579 | */ | 592 | */ |
580 | if (rq->q->nr_hw_queues == 1) { | 593 | if (q->nr_hw_queues == 1) { |
581 | __blk_complete_request(rq); | 594 | __blk_complete_request(rq); |
582 | return; | 595 | return; |
583 | } | 596 | } |
584 | 597 | ||
585 | if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { | 598 | /* |
586 | rq->q->softirq_done_fn(rq); | 599 | * For a polled request, always complete locally; it's pointless |
600 | * to redirect the completion. | ||
601 | */ | ||
602 | if ((rq->cmd_flags & REQ_HIPRI) || | ||
603 | !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { | ||
604 | q->mq_ops->complete(rq); | ||
587 | return; | 605 | return; |
588 | } | 606 | } |
589 | 607 | ||
590 | cpu = get_cpu(); | 608 | cpu = get_cpu(); |
591 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) | 609 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) |
592 | shared = cpus_share_cache(cpu, ctx->cpu); | 610 | shared = cpus_share_cache(cpu, ctx->cpu); |
593 | 611 | ||
594 | if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { | 612 | if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { |
@@ -597,7 +615,7 @@ static void __blk_mq_complete_request(struct request *rq) | |||
597 | rq->csd.flags = 0; | 615 | rq->csd.flags = 0; |
598 | smp_call_function_single_async(ctx->cpu, &rq->csd); | 616 | smp_call_function_single_async(ctx->cpu, &rq->csd); |
599 | } else { | 617 | } else { |
600 | rq->q->softirq_done_fn(rq); | 618 | q->mq_ops->complete(rq); |
601 | } | 619 | } |
602 | put_cpu(); | 620 | put_cpu(); |
603 | } | 621 | } |
@@ -630,11 +648,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) | |||
630 | * Ends all I/O on a request. It does not handle partial completions. | 648 | * Ends all I/O on a request. It does not handle partial completions. |
631 | * The actual completion happens out-of-order, through a IPI handler. | 649 | * The actual completion happens out-of-order, through a IPI handler. |
632 | **/ | 650 | **/ |
633 | void blk_mq_complete_request(struct request *rq) | 651 | bool blk_mq_complete_request(struct request *rq) |
634 | { | 652 | { |
635 | if (unlikely(blk_should_fake_timeout(rq->q))) | 653 | if (unlikely(blk_should_fake_timeout(rq->q))) |
636 | return; | 654 | return false; |
637 | __blk_mq_complete_request(rq); | 655 | __blk_mq_complete_request(rq); |
656 | return true; | ||
638 | } | 657 | } |
639 | EXPORT_SYMBOL(blk_mq_complete_request); | 658 | EXPORT_SYMBOL(blk_mq_complete_request); |
640 | 659 | ||
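blk_mq_complete_request() now reports whether the completion was actually started; it returns false only when fault injection fakes a timeout. A minimal driver-side sketch of consuming that return value follows (the mydrv_* name is an illustrative assumption, not part of this patch):

#include <linux/blk-mq.h>
#include <linux/printk.h>

/* Hypothetical completion path for one command: hand the request to the
 * block layer; a false return means a fake timeout was injected and the
 * timeout handler now owns the request.
 */
static void mydrv_complete_cmd(struct request *rq)
{
	if (!blk_mq_complete_request(rq))
		pr_debug("mydrv: completion deferred to timeout handling, tag %d\n",
			 rq->tag);
}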
@@ -701,7 +720,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) | |||
701 | /* this request will be re-inserted to io scheduler queue */ | 720 | /* this request will be re-inserted to io scheduler queue */ |
702 | blk_mq_sched_requeue_request(rq); | 721 | blk_mq_sched_requeue_request(rq); |
703 | 722 | ||
704 | BUG_ON(blk_queued_rq(rq)); | 723 | BUG_ON(!list_empty(&rq->queuelist)); |
705 | blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); | 724 | blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); |
706 | } | 725 | } |
707 | EXPORT_SYMBOL(blk_mq_requeue_request); | 726 | EXPORT_SYMBOL(blk_mq_requeue_request); |
@@ -786,6 +805,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) | |||
786 | } | 805 | } |
787 | EXPORT_SYMBOL(blk_mq_tag_to_rq); | 806 | EXPORT_SYMBOL(blk_mq_tag_to_rq); |
788 | 807 | ||
808 | static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, | ||
809 | void *priv, bool reserved) | ||
810 | { | ||
811 | /* | ||
812 | * If we find a request that is inflight and the queue matches, | ||
813 | * we know the queue is busy. Return false to stop the iteration. | ||
814 | */ | ||
815 | if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { | ||
816 | bool *busy = priv; | ||
817 | |||
818 | *busy = true; | ||
819 | return false; | ||
820 | } | ||
821 | |||
822 | return true; | ||
823 | } | ||
824 | |||
825 | bool blk_mq_queue_inflight(struct request_queue *q) | ||
826 | { | ||
827 | bool busy = false; | ||
828 | |||
829 | blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); | ||
830 | return busy; | ||
831 | } | ||
832 | EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); | ||
833 | |||
789 | static void blk_mq_rq_timed_out(struct request *req, bool reserved) | 834 | static void blk_mq_rq_timed_out(struct request *req, bool reserved) |
790 | { | 835 | { |
791 | req->rq_flags |= RQF_TIMED_OUT; | 836 | req->rq_flags |= RQF_TIMED_OUT; |
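The blk_mq_queue_inflight() helper added above walks the queue's busy tags via blk_mq_queue_tag_busy_iter() and stops at the first request in MQ_RQ_IN_FLIGHT state that belongs to this queue. A hedged sketch of a possible caller (the idle-check use case and name are assumptions, not taken from this patch):

#include <linux/blk-mq.h>

/* Illustrative only: report whether the queue has gone idle, e.g. before
 * allowing a power-saving transition.
 */
static bool mydrv_queue_idle(struct request_queue *q)
{
	return !blk_mq_queue_inflight(q);
}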
@@ -810,7 +855,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) | |||
810 | if (rq->rq_flags & RQF_TIMED_OUT) | 855 | if (rq->rq_flags & RQF_TIMED_OUT) |
811 | return false; | 856 | return false; |
812 | 857 | ||
813 | deadline = blk_rq_deadline(rq); | 858 | deadline = READ_ONCE(rq->deadline); |
814 | if (time_after_eq(jiffies, deadline)) | 859 | if (time_after_eq(jiffies, deadline)) |
815 | return true; | 860 | return true; |
816 | 861 | ||
@@ -821,7 +866,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) | |||
821 | return false; | 866 | return false; |
822 | } | 867 | } |
823 | 868 | ||
824 | static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | 869 | static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, |
825 | struct request *rq, void *priv, bool reserved) | 870 | struct request *rq, void *priv, bool reserved) |
826 | { | 871 | { |
827 | unsigned long *next = priv; | 872 | unsigned long *next = priv; |
@@ -831,7 +876,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | |||
831 | * so we're not unnecessarily synchronizing across CPUs. | 876 | * so we're not unnecessarily synchronizing across CPUs. |
832 | */ | 877 | */ |
833 | if (!blk_mq_req_expired(rq, next)) | 878 | if (!blk_mq_req_expired(rq, next)) |
834 | return; | 879 | return true; |
835 | 880 | ||
836 | /* | 881 | /* |
837 | * We have reason to believe the request may be expired. Take a | 882 | * We have reason to believe the request may be expired. Take a |
@@ -843,7 +888,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | |||
843 | * timeout handler to posting a natural completion. | 888 | * timeout handler to posting a natural completion. |
844 | */ | 889 | */ |
845 | if (!refcount_inc_not_zero(&rq->ref)) | 890 | if (!refcount_inc_not_zero(&rq->ref)) |
846 | return; | 891 | return true; |
847 | 892 | ||
848 | /* | 893 | /* |
849 | * The request is now locked and cannot be reallocated underneath the | 894 | * The request is now locked and cannot be reallocated underneath the |
@@ -855,6 +900,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | |||
855 | blk_mq_rq_timed_out(rq, reserved); | 900 | blk_mq_rq_timed_out(rq, reserved); |
856 | if (refcount_dec_and_test(&rq->ref)) | 901 | if (refcount_dec_and_test(&rq->ref)) |
857 | __blk_mq_free_request(rq); | 902 | __blk_mq_free_request(rq); |
903 | |||
904 | return true; | ||
858 | } | 905 | } |
859 | 906 | ||
860 | static void blk_mq_timeout_work(struct work_struct *work) | 907 | static void blk_mq_timeout_work(struct work_struct *work) |
@@ -911,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) | |||
911 | struct flush_busy_ctx_data *flush_data = data; | 958 | struct flush_busy_ctx_data *flush_data = data; |
912 | struct blk_mq_hw_ctx *hctx = flush_data->hctx; | 959 | struct blk_mq_hw_ctx *hctx = flush_data->hctx; |
913 | struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; | 960 | struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; |
961 | enum hctx_type type = hctx->type; | ||
914 | 962 | ||
915 | spin_lock(&ctx->lock); | 963 | spin_lock(&ctx->lock); |
916 | list_splice_tail_init(&ctx->rq_list, flush_data->list); | 964 | list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); |
917 | sbitmap_clear_bit(sb, bitnr); | 965 | sbitmap_clear_bit(sb, bitnr); |
918 | spin_unlock(&ctx->lock); | 966 | spin_unlock(&ctx->lock); |
919 | return true; | 967 | return true; |
@@ -945,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, | |||
945 | struct dispatch_rq_data *dispatch_data = data; | 993 | struct dispatch_rq_data *dispatch_data = data; |
946 | struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; | 994 | struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; |
947 | struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; | 995 | struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; |
996 | enum hctx_type type = hctx->type; | ||
948 | 997 | ||
949 | spin_lock(&ctx->lock); | 998 | spin_lock(&ctx->lock); |
950 | if (!list_empty(&ctx->rq_list)) { | 999 | if (!list_empty(&ctx->rq_lists[type])) { |
951 | dispatch_data->rq = list_entry_rq(ctx->rq_list.next); | 1000 | dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); |
952 | list_del_init(&dispatch_data->rq->queuelist); | 1001 | list_del_init(&dispatch_data->rq->queuelist); |
953 | if (list_empty(&ctx->rq_list)) | 1002 | if (list_empty(&ctx->rq_lists[type])) |
954 | sbitmap_clear_bit(sb, bitnr); | 1003 | sbitmap_clear_bit(sb, bitnr); |
955 | } | 1004 | } |
956 | spin_unlock(&ctx->lock); | 1005 | spin_unlock(&ctx->lock); |
@@ -961,7 +1010,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, | |||
961 | struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, | 1010 | struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, |
962 | struct blk_mq_ctx *start) | 1011 | struct blk_mq_ctx *start) |
963 | { | 1012 | { |
964 | unsigned off = start ? start->index_hw : 0; | 1013 | unsigned off = start ? start->index_hw[hctx->type] : 0; |
965 | struct dispatch_rq_data data = { | 1014 | struct dispatch_rq_data data = { |
966 | .hctx = hctx, | 1015 | .hctx = hctx, |
967 | .rq = NULL, | 1016 | .rq = NULL, |
@@ -985,8 +1034,9 @@ bool blk_mq_get_driver_tag(struct request *rq) | |||
985 | { | 1034 | { |
986 | struct blk_mq_alloc_data data = { | 1035 | struct blk_mq_alloc_data data = { |
987 | .q = rq->q, | 1036 | .q = rq->q, |
988 | .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), | 1037 | .hctx = rq->mq_hctx, |
989 | .flags = BLK_MQ_REQ_NOWAIT, | 1038 | .flags = BLK_MQ_REQ_NOWAIT, |
1039 | .cmd_flags = rq->cmd_flags, | ||
990 | }; | 1040 | }; |
991 | bool shared; | 1041 | bool shared; |
992 | 1042 | ||
@@ -1150,7 +1200,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1150 | 1200 | ||
1151 | rq = list_first_entry(list, struct request, queuelist); | 1201 | rq = list_first_entry(list, struct request, queuelist); |
1152 | 1202 | ||
1153 | hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); | 1203 | hctx = rq->mq_hctx; |
1154 | if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) | 1204 | if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) |
1155 | break; | 1205 | break; |
1156 | 1206 | ||
@@ -1223,6 +1273,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1223 | if (!list_empty(list)) { | 1273 | if (!list_empty(list)) { |
1224 | bool needs_restart; | 1274 | bool needs_restart; |
1225 | 1275 | ||
1276 | /* | ||
1277 | * If we didn't flush the entire list, we could have told | ||
1278 | * the driver there was more coming, but that turned out to | ||
1279 | * be a lie. | ||
1280 | */ | ||
1281 | if (q->mq_ops->commit_rqs) | ||
1282 | q->mq_ops->commit_rqs(hctx); | ||
1283 | |||
1226 | spin_lock(&hctx->lock); | 1284 | spin_lock(&hctx->lock); |
1227 | list_splice_init(list, &hctx->dispatch); | 1285 | list_splice_init(list, &hctx->dispatch); |
1228 | spin_unlock(&hctx->lock); | 1286 | spin_unlock(&hctx->lock); |
@@ -1552,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, | |||
1552 | bool at_head) | 1610 | bool at_head) |
1553 | { | 1611 | { |
1554 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 1612 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
1613 | enum hctx_type type = hctx->type; | ||
1555 | 1614 | ||
1556 | lockdep_assert_held(&ctx->lock); | 1615 | lockdep_assert_held(&ctx->lock); |
1557 | 1616 | ||
1558 | trace_block_rq_insert(hctx->queue, rq); | 1617 | trace_block_rq_insert(hctx->queue, rq); |
1559 | 1618 | ||
1560 | if (at_head) | 1619 | if (at_head) |
1561 | list_add(&rq->queuelist, &ctx->rq_list); | 1620 | list_add(&rq->queuelist, &ctx->rq_lists[type]); |
1562 | else | 1621 | else |
1563 | list_add_tail(&rq->queuelist, &ctx->rq_list); | 1622 | list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); |
1564 | } | 1623 | } |
1565 | 1624 | ||
1566 | void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | 1625 | void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, |
@@ -1580,8 +1639,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | |||
1580 | */ | 1639 | */ |
1581 | void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) | 1640 | void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) |
1582 | { | 1641 | { |
1583 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 1642 | struct blk_mq_hw_ctx *hctx = rq->mq_hctx; |
1584 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); | ||
1585 | 1643 | ||
1586 | spin_lock(&hctx->lock); | 1644 | spin_lock(&hctx->lock); |
1587 | list_add_tail(&rq->queuelist, &hctx->dispatch); | 1645 | list_add_tail(&rq->queuelist, &hctx->dispatch); |
@@ -1596,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | |||
1596 | 1654 | ||
1597 | { | 1655 | { |
1598 | struct request *rq; | 1656 | struct request *rq; |
1657 | enum hctx_type type = hctx->type; | ||
1599 | 1658 | ||
1600 | /* | 1659 | /* |
1601 | * preemption doesn't flush plug list, so it's possible ctx->cpu is | 1660 | * preemption doesn't flush plug list, so it's possible ctx->cpu is |
@@ -1607,35 +1666,46 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | |||
1607 | } | 1666 | } |
1608 | 1667 | ||
1609 | spin_lock(&ctx->lock); | 1668 | spin_lock(&ctx->lock); |
1610 | list_splice_tail_init(list, &ctx->rq_list); | 1669 | list_splice_tail_init(list, &ctx->rq_lists[type]); |
1611 | blk_mq_hctx_mark_pending(hctx, ctx); | 1670 | blk_mq_hctx_mark_pending(hctx, ctx); |
1612 | spin_unlock(&ctx->lock); | 1671 | spin_unlock(&ctx->lock); |
1613 | } | 1672 | } |
1614 | 1673 | ||
1615 | static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) | 1674 | static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) |
1616 | { | 1675 | { |
1617 | struct request *rqa = container_of(a, struct request, queuelist); | 1676 | struct request *rqa = container_of(a, struct request, queuelist); |
1618 | struct request *rqb = container_of(b, struct request, queuelist); | 1677 | struct request *rqb = container_of(b, struct request, queuelist); |
1619 | 1678 | ||
1620 | return !(rqa->mq_ctx < rqb->mq_ctx || | 1679 | if (rqa->mq_ctx < rqb->mq_ctx) |
1621 | (rqa->mq_ctx == rqb->mq_ctx && | 1680 | return -1; |
1622 | blk_rq_pos(rqa) < blk_rq_pos(rqb))); | 1681 | else if (rqa->mq_ctx > rqb->mq_ctx) |
1682 | return 1; | ||
1683 | else if (rqa->mq_hctx < rqb->mq_hctx) | ||
1684 | return -1; | ||
1685 | else if (rqa->mq_hctx > rqb->mq_hctx) | ||
1686 | return 1; | ||
1687 | |||
1688 | return blk_rq_pos(rqa) > blk_rq_pos(rqb); | ||
1623 | } | 1689 | } |
1624 | 1690 | ||
1625 | void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) | 1691 | void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
1626 | { | 1692 | { |
1693 | struct blk_mq_hw_ctx *this_hctx; | ||
1627 | struct blk_mq_ctx *this_ctx; | 1694 | struct blk_mq_ctx *this_ctx; |
1628 | struct request_queue *this_q; | 1695 | struct request_queue *this_q; |
1629 | struct request *rq; | 1696 | struct request *rq; |
1630 | LIST_HEAD(list); | 1697 | LIST_HEAD(list); |
1631 | LIST_HEAD(ctx_list); | 1698 | LIST_HEAD(rq_list); |
1632 | unsigned int depth; | 1699 | unsigned int depth; |
1633 | 1700 | ||
1634 | list_splice_init(&plug->mq_list, &list); | 1701 | list_splice_init(&plug->mq_list, &list); |
1635 |  1702 | ||
1636 | list_sort(NULL, &list, plug_ctx_cmp); | 1703 | if (plug->rq_count > 2 && plug->multiple_queues) |
1704 | list_sort(NULL, &list, plug_rq_cmp); | ||
1705 | plug->rq_count = 0; | ||
1637 | 1706 | ||
1638 | this_q = NULL; | 1707 | this_q = NULL; |
1708 | this_hctx = NULL; | ||
1639 | this_ctx = NULL; | 1709 | this_ctx = NULL; |
1640 | depth = 0; | 1710 | depth = 0; |
1641 | 1711 | ||
@@ -1643,30 +1713,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) | |||
1643 | rq = list_entry_rq(list.next); | 1713 | rq = list_entry_rq(list.next); |
1644 | list_del_init(&rq->queuelist); | 1714 | list_del_init(&rq->queuelist); |
1645 | BUG_ON(!rq->q); | 1715 | BUG_ON(!rq->q); |
1646 | if (rq->mq_ctx != this_ctx) { | 1716 | if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) { |
1647 | if (this_ctx) { | 1717 | if (this_hctx) { |
1648 | trace_block_unplug(this_q, depth, !from_schedule); | 1718 | trace_block_unplug(this_q, depth, !from_schedule); |
1649 | blk_mq_sched_insert_requests(this_q, this_ctx, | 1719 | blk_mq_sched_insert_requests(this_hctx, this_ctx, |
1650 | &ctx_list, | 1720 | &rq_list, |
1651 | from_schedule); | 1721 | from_schedule); |
1652 | } | 1722 | } |
1653 | 1723 | ||
1654 | this_ctx = rq->mq_ctx; | ||
1655 | this_q = rq->q; | 1724 | this_q = rq->q; |
1725 | this_ctx = rq->mq_ctx; | ||
1726 | this_hctx = rq->mq_hctx; | ||
1656 | depth = 0; | 1727 | depth = 0; |
1657 | } | 1728 | } |
1658 | 1729 | ||
1659 | depth++; | 1730 | depth++; |
1660 | list_add_tail(&rq->queuelist, &ctx_list); | 1731 | list_add_tail(&rq->queuelist, &rq_list); |
1661 | } | 1732 | } |
1662 | 1733 | ||
1663 | /* | 1734 | /* |
1664 | * If 'this_ctx' is set, we know we have entries to complete | 1735 | * If 'this_hctx' is set, we know we have entries to complete |
1665 | * on 'ctx_list'. Do those. | 1736 | * on 'rq_list'. Do those. |
1666 | */ | 1737 | */ |
1667 | if (this_ctx) { | 1738 | if (this_hctx) { |
1668 | trace_block_unplug(this_q, depth, !from_schedule); | 1739 | trace_block_unplug(this_q, depth, !from_schedule); |
1669 | blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, | 1740 | blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, |
1670 | from_schedule); | 1741 | from_schedule); |
1671 | } | 1742 | } |
1672 | } | 1743 | } |
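plug_rq_cmp() above orders plugged requests by software queue, then hardware queue, then starting sector, so the loop that follows hands each contiguous (hctx, ctx) run to blk_mq_sched_insert_requests() as one batch. As an illustrative example (values assumed): after sorting, two requests for the same software queue at sectors 0 and 16 sit next to each other and are inserted as a single batch, ahead of a request destined for a different software queue.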
@@ -1675,27 +1746,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) | |||
1675 | { | 1746 | { |
1676 | blk_init_request_from_bio(rq, bio); | 1747 | blk_init_request_from_bio(rq, bio); |
1677 | 1748 | ||
1678 | blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); | ||
1679 | |||
1680 | blk_account_io_start(rq, true); | 1749 | blk_account_io_start(rq, true); |
1681 | } | 1750 | } |
1682 | 1751 | ||
1683 | static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) | ||
1684 | { | ||
1685 | if (rq->tag != -1) | ||
1686 | return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); | ||
1687 | |||
1688 | return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); | ||
1689 | } | ||
1690 | |||
1691 | static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, | 1752 | static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, |
1692 | struct request *rq, | 1753 | struct request *rq, |
1693 | blk_qc_t *cookie) | 1754 | blk_qc_t *cookie, bool last) |
1694 | { | 1755 | { |
1695 | struct request_queue *q = rq->q; | 1756 | struct request_queue *q = rq->q; |
1696 | struct blk_mq_queue_data bd = { | 1757 | struct blk_mq_queue_data bd = { |
1697 | .rq = rq, | 1758 | .rq = rq, |
1698 | .last = true, | 1759 | .last = last, |
1699 | }; | 1760 | }; |
1700 | blk_qc_t new_cookie; | 1761 | blk_qc_t new_cookie; |
1701 | blk_status_t ret; | 1762 | blk_status_t ret; |
@@ -1727,77 +1788,74 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, | |||
1727 | return ret; | 1788 | return ret; |
1728 | } | 1789 | } |
1729 | 1790 | ||
1730 | static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | 1791 | blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, |
1731 | struct request *rq, | 1792 | struct request *rq, |
1732 | blk_qc_t *cookie, | 1793 | blk_qc_t *cookie, |
1733 | bool bypass_insert) | 1794 | bool bypass, bool last) |
1734 | { | 1795 | { |
1735 | struct request_queue *q = rq->q; | 1796 | struct request_queue *q = rq->q; |
1736 | bool run_queue = true; | 1797 | bool run_queue = true; |
1798 | blk_status_t ret = BLK_STS_RESOURCE; | ||
1799 | int srcu_idx; | ||
1800 | bool force = false; | ||
1737 | 1801 | ||
1802 | hctx_lock(hctx, &srcu_idx); | ||
1738 | /* | 1803 | /* |
1739 | * RCU or SRCU read lock is needed before checking quiesced flag. | 1804 | * hctx_lock is needed before checking quiesced flag. |
1740 | * | 1805 | * |
1741 | * When queue is stopped or quiesced, ignore 'bypass_insert' from | 1806 | * When queue is stopped or quiesced, ignore 'bypass', insert |
1742 | * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, | 1807 | * and return BLK_STS_OK to caller, and avoid driver to try to |
1743 | * and avoid driver to try to dispatch again. | 1808 | * dispatch again. |
1744 | */ | 1809 | */ |
1745 | if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { | 1810 | if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { |
1746 | run_queue = false; | 1811 | run_queue = false; |
1747 | bypass_insert = false; | 1812 | bypass = false; |
1748 | goto insert; | 1813 | goto out_unlock; |
1749 | } | 1814 | } |
1750 | 1815 | ||
1751 | if (q->elevator && !bypass_insert) | 1816 | if (unlikely(q->elevator && !bypass)) |
1752 | goto insert; | 1817 | goto out_unlock; |
1753 | 1818 | ||
1754 | if (!blk_mq_get_dispatch_budget(hctx)) | 1819 | if (!blk_mq_get_dispatch_budget(hctx)) |
1755 | goto insert; | 1820 | goto out_unlock; |
1756 | 1821 | ||
1757 | if (!blk_mq_get_driver_tag(rq)) { | 1822 | if (!blk_mq_get_driver_tag(rq)) { |
1758 | blk_mq_put_dispatch_budget(hctx); | 1823 | blk_mq_put_dispatch_budget(hctx); |
1759 | goto insert; | 1824 | goto out_unlock; |
1760 | } | 1825 | } |
1761 | 1826 | ||
1762 | return __blk_mq_issue_directly(hctx, rq, cookie); | 1827 | /* |
1763 | insert: | 1828 | * Always add a request that has been through |
1764 | if (bypass_insert) | 1829 | * .queue_rq() to the hardware dispatch list. |
1765 | return BLK_STS_RESOURCE; | 1830 | */ |
1766 | 1831 | force = true; | |
1767 | blk_mq_sched_insert_request(rq, false, run_queue, false); | 1832 | ret = __blk_mq_issue_directly(hctx, rq, cookie, last); |
1768 | return BLK_STS_OK; | 1833 | out_unlock: |
1769 | } | ||
1770 | |||
1771 | static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | ||
1772 | struct request *rq, blk_qc_t *cookie) | ||
1773 | { | ||
1774 | blk_status_t ret; | ||
1775 | int srcu_idx; | ||
1776 | |||
1777 | might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); | ||
1778 | |||
1779 | hctx_lock(hctx, &srcu_idx); | ||
1780 | |||
1781 | ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); | ||
1782 | if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) | ||
1783 | blk_mq_sched_insert_request(rq, false, true, false); | ||
1784 | else if (ret != BLK_STS_OK) | ||
1785 | blk_mq_end_request(rq, ret); | ||
1786 | |||
1787 | hctx_unlock(hctx, srcu_idx); | ||
1788 | } | ||
1789 | |||
1790 | blk_status_t blk_mq_request_issue_directly(struct request *rq) | ||
1791 | { | ||
1792 | blk_status_t ret; | ||
1793 | int srcu_idx; | ||
1794 | blk_qc_t unused_cookie; | ||
1795 | struct blk_mq_ctx *ctx = rq->mq_ctx; | ||
1796 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); | ||
1797 | |||
1798 | hctx_lock(hctx, &srcu_idx); | ||
1799 | ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); | ||
1800 | hctx_unlock(hctx, srcu_idx); | 1834 | hctx_unlock(hctx, srcu_idx); |
1835 | switch (ret) { | ||
1836 | case BLK_STS_OK: | ||
1837 | break; | ||
1838 | case BLK_STS_DEV_RESOURCE: | ||
1839 | case BLK_STS_RESOURCE: | ||
1840 | if (force) { | ||
1841 | blk_mq_request_bypass_insert(rq, run_queue); | ||
1842 | /* | ||
1843 | * We have to return BLK_STS_OK for the DM | ||
1844 | * to avoid livelock. Otherwise, we return | ||
1845 | * the real result to indicate whether the | ||
1846 | * request is direct-issued successfully. | ||
1847 | */ | ||
1848 | ret = bypass ? BLK_STS_OK : ret; | ||
1849 | } else if (!bypass) { | ||
1850 | blk_mq_sched_insert_request(rq, false, | ||
1851 | run_queue, false); | ||
1852 | } | ||
1853 | break; | ||
1854 | default: | ||
1855 | if (!bypass) | ||
1856 | blk_mq_end_request(rq, ret); | ||
1857 | break; | ||
1858 | } | ||
1801 | 1859 | ||
1802 | return ret; | 1860 | return ret; |
1803 | } | 1861 | } |
@@ -1805,21 +1863,42 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) | |||
1805 | void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, | 1863 | void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, |
1806 | struct list_head *list) | 1864 | struct list_head *list) |
1807 | { | 1865 | { |
1866 | blk_qc_t unused; | ||
1867 | blk_status_t ret = BLK_STS_OK; | ||
1868 | |||
1808 | while (!list_empty(list)) { | 1869 | while (!list_empty(list)) { |
1809 | blk_status_t ret; | ||
1810 | struct request *rq = list_first_entry(list, struct request, | 1870 | struct request *rq = list_first_entry(list, struct request, |
1811 | queuelist); | 1871 | queuelist); |
1812 | 1872 | ||
1813 | list_del_init(&rq->queuelist); | 1873 | list_del_init(&rq->queuelist); |
1814 | ret = blk_mq_request_issue_directly(rq); | 1874 | if (ret == BLK_STS_OK) |
1815 | if (ret != BLK_STS_OK) { | 1875 | ret = blk_mq_try_issue_directly(hctx, rq, &unused, |
1816 | if (ret == BLK_STS_RESOURCE || | 1876 | false, |
1817 | ret == BLK_STS_DEV_RESOURCE) { | 1877 | list_empty(list)); |
1818 | list_add(&rq->queuelist, list); | 1878 | else |
1819 | break; | 1879 | blk_mq_sched_insert_request(rq, false, true, false); |
1820 | } | 1880 | } |
1821 | blk_mq_end_request(rq, ret); | 1881 | |
1822 | } | 1882 | /* |
1883 | * If we didn't flush the entire list, we could have told | ||
1884 | * the driver there was more coming, but that turned out to | ||
1885 | * be a lie. | ||
1886 | */ | ||
1887 | if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) | ||
1888 | hctx->queue->mq_ops->commit_rqs(hctx); | ||
1889 | } | ||
1890 | |||
1891 | static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) | ||
1892 | { | ||
1893 | list_add_tail(&rq->queuelist, &plug->mq_list); | ||
1894 | plug->rq_count++; | ||
1895 | if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { | ||
1896 | struct request *tmp; | ||
1897 | |||
1898 | tmp = list_first_entry(&plug->mq_list, struct request, | ||
1899 | queuelist); | ||
1900 | if (tmp->q != rq->q) | ||
1901 | plug->multiple_queues = true; | ||
1823 | } | 1902 | } |
1824 | } | 1903 | } |
1825 | 1904 | ||
@@ -1827,9 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1827 | { | 1906 | { |
1828 | const int is_sync = op_is_sync(bio->bi_opf); | 1907 | const int is_sync = op_is_sync(bio->bi_opf); |
1829 | const int is_flush_fua = op_is_flush(bio->bi_opf); | 1908 | const int is_flush_fua = op_is_flush(bio->bi_opf); |
1830 | struct blk_mq_alloc_data data = { .flags = 0 }; | 1909 | struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf }; |
1831 | struct request *rq; | 1910 | struct request *rq; |
1832 | unsigned int request_count = 0; | ||
1833 | struct blk_plug *plug; | 1911 | struct blk_plug *plug; |
1834 | struct request *same_queue_rq = NULL; | 1912 | struct request *same_queue_rq = NULL; |
1835 | blk_qc_t cookie; | 1913 | blk_qc_t cookie; |
@@ -1842,15 +1920,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1842 | return BLK_QC_T_NONE; | 1920 | return BLK_QC_T_NONE; |
1843 | 1921 | ||
1844 | if (!is_flush_fua && !blk_queue_nomerges(q) && | 1922 | if (!is_flush_fua && !blk_queue_nomerges(q) && |
1845 | blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) | 1923 | blk_attempt_plug_merge(q, bio, &same_queue_rq)) |
1846 | return BLK_QC_T_NONE; | 1924 | return BLK_QC_T_NONE; |
1847 | 1925 | ||
1848 | if (blk_mq_sched_bio_merge(q, bio)) | 1926 | if (blk_mq_sched_bio_merge(q, bio)) |
1849 | return BLK_QC_T_NONE; | 1927 | return BLK_QC_T_NONE; |
1850 | 1928 | ||
1851 | rq_qos_throttle(q, bio, NULL); | 1929 | rq_qos_throttle(q, bio); |
1852 | 1930 | ||
1853 | rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); | 1931 | rq = blk_mq_get_request(q, bio, &data); |
1854 | if (unlikely(!rq)) { | 1932 | if (unlikely(!rq)) { |
1855 | rq_qos_cleanup(q, bio); | 1933 | rq_qos_cleanup(q, bio); |
1856 | if (bio->bi_opf & REQ_NOWAIT) | 1934 | if (bio->bi_opf & REQ_NOWAIT) |
@@ -1872,21 +1950,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1872 | /* bypass scheduler for flush rq */ | 1950 | /* bypass scheduler for flush rq */ |
1873 | blk_insert_flush(rq); | 1951 | blk_insert_flush(rq); |
1874 | blk_mq_run_hw_queue(data.hctx, true); | 1952 | blk_mq_run_hw_queue(data.hctx, true); |
1875 | } else if (plug && q->nr_hw_queues == 1) { | 1953 | } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { |
1954 | /* | ||
1955 | * Use plugging if we have a ->commit_rqs() hook as well, as | ||
1956 | * we know the driver uses bd->last in a smart fashion. | ||
1957 | */ | ||
1958 | unsigned int request_count = plug->rq_count; | ||
1876 | struct request *last = NULL; | 1959 | struct request *last = NULL; |
1877 | 1960 | ||
1878 | blk_mq_put_ctx(data.ctx); | 1961 | blk_mq_put_ctx(data.ctx); |
1879 | blk_mq_bio_to_request(rq, bio); | 1962 | blk_mq_bio_to_request(rq, bio); |
1880 | 1963 | ||
1881 | /* | ||
1882 | * @request_count may become stale because of schedule | ||
1883 | * out, so check the list again. | ||
1884 | */ | ||
1885 | if (list_empty(&plug->mq_list)) | ||
1886 | request_count = 0; | ||
1887 | else if (blk_queue_nomerges(q)) | ||
1888 | request_count = blk_plug_queued_count(q); | ||
1889 | |||
1890 | if (!request_count) | 1964 | if (!request_count) |
1891 | trace_block_plug(q); | 1965 | trace_block_plug(q); |
1892 | else | 1966 | else |
@@ -1898,7 +1972,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1898 | trace_block_plug(q); | 1972 | trace_block_plug(q); |
1899 | } | 1973 | } |
1900 | 1974 | ||
1901 | list_add_tail(&rq->queuelist, &plug->mq_list); | 1975 | blk_add_rq_to_plug(plug, rq); |
1902 | } else if (plug && !blk_queue_nomerges(q)) { | 1976 | } else if (plug && !blk_queue_nomerges(q)) { |
1903 | blk_mq_bio_to_request(rq, bio); | 1977 | blk_mq_bio_to_request(rq, bio); |
1904 | 1978 | ||
@@ -1911,23 +1985,24 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1911 | */ | 1985 | */ |
1912 | if (list_empty(&plug->mq_list)) | 1986 | if (list_empty(&plug->mq_list)) |
1913 | same_queue_rq = NULL; | 1987 | same_queue_rq = NULL; |
1914 | if (same_queue_rq) | 1988 | if (same_queue_rq) { |
1915 | list_del_init(&same_queue_rq->queuelist); | 1989 | list_del_init(&same_queue_rq->queuelist); |
1916 | list_add_tail(&rq->queuelist, &plug->mq_list); | 1990 | plug->rq_count--; |
1991 | } | ||
1992 | blk_add_rq_to_plug(plug, rq); | ||
1917 | 1993 | ||
1918 | blk_mq_put_ctx(data.ctx); | 1994 | blk_mq_put_ctx(data.ctx); |
1919 | 1995 | ||
1920 | if (same_queue_rq) { | 1996 | if (same_queue_rq) { |
1921 | data.hctx = blk_mq_map_queue(q, | 1997 | data.hctx = same_queue_rq->mq_hctx; |
1922 | same_queue_rq->mq_ctx->cpu); | ||
1923 | blk_mq_try_issue_directly(data.hctx, same_queue_rq, | 1998 | blk_mq_try_issue_directly(data.hctx, same_queue_rq, |
1924 | &cookie); | 1999 | &cookie, false, true); |
1925 | } | 2000 | } |
1926 | } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && | 2001 | } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && |
1927 | !data.hctx->dispatch_busy)) { | 2002 | !data.hctx->dispatch_busy)) { |
1928 | blk_mq_put_ctx(data.ctx); | 2003 | blk_mq_put_ctx(data.ctx); |
1929 | blk_mq_bio_to_request(rq, bio); | 2004 | blk_mq_bio_to_request(rq, bio); |
1930 | blk_mq_try_issue_directly(data.hctx, rq, &cookie); | 2005 | blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); |
1931 | } else { | 2006 | } else { |
1932 | blk_mq_put_ctx(data.ctx); | 2007 | blk_mq_put_ctx(data.ctx); |
1933 | blk_mq_bio_to_request(rq, bio); | 2008 | blk_mq_bio_to_request(rq, bio); |
@@ -1985,7 +2060,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, | |||
1985 | struct blk_mq_tags *tags; | 2060 | struct blk_mq_tags *tags; |
1986 | int node; | 2061 | int node; |
1987 | 2062 | ||
1988 | node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); | 2063 | node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); |
1989 | if (node == NUMA_NO_NODE) | 2064 | if (node == NUMA_NO_NODE) |
1990 | node = set->numa_node; | 2065 | node = set->numa_node; |
1991 | 2066 | ||
@@ -2041,7 +2116,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, | |||
2041 | size_t rq_size, left; | 2116 | size_t rq_size, left; |
2042 | int node; | 2117 | int node; |
2043 | 2118 | ||
2044 | node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); | 2119 | node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); |
2045 | if (node == NUMA_NO_NODE) | 2120 | if (node == NUMA_NO_NODE) |
2046 | node = set->numa_node; | 2121 | node = set->numa_node; |
2047 | 2122 | ||
@@ -2121,13 +2196,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) | |||
2121 | struct blk_mq_hw_ctx *hctx; | 2196 | struct blk_mq_hw_ctx *hctx; |
2122 | struct blk_mq_ctx *ctx; | 2197 | struct blk_mq_ctx *ctx; |
2123 | LIST_HEAD(tmp); | 2198 | LIST_HEAD(tmp); |
2199 | enum hctx_type type; | ||
2124 | 2200 | ||
2125 | hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); | 2201 | hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); |
2126 | ctx = __blk_mq_get_ctx(hctx->queue, cpu); | 2202 | ctx = __blk_mq_get_ctx(hctx->queue, cpu); |
2203 | type = hctx->type; | ||
2127 | 2204 | ||
2128 | spin_lock(&ctx->lock); | 2205 | spin_lock(&ctx->lock); |
2129 | if (!list_empty(&ctx->rq_list)) { | 2206 | if (!list_empty(&ctx->rq_lists[type])) { |
2130 | list_splice_init(&ctx->rq_list, &tmp); | 2207 | list_splice_init(&ctx->rq_lists[type], &tmp); |
2131 | blk_mq_hctx_clear_pending(hctx, ctx); | 2208 | blk_mq_hctx_clear_pending(hctx, ctx); |
2132 | } | 2209 | } |
2133 | spin_unlock(&ctx->lock); | 2210 | spin_unlock(&ctx->lock); |
@@ -2258,24 +2335,30 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
2258 | static void blk_mq_init_cpu_queues(struct request_queue *q, | 2335 | static void blk_mq_init_cpu_queues(struct request_queue *q, |
2259 | unsigned int nr_hw_queues) | 2336 | unsigned int nr_hw_queues) |
2260 | { | 2337 | { |
2261 | unsigned int i; | 2338 | struct blk_mq_tag_set *set = q->tag_set; |
2339 | unsigned int i, j; | ||
2262 | 2340 | ||
2263 | for_each_possible_cpu(i) { | 2341 | for_each_possible_cpu(i) { |
2264 | struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); | 2342 | struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); |
2265 | struct blk_mq_hw_ctx *hctx; | 2343 | struct blk_mq_hw_ctx *hctx; |
2344 | int k; | ||
2266 | 2345 | ||
2267 | __ctx->cpu = i; | 2346 | __ctx->cpu = i; |
2268 | spin_lock_init(&__ctx->lock); | 2347 | spin_lock_init(&__ctx->lock); |
2269 | INIT_LIST_HEAD(&__ctx->rq_list); | 2348 | for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) |
2349 | INIT_LIST_HEAD(&__ctx->rq_lists[k]); | ||
2350 | |||
2270 | __ctx->queue = q; | 2351 | __ctx->queue = q; |
2271 | 2352 | ||
2272 | /* | 2353 | /* |
2273 | * Set local node, IFF we have more than one hw queue. If | 2354 | * Set local node, IFF we have more than one hw queue. If |
2274 | * not, we remain on the home node of the device | 2355 | * not, we remain on the home node of the device |
2275 | */ | 2356 | */ |
2276 | hctx = blk_mq_map_queue(q, i); | 2357 | for (j = 0; j < set->nr_maps; j++) { |
2277 | if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) | 2358 | hctx = blk_mq_map_queue_type(q, j, i); |
2278 | hctx->numa_node = local_memory_node(cpu_to_node(i)); | 2359 | if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) |
2360 | hctx->numa_node = local_memory_node(cpu_to_node(i)); | ||
2361 | } | ||
2279 | } | 2362 | } |
2280 | } | 2363 | } |
2281 | 2364 | ||
@@ -2301,7 +2384,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) | |||
2301 | static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, | 2384 | static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, |
2302 | unsigned int hctx_idx) | 2385 | unsigned int hctx_idx) |
2303 | { | 2386 | { |
2304 | if (set->tags[hctx_idx]) { | 2387 | if (set->tags && set->tags[hctx_idx]) { |
2305 | blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); | 2388 | blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); |
2306 | blk_mq_free_rq_map(set->tags[hctx_idx]); | 2389 | blk_mq_free_rq_map(set->tags[hctx_idx]); |
2307 | set->tags[hctx_idx] = NULL; | 2390 | set->tags[hctx_idx] = NULL; |
@@ -2310,7 +2393,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, | |||
2310 | 2393 | ||
2311 | static void blk_mq_map_swqueue(struct request_queue *q) | 2394 | static void blk_mq_map_swqueue(struct request_queue *q) |
2312 | { | 2395 | { |
2313 | unsigned int i, hctx_idx; | 2396 | unsigned int i, j, hctx_idx; |
2314 | struct blk_mq_hw_ctx *hctx; | 2397 | struct blk_mq_hw_ctx *hctx; |
2315 | struct blk_mq_ctx *ctx; | 2398 | struct blk_mq_ctx *ctx; |
2316 | struct blk_mq_tag_set *set = q->tag_set; | 2399 | struct blk_mq_tag_set *set = q->tag_set; |
@@ -2332,7 +2415,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) | |||
2332 | * If the cpu isn't present, the cpu is mapped to first hctx. | 2415 | * If the cpu isn't present, the cpu is mapped to first hctx. |
2333 | */ | 2416 | */ |
2334 | for_each_possible_cpu(i) { | 2417 | for_each_possible_cpu(i) { |
2335 | hctx_idx = q->mq_map[i]; | 2418 | hctx_idx = set->map[0].mq_map[i]; |
2336 | /* unmapped hw queue can be remapped after CPU topo changed */ | 2419 | /* unmapped hw queue can be remapped after CPU topo changed */ |
2337 | if (!set->tags[hctx_idx] && | 2420 | if (!set->tags[hctx_idx] && |
2338 | !__blk_mq_alloc_rq_map(set, hctx_idx)) { | 2421 | !__blk_mq_alloc_rq_map(set, hctx_idx)) { |
@@ -2342,15 +2425,35 @@ static void blk_mq_map_swqueue(struct request_queue *q) | |||
2342 | * case, remap the current ctx to hctx[0] which | 2425 | * case, remap the current ctx to hctx[0] which |
2343 | * is guaranteed to always have tags allocated | 2426 | * is guaranteed to always have tags allocated |
2344 | */ | 2427 | */ |
2345 | q->mq_map[i] = 0; | 2428 | set->map[0].mq_map[i] = 0; |
2346 | } | 2429 | } |
2347 | 2430 | ||
2348 | ctx = per_cpu_ptr(q->queue_ctx, i); | 2431 | ctx = per_cpu_ptr(q->queue_ctx, i); |
2349 | hctx = blk_mq_map_queue(q, i); | 2432 | for (j = 0; j < set->nr_maps; j++) { |
2433 | if (!set->map[j].nr_queues) | ||
2434 | continue; | ||
2435 | |||
2436 | hctx = blk_mq_map_queue_type(q, j, i); | ||
2437 | |||
2438 | /* | ||
2439 | * If the CPU is already set in the mask, then we've | ||
2440 | * mapped this one already. This can happen if | ||
2441 | * devices share queues across queue maps. | ||
2442 | */ | ||
2443 | if (cpumask_test_cpu(i, hctx->cpumask)) | ||
2444 | continue; | ||
2445 | |||
2446 | cpumask_set_cpu(i, hctx->cpumask); | ||
2447 | hctx->type = j; | ||
2448 | ctx->index_hw[hctx->type] = hctx->nr_ctx; | ||
2449 | hctx->ctxs[hctx->nr_ctx++] = ctx; | ||
2350 | 2450 | ||
2351 | cpumask_set_cpu(i, hctx->cpumask); | 2451 | /* |
2352 | ctx->index_hw = hctx->nr_ctx; | 2452 | * If the nr_ctx type overflows, we have exceeded the |
2353 | hctx->ctxs[hctx->nr_ctx++] = ctx; | 2453 | * amount of sw queues we can support. |
2454 | */ | ||
2455 | BUG_ON(!hctx->nr_ctx); | ||
2456 | } | ||
2354 | } | 2457 | } |
2355 | 2458 | ||
2356 | mutex_unlock(&q->sysfs_lock); | 2459 | mutex_unlock(&q->sysfs_lock); |
@@ -2440,8 +2543,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) | |||
2440 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | 2543 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, |
2441 | struct request_queue *q) | 2544 | struct request_queue *q) |
2442 | { | 2545 | { |
2443 | q->tag_set = set; | ||
2444 | |||
2445 | mutex_lock(&set->tag_list_lock); | 2546 | mutex_lock(&set->tag_list_lock); |
2446 | 2547 | ||
2447 | /* | 2548 | /* |
@@ -2460,6 +2561,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | |||
2460 | mutex_unlock(&set->tag_list_lock); | 2561 | mutex_unlock(&set->tag_list_lock); |
2461 | } | 2562 | } |
2462 | 2563 | ||
2564 | /* All allocations will be freed in release handler of q->mq_kobj */ | ||
2565 | static int blk_mq_alloc_ctxs(struct request_queue *q) | ||
2566 | { | ||
2567 | struct blk_mq_ctxs *ctxs; | ||
2568 | int cpu; | ||
2569 | |||
2570 | ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); | ||
2571 | if (!ctxs) | ||
2572 | return -ENOMEM; | ||
2573 | |||
2574 | ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); | ||
2575 | if (!ctxs->queue_ctx) | ||
2576 | goto fail; | ||
2577 | |||
2578 | for_each_possible_cpu(cpu) { | ||
2579 | struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); | ||
2580 | ctx->ctxs = ctxs; | ||
2581 | } | ||
2582 | |||
2583 | q->mq_kobj = &ctxs->kobj; | ||
2584 | q->queue_ctx = ctxs->queue_ctx; | ||
2585 | |||
2586 | return 0; | ||
2587 | fail: | ||
2588 | kfree(ctxs); | ||
2589 | return -ENOMEM; | ||
2590 | } | ||
2591 | |||
2463 | /* | 2592 | /* |
2464 | * It is the actual release handler for mq, but we do it from | 2593 | * It is the actual release handler for mq, but we do it from |
2465 | * request queue's release handler for avoiding use-after-free | 2594 | * request queue's release handler for avoiding use-after-free |
@@ -2478,8 +2607,6 @@ void blk_mq_release(struct request_queue *q) | |||
2478 | kobject_put(&hctx->kobj); | 2607 | kobject_put(&hctx->kobj); |
2479 | } | 2608 | } |
2480 | 2609 | ||
2481 | q->mq_map = NULL; | ||
2482 | |||
2483 | kfree(q->queue_hw_ctx); | 2610 | kfree(q->queue_hw_ctx); |
2484 | 2611 | ||
2485 | /* | 2612 | /* |
@@ -2487,15 +2614,13 @@ void blk_mq_release(struct request_queue *q) | |||
2487 | * both share lifetime with request queue. | 2614 | * both share lifetime with request queue. |
2488 | */ | 2615 | */ |
2489 | blk_mq_sysfs_deinit(q); | 2616 | blk_mq_sysfs_deinit(q); |
2490 | |||
2491 | free_percpu(q->queue_ctx); | ||
2492 | } | 2617 | } |
2493 | 2618 | ||
2494 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | 2619 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) |
2495 | { | 2620 | { |
2496 | struct request_queue *uninit_q, *q; | 2621 | struct request_queue *uninit_q, *q; |
2497 | 2622 | ||
2498 | uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); | 2623 | uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); |
2499 | if (!uninit_q) | 2624 | if (!uninit_q) |
2500 | return ERR_PTR(-ENOMEM); | 2625 | return ERR_PTR(-ENOMEM); |
2501 | 2626 | ||
@@ -2522,6 +2647,7 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, | |||
2522 | memset(set, 0, sizeof(*set)); | 2647 | memset(set, 0, sizeof(*set)); |
2523 | set->ops = ops; | 2648 | set->ops = ops; |
2524 | set->nr_hw_queues = 1; | 2649 | set->nr_hw_queues = 1; |
2650 | set->nr_maps = 1; | ||
2525 | set->queue_depth = queue_depth; | 2651 | set->queue_depth = queue_depth; |
2526 | set->numa_node = NUMA_NO_NODE; | 2652 | set->numa_node = NUMA_NO_NODE; |
2527 | set->flags = set_flags; | 2653 | set->flags = set_flags; |
@@ -2599,7 +2725,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, | |||
2599 | int node; | 2725 | int node; |
2600 | struct blk_mq_hw_ctx *hctx; | 2726 | struct blk_mq_hw_ctx *hctx; |
2601 | 2727 | ||
2602 | node = blk_mq_hw_queue_to_node(q->mq_map, i); | 2728 | node = blk_mq_hw_queue_to_node(&set->map[0], i); |
2603 | /* | 2729 | /* |
2604 | * If the hw queue has been mapped to another numa node, | 2730 | * If the hw queue has been mapped to another numa node, |
2605 | * we need to realloc the hctx. If allocation fails, fallback | 2731 | * we need to realloc the hctx. If allocation fails, fallback |
@@ -2652,6 +2778,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, | |||
2652 | mutex_unlock(&q->sysfs_lock); | 2778 | mutex_unlock(&q->sysfs_lock); |
2653 | } | 2779 | } |
2654 | 2780 | ||
2781 | /* | ||
2782 | * Maximum number of hardware queues we support. For single sets, we'll never | ||
2783 | * have more than the CPUs (software queues). For multiple sets, the tag_set | ||
2784 | * user may have set ->nr_hw_queues larger. | ||
2785 | */ | ||
2786 | static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) | ||
2787 | { | ||
2788 | if (set->nr_maps == 1) | ||
2789 | return nr_cpu_ids; | ||
2790 | |||
2791 | return max(set->nr_hw_queues, nr_cpu_ids); | ||
2792 | } | ||
2793 | |||
2655 | struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | 2794 | struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, |
2656 | struct request_queue *q) | 2795 | struct request_queue *q) |
2657 | { | 2796 | { |
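As an illustrative sizing example for the nr_hw_queues() helper above (numbers assumed): on a system with 32 possible CPUs, a single-map tag set gets room for 32 hardware queue pointers, while a set with nr_maps = 3 and nr_hw_queues = 96 gets max(96, 32) = 96.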
@@ -2664,19 +2803,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2664 | if (!q->poll_cb) | 2803 | if (!q->poll_cb) |
2665 | goto err_exit; | 2804 | goto err_exit; |
2666 | 2805 | ||
2667 | q->queue_ctx = alloc_percpu(struct blk_mq_ctx); | 2806 | if (blk_mq_alloc_ctxs(q)) |
2668 | if (!q->queue_ctx) | ||
2669 | goto err_exit; | 2807 | goto err_exit; |
2670 | 2808 | ||
2671 | /* init q->mq_kobj and sw queues' kobjects */ | 2809 | /* init q->mq_kobj and sw queues' kobjects */ |
2672 | blk_mq_sysfs_init(q); | 2810 | blk_mq_sysfs_init(q); |
2673 | 2811 | ||
2674 | q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), | 2812 | q->nr_queues = nr_hw_queues(set); |
2813 | q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), | ||
2675 | GFP_KERNEL, set->numa_node); | 2814 | GFP_KERNEL, set->numa_node); |
2676 | if (!q->queue_hw_ctx) | 2815 | if (!q->queue_hw_ctx) |
2677 | goto err_percpu; | 2816 | goto err_sys_init; |
2678 | |||
2679 | q->mq_map = set->mq_map; | ||
2680 | 2817 | ||
2681 | blk_mq_realloc_hw_ctxs(set, q); | 2818 | blk_mq_realloc_hw_ctxs(set, q); |
2682 | if (!q->nr_hw_queues) | 2819 | if (!q->nr_hw_queues) |
@@ -2685,12 +2822,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2685 | INIT_WORK(&q->timeout_work, blk_mq_timeout_work); | 2822 | INIT_WORK(&q->timeout_work, blk_mq_timeout_work); |
2686 | blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); | 2823 | blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); |
2687 | 2824 | ||
2688 | q->nr_queues = nr_cpu_ids; | 2825 | q->tag_set = set; |
2689 | 2826 | ||
2690 | q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; | 2827 | q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; |
2828 | if (set->nr_maps > HCTX_TYPE_POLL && | ||
2829 | set->map[HCTX_TYPE_POLL].nr_queues) | ||
2830 | blk_queue_flag_set(QUEUE_FLAG_POLL, q); | ||
2691 | 2831 | ||
2692 | if (!(set->flags & BLK_MQ_F_SG_MERGE)) | 2832 | if (!(set->flags & BLK_MQ_F_SG_MERGE)) |
2693 | queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); | 2833 | blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); |
2694 | 2834 | ||
2695 | q->sg_reserved_size = INT_MAX; | 2835 | q->sg_reserved_size = INT_MAX; |
2696 | 2836 | ||
@@ -2699,8 +2839,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2699 | spin_lock_init(&q->requeue_lock); | 2839 | spin_lock_init(&q->requeue_lock); |
2700 | 2840 | ||
2701 | blk_queue_make_request(q, blk_mq_make_request); | 2841 | blk_queue_make_request(q, blk_mq_make_request); |
2702 | if (q->mq_ops->poll) | ||
2703 | q->poll_fn = blk_mq_poll; | ||
2704 | 2842 | ||
2705 | /* | 2843 | /* |
2706 | * Do this after blk_queue_make_request() overrides it... | 2844 | * Do this after blk_queue_make_request() overrides it... |
@@ -2712,9 +2850,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2712 | */ | 2850 | */ |
2713 | q->poll_nsec = -1; | 2851 | q->poll_nsec = -1; |
2714 | 2852 | ||
2715 | if (set->ops->complete) | ||
2716 | blk_queue_softirq_done(q, set->ops->complete); | ||
2717 | |||
2718 | blk_mq_init_cpu_queues(q, set->nr_hw_queues); | 2853 | blk_mq_init_cpu_queues(q, set->nr_hw_queues); |
2719 | blk_mq_add_queue_tag_set(set, q); | 2854 | blk_mq_add_queue_tag_set(set, q); |
2720 | blk_mq_map_swqueue(q); | 2855 | blk_mq_map_swqueue(q); |
@@ -2731,8 +2866,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2731 | 2866 | ||
2732 | err_hctxs: | 2867 | err_hctxs: |
2733 | kfree(q->queue_hw_ctx); | 2868 | kfree(q->queue_hw_ctx); |
2734 | err_percpu: | 2869 | err_sys_init: |
2735 | free_percpu(q->queue_ctx); | 2870 | blk_mq_sysfs_deinit(q); |
2736 | err_exit: | 2871 | err_exit: |
2737 | q->mq_ops = NULL; | 2872 | q->mq_ops = NULL; |
2738 | return ERR_PTR(-ENOMEM); | 2873 | return ERR_PTR(-ENOMEM); |
@@ -2801,7 +2936,9 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) | |||
2801 | 2936 | ||
2802 | static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | 2937 | static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) |
2803 | { | 2938 | { |
2804 | if (set->ops->map_queues) { | 2939 | if (set->ops->map_queues && !is_kdump_kernel()) { |
2940 | int i; | ||
2941 | |||
2805 | /* | 2942 | /* |
2806 | * transport .map_queues is usually done in the following | 2943 | * transport .map_queues is usually done in the following |
2807 | * way: | 2944 | * way: |
@@ -2809,18 +2946,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | |||
2809 | * for (queue = 0; queue < set->nr_hw_queues; queue++) { | 2946 | * for (queue = 0; queue < set->nr_hw_queues; queue++) { |
2810 | * mask = get_cpu_mask(queue) | 2947 | * mask = get_cpu_mask(queue) |
2811 | * for_each_cpu(cpu, mask) | 2948 | * for_each_cpu(cpu, mask) |
2812 | * set->mq_map[cpu] = queue; | 2949 | * set->map[x].mq_map[cpu] = queue; |
2813 | * } | 2950 | * } |
2814 | * | 2951 | * |
2815 | * When we need to remap, the table has to be cleared for | 2952 | * When we need to remap, the table has to be cleared for |
2816 | * killing stale mapping since one CPU may not be mapped | 2953 | * killing stale mapping since one CPU may not be mapped |
2817 | * to any hw queue. | 2954 | * to any hw queue. |
2818 | */ | 2955 | */ |
2819 | blk_mq_clear_mq_map(set); | 2956 | for (i = 0; i < set->nr_maps; i++) |
2957 | blk_mq_clear_mq_map(&set->map[i]); | ||
2820 | 2958 | ||
2821 | return set->ops->map_queues(set); | 2959 | return set->ops->map_queues(set); |
2822 | } else | 2960 | } else { |
2823 | return blk_mq_map_queues(set); | 2961 | BUG_ON(set->nr_maps > 1); |
2962 | return blk_mq_map_queues(&set->map[0]); | ||
2963 | } | ||
2824 | } | 2964 | } |
2825 | 2965 | ||
2826 | /* | 2966 | /* |
@@ -2831,7 +2971,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | |||
2831 | */ | 2971 | */ |
2832 | int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | 2972 | int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) |
2833 | { | 2973 | { |
2834 | int ret; | 2974 | int i, ret; |
2835 | 2975 | ||
2836 | BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); | 2976 | BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); |
2837 | 2977 | ||
@@ -2854,6 +2994,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
2854 | set->queue_depth = BLK_MQ_MAX_DEPTH; | 2994 | set->queue_depth = BLK_MQ_MAX_DEPTH; |
2855 | } | 2995 | } |
2856 | 2996 | ||
2997 | if (!set->nr_maps) | ||
2998 | set->nr_maps = 1; | ||
2999 | else if (set->nr_maps > HCTX_MAX_TYPES) | ||
3000 | return -EINVAL; | ||
3001 | |||
2857 | /* | 3002 | /* |
2858 | * If a crashdump is active, then we are potentially in a very | 3003 | * If a crashdump is active, then we are potentially in a very |
2859 | * memory constrained environment. Limit us to 1 queue and | 3004 | * memory constrained environment. Limit us to 1 queue and |
@@ -2861,24 +3006,30 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
2861 | */ | 3006 | */ |
2862 | if (is_kdump_kernel()) { | 3007 | if (is_kdump_kernel()) { |
2863 | set->nr_hw_queues = 1; | 3008 | set->nr_hw_queues = 1; |
3009 | set->nr_maps = 1; | ||
2864 | set->queue_depth = min(64U, set->queue_depth); | 3010 | set->queue_depth = min(64U, set->queue_depth); |
2865 | } | 3011 | } |
2866 | /* | 3012 | /* |
2867 | * There is no use for more h/w queues than cpus. | 3013 | * There is no use for more h/w queues than cpus if we just have |
3014 | * a single map | ||
2868 | */ | 3015 | */ |
2869 | if (set->nr_hw_queues > nr_cpu_ids) | 3016 | if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) |
2870 | set->nr_hw_queues = nr_cpu_ids; | 3017 | set->nr_hw_queues = nr_cpu_ids; |
2871 | 3018 | ||
2872 | set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), | 3019 | set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), |
2873 | GFP_KERNEL, set->numa_node); | 3020 | GFP_KERNEL, set->numa_node); |
2874 | if (!set->tags) | 3021 | if (!set->tags) |
2875 | return -ENOMEM; | 3022 | return -ENOMEM; |
2876 | 3023 | ||
2877 | ret = -ENOMEM; | 3024 | ret = -ENOMEM; |
2878 | set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), | 3025 | for (i = 0; i < set->nr_maps; i++) { |
2879 | GFP_KERNEL, set->numa_node); | 3026 | set->map[i].mq_map = kcalloc_node(nr_cpu_ids, |
2880 | if (!set->mq_map) | 3027 | sizeof(set->map[i].mq_map[0]), |
2881 | goto out_free_tags; | 3028 | GFP_KERNEL, set->numa_node); |
3029 | if (!set->map[i].mq_map) | ||
3030 | goto out_free_mq_map; | ||
3031 | set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; | ||
3032 | } | ||
2882 | 3033 | ||
2883 | ret = blk_mq_update_queue_map(set); | 3034 | ret = blk_mq_update_queue_map(set); |
2884 | if (ret) | 3035 | if (ret) |
@@ -2894,9 +3045,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
2894 | return 0; | 3045 | return 0; |
2895 | 3046 | ||
2896 | out_free_mq_map: | 3047 | out_free_mq_map: |
2897 | kfree(set->mq_map); | 3048 | for (i = 0; i < set->nr_maps; i++) { |
2898 | set->mq_map = NULL; | 3049 | kfree(set->map[i].mq_map); |
2899 | out_free_tags: | 3050 | set->map[i].mq_map = NULL; |
3051 | } | ||
2900 | kfree(set->tags); | 3052 | kfree(set->tags); |
2901 | set->tags = NULL; | 3053 | set->tags = NULL; |
2902 | return ret; | 3054 | return ret; |
@@ -2905,13 +3057,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set); | |||
2905 | 3057 | ||
2906 | void blk_mq_free_tag_set(struct blk_mq_tag_set *set) | 3058 | void blk_mq_free_tag_set(struct blk_mq_tag_set *set) |
2907 | { | 3059 | { |
2908 | int i; | 3060 | int i, j; |
2909 | 3061 | ||
2910 | for (i = 0; i < nr_cpu_ids; i++) | 3062 | for (i = 0; i < nr_hw_queues(set); i++) |
2911 | blk_mq_free_map_and_requests(set, i); | 3063 | blk_mq_free_map_and_requests(set, i); |
2912 | 3064 | ||
2913 | kfree(set->mq_map); | 3065 | for (j = 0; j < set->nr_maps; j++) { |
2914 | set->mq_map = NULL; | 3066 | kfree(set->map[j].mq_map); |
3067 | set->map[j].mq_map = NULL; | ||
3068 | } | ||
2915 | 3069 | ||
2916 | kfree(set->tags); | 3070 | kfree(set->tags); |
2917 | set->tags = NULL; | 3071 | set->tags = NULL; |
@@ -3037,7 +3191,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, | |||
3037 | 3191 | ||
3038 | lockdep_assert_held(&set->tag_list_lock); | 3192 | lockdep_assert_held(&set->tag_list_lock); |
3039 | 3193 | ||
3040 | if (nr_hw_queues > nr_cpu_ids) | 3194 | if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) |
3041 | nr_hw_queues = nr_cpu_ids; | 3195 | nr_hw_queues = nr_cpu_ids; |
3042 | if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) | 3196 | if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) |
3043 | return; | 3197 | return; |
@@ -3072,7 +3226,7 @@ fallback: | |||
3072 | pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", | 3226 | pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", |
3073 | nr_hw_queues, prev_nr_hw_queues); | 3227 | nr_hw_queues, prev_nr_hw_queues); |
3074 | set->nr_hw_queues = prev_nr_hw_queues; | 3228 | set->nr_hw_queues = prev_nr_hw_queues; |
3075 | blk_mq_map_queues(set); | 3229 | blk_mq_map_queues(&set->map[0]); |
3076 | goto fallback; | 3230 | goto fallback; |
3077 | } | 3231 | } |
3078 | blk_mq_map_swqueue(q); | 3232 | blk_mq_map_swqueue(q); |
@@ -3179,15 +3333,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, | |||
3179 | return false; | 3333 | return false; |
3180 | 3334 | ||
3181 | /* | 3335 | /* |
3182 | * poll_nsec can be: | 3336 | * If we get here, hybrid polling is enabled. Hence poll_nsec can be: |
3183 | * | 3337 | * |
3184 | * -1: don't ever hybrid sleep | ||
3185 | * 0: use half of prev avg | 3338 | * 0: use half of prev avg |
3186 | * >0: use this specific value | 3339 | * >0: use this specific value |
3187 | */ | 3340 | */ |
3188 | if (q->poll_nsec == -1) | 3341 | if (q->poll_nsec > 0) |
3189 | return false; | ||
3190 | else if (q->poll_nsec > 0) | ||
3191 | nsecs = q->poll_nsec; | 3342 | nsecs = q->poll_nsec; |
3192 | else | 3343 | else |
3193 | nsecs = blk_mq_poll_nsecs(q, hctx, rq); | 3344 | nsecs = blk_mq_poll_nsecs(q, hctx, rq); |
@@ -3224,11 +3375,57 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, | |||
3224 | return true; | 3375 | return true; |
3225 | } | 3376 | } |
3226 | 3377 | ||
3227 | static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) | 3378 | static bool blk_mq_poll_hybrid(struct request_queue *q, |
3379 | struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) | ||
3228 | { | 3380 | { |
3229 | struct request_queue *q = hctx->queue; | 3381 | struct request *rq; |
3382 | |||
3383 | if (q->poll_nsec == -1) | ||
3384 | return false; | ||
3385 | |||
3386 | if (!blk_qc_t_is_internal(cookie)) | ||
3387 | rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); | ||
3388 | else { | ||
3389 | rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); | ||
3390 | /* | ||
3391 | * With scheduling, if the request has completed, we'll | ||
3392 | * get a NULL return here, as we clear the sched tag when | ||
3393 | * that happens. The request still remains valid, like always, | ||
3394 | * so we should be safe with just the NULL check. | ||
3395 | */ | ||
3396 | if (!rq) | ||
3397 | return false; | ||
3398 | } | ||
3399 | |||
3400 | return blk_mq_poll_hybrid_sleep(q, hctx, rq); | ||
3401 | } | ||
3402 | |||
3403 | /** | ||
3404 | * blk_poll - poll for IO completions | ||
3405 | * @q: the queue | ||
3406 | * @cookie: cookie passed back at IO submission time | ||
3407 | * @spin: whether to spin for completions | ||
3408 | * | ||
3409 | * Description: | ||
3410 | * Poll for completions on the passed in queue. Returns number of | ||
3411 | * completed entries found. If @spin is true, then blk_poll will continue | ||
3412 | * looping until at least one completion is found, unless the task is | ||
3413 | * otherwise marked running (or we need to reschedule). | ||
3414 | */ | ||
3415 | int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) | ||
3416 | { | ||
3417 | struct blk_mq_hw_ctx *hctx; | ||
3230 | long state; | 3418 | long state; |
3231 | 3419 | ||
3420 | if (!blk_qc_t_valid(cookie) || | ||
3421 | !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) | ||
3422 | return 0; | ||
3423 | |||
3424 | if (current->plug) | ||
3425 | blk_flush_plug_list(current->plug, false); | ||
3426 | |||
3427 | hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; | ||
3428 | |||
3232 | /* | 3429 | /* |
3233 | * If we sleep, have the caller restart the poll loop to reset | 3430 | * If we sleep, have the caller restart the poll loop to reset |
3234 | * the state. Like for the other success return cases, the | 3431 | * the state. Like for the other success return cases, the |
@@ -3236,63 +3433,44 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) | |||
3236 | * the IO isn't complete, we'll get called again and will go | 3433 | * the IO isn't complete, we'll get called again and will go |
3237 | * straight to the busy poll loop. | 3434 | * straight to the busy poll loop. |
3238 | */ | 3435 | */ |
3239 | if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) | 3436 | if (blk_mq_poll_hybrid(q, hctx, cookie)) |
3240 | return true; | 3437 | return 1; |
3241 | 3438 | ||
3242 | hctx->poll_considered++; | 3439 | hctx->poll_considered++; |
3243 | 3440 | ||
3244 | state = current->state; | 3441 | state = current->state; |
3245 | while (!need_resched()) { | 3442 | do { |
3246 | int ret; | 3443 | int ret; |
3247 | 3444 | ||
3248 | hctx->poll_invoked++; | 3445 | hctx->poll_invoked++; |
3249 | 3446 | ||
3250 | ret = q->mq_ops->poll(hctx, rq->tag); | 3447 | ret = q->mq_ops->poll(hctx); |
3251 | if (ret > 0) { | 3448 | if (ret > 0) { |
3252 | hctx->poll_success++; | 3449 | hctx->poll_success++; |
3253 | set_current_state(TASK_RUNNING); | 3450 | __set_current_state(TASK_RUNNING); |
3254 | return true; | 3451 | return ret; |
3255 | } | 3452 | } |
3256 | 3453 | ||
3257 | if (signal_pending_state(state, current)) | 3454 | if (signal_pending_state(state, current)) |
3258 | set_current_state(TASK_RUNNING); | 3455 | __set_current_state(TASK_RUNNING); |
3259 | 3456 | ||
3260 | if (current->state == TASK_RUNNING) | 3457 | if (current->state == TASK_RUNNING) |
3261 | return true; | 3458 | return 1; |
3262 | if (ret < 0) | 3459 | if (ret < 0 || !spin) |
3263 | break; | 3460 | break; |
3264 | cpu_relax(); | 3461 | cpu_relax(); |
3265 | } | 3462 | } while (!need_resched()); |
3266 | 3463 | ||
3267 | __set_current_state(TASK_RUNNING); | 3464 | __set_current_state(TASK_RUNNING); |
3268 | return false; | 3465 | return 0; |
3269 | } | 3466 | } |
3467 | EXPORT_SYMBOL_GPL(blk_poll); | ||
3270 | 3468 | ||
3271 | static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) | 3469 | unsigned int blk_mq_rq_cpu(struct request *rq) |
3272 | { | 3470 | { |
3273 | struct blk_mq_hw_ctx *hctx; | 3471 | return rq->mq_ctx->cpu; |
3274 | struct request *rq; | ||
3275 | |||
3276 | if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) | ||
3277 | return false; | ||
3278 | |||
3279 | hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; | ||
3280 | if (!blk_qc_t_is_internal(cookie)) | ||
3281 | rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); | ||
3282 | else { | ||
3283 | rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); | ||
3284 | /* | ||
3285 | * With scheduling, if the request has completed, we'll | ||
3286 | * get a NULL return here, as we clear the sched tag when | ||
3287 | * that happens. The request still remains valid, like always, | ||
3288 | * so we should be safe with just the NULL check. | ||
3289 | */ | ||
3290 | if (!rq) | ||
3291 | return false; | ||
3292 | } | ||
3293 | |||
3294 | return __blk_mq_poll(hctx, rq); | ||
3295 | } | 3472 | } |
3473 | EXPORT_SYMBOL(blk_mq_rq_cpu); | ||
3296 | 3474 | ||
3297 | static int __init blk_mq_init(void) | 3475 | static int __init blk_mq_init(void) |
3298 | { | 3476 | { |
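
blk_poll() is now an exported entry point that returns the number of completions found and takes an explicit spin argument. A caller in the spirit of the direct I/O path could look like the sketch below; the bi_private handshake and the mydrv names are illustrative, not taken from this patch.

static void mydrv_sync_end_io(struct bio *bio)
{
        struct task_struct *waiter = bio->bi_private;

        WRITE_ONCE(bio->bi_private, NULL);
        wake_up_process(waiter);
}

static int mydrv_sync_submit(struct block_device *bdev, struct bio *bio)
{
        struct request_queue *q = bdev_get_queue(bdev);
        blk_qc_t qc;

        bio->bi_private = current;
        bio->bi_end_io = mydrv_sync_end_io;
        bio->bi_opf |= REQ_HIPRI;       /* steer to the poll map when one exists */

        qc = submit_bio(bio);
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio->bi_private))
                        break;
                /* spin=true: keep polling until at least one completion is found */
                if (blk_poll(q, qc, true) <= 0)
                        io_schedule();
        }
        __set_current_state(TASK_RUNNING);
        return blk_status_to_errno(bio->bi_status);
}
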
diff --git a/block/blk-mq.h b/block/blk-mq.h index 9497b47e2526..d943d46b0785 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h | |||
@@ -7,17 +7,22 @@ | |||
7 | 7 | ||
8 | struct blk_mq_tag_set; | 8 | struct blk_mq_tag_set; |
9 | 9 | ||
10 | struct blk_mq_ctxs { | ||
11 | struct kobject kobj; | ||
12 | struct blk_mq_ctx __percpu *queue_ctx; | ||
13 | }; | ||
14 | |||
10 | /** | 15 | /** |
11 | * struct blk_mq_ctx - State for a software queue facing the submitting CPUs | 16 | * struct blk_mq_ctx - State for a software queue facing the submitting CPUs |
12 | */ | 17 | */ |
13 | struct blk_mq_ctx { | 18 | struct blk_mq_ctx { |
14 | struct { | 19 | struct { |
15 | spinlock_t lock; | 20 | spinlock_t lock; |
16 | struct list_head rq_list; | 21 | struct list_head rq_lists[HCTX_MAX_TYPES]; |
17 | } ____cacheline_aligned_in_smp; | 22 | } ____cacheline_aligned_in_smp; |
18 | 23 | ||
19 | unsigned int cpu; | 24 | unsigned int cpu; |
20 | unsigned int index_hw; | 25 | unsigned short index_hw[HCTX_MAX_TYPES]; |
21 | 26 | ||
22 | /* incremented at dispatch time */ | 27 | /* incremented at dispatch time */ |
23 | unsigned long rq_dispatched[2]; | 28 | unsigned long rq_dispatched[2]; |
@@ -27,6 +32,7 @@ struct blk_mq_ctx { | |||
27 | unsigned long ____cacheline_aligned_in_smp rq_completed[2]; | 32 | unsigned long ____cacheline_aligned_in_smp rq_completed[2]; |
28 | 33 | ||
29 | struct request_queue *queue; | 34 | struct request_queue *queue; |
35 | struct blk_mq_ctxs *ctxs; | ||
30 | struct kobject kobj; | 36 | struct kobject kobj; |
31 | } ____cacheline_aligned_in_smp; | 37 | } ____cacheline_aligned_in_smp; |
32 | 38 | ||
@@ -62,20 +68,55 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); | |||
62 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | 68 | void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, |
63 | struct list_head *list); | 69 | struct list_head *list); |
64 | 70 | ||
65 | /* Used by blk_insert_cloned_request() to issue request directly */ | 71 | blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, |
66 | blk_status_t blk_mq_request_issue_directly(struct request *rq); | 72 | struct request *rq, |
73 | blk_qc_t *cookie, | ||
74 | bool bypass, bool last); | ||
67 | void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, | 75 | void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, |
68 | struct list_head *list); | 76 | struct list_head *list); |
69 | 77 | ||
70 | /* | 78 | /* |
71 | * CPU -> queue mappings | 79 | * CPU -> queue mappings |
72 | */ | 80 | */ |
73 | extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); | 81 | extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); |
82 | |||
83 | /* | ||
84 | * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue | ||
85 | * @q: request queue | ||
86 | * @type: the hctx type index | ||
87 | * @cpu: CPU | ||
88 | */ | ||
89 | static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, | ||
90 | enum hctx_type type, | ||
91 | unsigned int cpu) | ||
92 | { | ||
93 | return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; | ||
94 | } | ||
74 | 95 | ||
96 | /* | ||
97 | * blk_mq_map_queue() - map (cmd_flags,cpu) to hardware queue | ||
98 | * @q: request queue | ||
99 | * @flags: request command flags | ||
100 | * @cpu: CPU | ||
101 | */ | ||
75 | static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, | 102 | static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, |
76 | int cpu) | 103 | unsigned int flags, |
104 | unsigned int cpu) | ||
77 | { | 105 | { |
78 | return q->queue_hw_ctx[q->mq_map[cpu]]; | 106 | enum hctx_type type = HCTX_TYPE_DEFAULT; |
107 | |||
108 | if ((flags & REQ_HIPRI) && | ||
109 | q->tag_set->nr_maps > HCTX_TYPE_POLL && | ||
110 | q->tag_set->map[HCTX_TYPE_POLL].nr_queues && | ||
111 | test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) | ||
112 | type = HCTX_TYPE_POLL; | ||
113 | |||
114 | else if (((flags & REQ_OP_MASK) == REQ_OP_READ) && | ||
115 | q->tag_set->nr_maps > HCTX_TYPE_READ && | ||
116 | q->tag_set->map[HCTX_TYPE_READ].nr_queues) | ||
117 | type = HCTX_TYPE_READ; | ||
118 | |||
119 | return blk_mq_map_queue_type(q, type, cpu); | ||
79 | } | 120 | } |
80 | 121 | ||
81 | /* | 122 | /* |
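
Put differently, blk_mq_map_queue() now derives the hctx type from the command flags before indexing the matching per-type map; a quick illustrative call (flags and CPU chosen only for the example):

        struct blk_mq_hw_ctx *hctx;

        /* a polled read on a queue that actually has a poll map: */
        hctx = blk_mq_map_queue(q, REQ_OP_READ | REQ_HIPRI, raw_smp_processor_id());
        /*
         * -> HCTX_TYPE_POLL; without REQ_HIPRI it would fall back to
         * HCTX_TYPE_READ (if a read map was provided) and otherwise to
         * HCTX_TYPE_DEFAULT.
         */
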
@@ -126,6 +167,7 @@ struct blk_mq_alloc_data { | |||
126 | struct request_queue *q; | 167 | struct request_queue *q; |
127 | blk_mq_req_flags_t flags; | 168 | blk_mq_req_flags_t flags; |
128 | unsigned int shallow_depth; | 169 | unsigned int shallow_depth; |
170 | unsigned int cmd_flags; | ||
129 | 171 | ||
130 | /* input & output parameter */ | 172 | /* input & output parameter */ |
131 | struct blk_mq_ctx *ctx; | 173 | struct blk_mq_ctx *ctx; |
@@ -150,8 +192,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) | |||
150 | return hctx->nr_ctx && hctx->tags; | 192 | return hctx->nr_ctx && hctx->tags; |
151 | } | 193 | } |
152 | 194 | ||
153 | void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, | 195 | unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); |
154 | unsigned int inflight[2]); | ||
155 | void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, | 196 | void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, |
156 | unsigned int inflight[2]); | 197 | unsigned int inflight[2]); |
157 | 198 | ||
@@ -195,21 +236,18 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, | |||
195 | 236 | ||
196 | static inline void blk_mq_put_driver_tag(struct request *rq) | 237 | static inline void blk_mq_put_driver_tag(struct request *rq) |
197 | { | 238 | { |
198 | struct blk_mq_hw_ctx *hctx; | ||
199 | |||
200 | if (rq->tag == -1 || rq->internal_tag == -1) | 239 | if (rq->tag == -1 || rq->internal_tag == -1) |
201 | return; | 240 | return; |
202 | 241 | ||
203 | hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); | 242 | __blk_mq_put_driver_tag(rq->mq_hctx, rq); |
204 | __blk_mq_put_driver_tag(hctx, rq); | ||
205 | } | 243 | } |
206 | 244 | ||
207 | static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) | 245 | static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) |
208 | { | 246 | { |
209 | int cpu; | 247 | int cpu; |
210 | 248 | ||
211 | for_each_possible_cpu(cpu) | 249 | for_each_possible_cpu(cpu) |
212 | set->mq_map[cpu] = 0; | 250 | qmap->mq_map[cpu] = 0; |
213 | } | 251 | } |
214 | 252 | ||
215 | #endif | 253 | #endif |
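
Since blk_mq_clear_mq_map() and blk_mq_map_queues() now operate on a single struct blk_mq_queue_map, a driver ->map_queues() callback loops over its maps. A generic-spread sketch (a real driver would usually prefer an IRQ-affinity aware helper for at least the default map):

static int mydrv_map_queues(struct blk_mq_tag_set *set)
{
        int i, err;

        for (i = 0; i < set->nr_maps; i++) {
                /* reset, then spread CPUs across set->map[i].nr_queues */
                blk_mq_clear_mq_map(&set->map[i]);
                err = blk_mq_map_queues(&set->map[i]);
                if (err)
                        return err;
        }
        return 0;
}
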
diff --git a/block/blk-pm.c b/block/blk-pm.c index f8fdae01bea2..0a028c189897 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c | |||
@@ -89,12 +89,12 @@ int blk_pre_runtime_suspend(struct request_queue *q) | |||
89 | /* Switch q_usage_counter back to per-cpu mode. */ | 89 | /* Switch q_usage_counter back to per-cpu mode. */ |
90 | blk_mq_unfreeze_queue(q); | 90 | blk_mq_unfreeze_queue(q); |
91 | 91 | ||
92 | spin_lock_irq(q->queue_lock); | 92 | spin_lock_irq(&q->queue_lock); |
93 | if (ret < 0) | 93 | if (ret < 0) |
94 | pm_runtime_mark_last_busy(q->dev); | 94 | pm_runtime_mark_last_busy(q->dev); |
95 | else | 95 | else |
96 | q->rpm_status = RPM_SUSPENDING; | 96 | q->rpm_status = RPM_SUSPENDING; |
97 | spin_unlock_irq(q->queue_lock); | 97 | spin_unlock_irq(&q->queue_lock); |
98 | 98 | ||
99 | if (ret) | 99 | if (ret) |
100 | blk_clear_pm_only(q); | 100 | blk_clear_pm_only(q); |
@@ -121,14 +121,14 @@ void blk_post_runtime_suspend(struct request_queue *q, int err) | |||
121 | if (!q->dev) | 121 | if (!q->dev) |
122 | return; | 122 | return; |
123 | 123 | ||
124 | spin_lock_irq(q->queue_lock); | 124 | spin_lock_irq(&q->queue_lock); |
125 | if (!err) { | 125 | if (!err) { |
126 | q->rpm_status = RPM_SUSPENDED; | 126 | q->rpm_status = RPM_SUSPENDED; |
127 | } else { | 127 | } else { |
128 | q->rpm_status = RPM_ACTIVE; | 128 | q->rpm_status = RPM_ACTIVE; |
129 | pm_runtime_mark_last_busy(q->dev); | 129 | pm_runtime_mark_last_busy(q->dev); |
130 | } | 130 | } |
131 | spin_unlock_irq(q->queue_lock); | 131 | spin_unlock_irq(&q->queue_lock); |
132 | 132 | ||
133 | if (err) | 133 | if (err) |
134 | blk_clear_pm_only(q); | 134 | blk_clear_pm_only(q); |
@@ -151,9 +151,9 @@ void blk_pre_runtime_resume(struct request_queue *q) | |||
151 | if (!q->dev) | 151 | if (!q->dev) |
152 | return; | 152 | return; |
153 | 153 | ||
154 | spin_lock_irq(q->queue_lock); | 154 | spin_lock_irq(&q->queue_lock); |
155 | q->rpm_status = RPM_RESUMING; | 155 | q->rpm_status = RPM_RESUMING; |
156 | spin_unlock_irq(q->queue_lock); | 156 | spin_unlock_irq(&q->queue_lock); |
157 | } | 157 | } |
158 | EXPORT_SYMBOL(blk_pre_runtime_resume); | 158 | EXPORT_SYMBOL(blk_pre_runtime_resume); |
159 | 159 | ||
@@ -176,7 +176,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err) | |||
176 | if (!q->dev) | 176 | if (!q->dev) |
177 | return; | 177 | return; |
178 | 178 | ||
179 | spin_lock_irq(q->queue_lock); | 179 | spin_lock_irq(&q->queue_lock); |
180 | if (!err) { | 180 | if (!err) { |
181 | q->rpm_status = RPM_ACTIVE; | 181 | q->rpm_status = RPM_ACTIVE; |
182 | pm_runtime_mark_last_busy(q->dev); | 182 | pm_runtime_mark_last_busy(q->dev); |
@@ -184,7 +184,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err) | |||
184 | } else { | 184 | } else { |
185 | q->rpm_status = RPM_SUSPENDED; | 185 | q->rpm_status = RPM_SUSPENDED; |
186 | } | 186 | } |
187 | spin_unlock_irq(q->queue_lock); | 187 | spin_unlock_irq(&q->queue_lock); |
188 | 188 | ||
189 | if (!err) | 189 | if (!err) |
190 | blk_clear_pm_only(q); | 190 | blk_clear_pm_only(q); |
@@ -207,10 +207,10 @@ EXPORT_SYMBOL(blk_post_runtime_resume); | |||
207 | */ | 207 | */ |
208 | void blk_set_runtime_active(struct request_queue *q) | 208 | void blk_set_runtime_active(struct request_queue *q) |
209 | { | 209 | { |
210 | spin_lock_irq(q->queue_lock); | 210 | spin_lock_irq(&q->queue_lock); |
211 | q->rpm_status = RPM_ACTIVE; | 211 | q->rpm_status = RPM_ACTIVE; |
212 | pm_runtime_mark_last_busy(q->dev); | 212 | pm_runtime_mark_last_busy(q->dev); |
213 | pm_request_autosuspend(q->dev); | 213 | pm_request_autosuspend(q->dev); |
214 | spin_unlock_irq(q->queue_lock); | 214 | spin_unlock_irq(&q->queue_lock); |
215 | } | 215 | } |
216 | EXPORT_SYMBOL(blk_set_runtime_active); | 216 | EXPORT_SYMBOL(blk_set_runtime_active); |
diff --git a/block/blk-pm.h b/block/blk-pm.h index a8564ea72a41..ea5507d23e75 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h | |||
@@ -21,7 +21,7 @@ static inline void blk_pm_mark_last_busy(struct request *rq) | |||
21 | 21 | ||
22 | static inline void blk_pm_requeue_request(struct request *rq) | 22 | static inline void blk_pm_requeue_request(struct request *rq) |
23 | { | 23 | { |
24 | lockdep_assert_held(rq->q->queue_lock); | 24 | lockdep_assert_held(&rq->q->queue_lock); |
25 | 25 | ||
26 | if (rq->q->dev && !(rq->rq_flags & RQF_PM)) | 26 | if (rq->q->dev && !(rq->rq_flags & RQF_PM)) |
27 | rq->q->nr_pending--; | 27 | rq->q->nr_pending--; |
@@ -30,7 +30,7 @@ static inline void blk_pm_requeue_request(struct request *rq) | |||
30 | static inline void blk_pm_add_request(struct request_queue *q, | 30 | static inline void blk_pm_add_request(struct request_queue *q, |
31 | struct request *rq) | 31 | struct request *rq) |
32 | { | 32 | { |
33 | lockdep_assert_held(q->queue_lock); | 33 | lockdep_assert_held(&q->queue_lock); |
34 | 34 | ||
35 | if (q->dev && !(rq->rq_flags & RQF_PM)) | 35 | if (q->dev && !(rq->rq_flags & RQF_PM)) |
36 | q->nr_pending++; | 36 | q->nr_pending++; |
@@ -38,7 +38,7 @@ static inline void blk_pm_add_request(struct request_queue *q, | |||
38 | 38 | ||
39 | static inline void blk_pm_put_request(struct request *rq) | 39 | static inline void blk_pm_put_request(struct request *rq) |
40 | { | 40 | { |
41 | lockdep_assert_held(rq->q->queue_lock); | 41 | lockdep_assert_held(&rq->q->queue_lock); |
42 | 42 | ||
43 | if (rq->q->dev && !(rq->rq_flags & RQF_PM)) | 43 | if (rq->q->dev && !(rq->rq_flags & RQF_PM)) |
44 | --rq->q->nr_pending; | 44 | --rq->q->nr_pending; |
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 0005dfd568dd..d169d7188fa6 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c | |||
@@ -27,75 +27,67 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) | |||
27 | return atomic_inc_below(&rq_wait->inflight, limit); | 27 | return atomic_inc_below(&rq_wait->inflight, limit); |
28 | } | 28 | } |
29 | 29 | ||
30 | void rq_qos_cleanup(struct request_queue *q, struct bio *bio) | 30 | void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio) |
31 | { | 31 | { |
32 | struct rq_qos *rqos; | 32 | do { |
33 | |||
34 | for (rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
35 | if (rqos->ops->cleanup) | 33 | if (rqos->ops->cleanup) |
36 | rqos->ops->cleanup(rqos, bio); | 34 | rqos->ops->cleanup(rqos, bio); |
37 | } | 35 | rqos = rqos->next; |
36 | } while (rqos); | ||
38 | } | 37 | } |
39 | 38 | ||
40 | void rq_qos_done(struct request_queue *q, struct request *rq) | 39 | void __rq_qos_done(struct rq_qos *rqos, struct request *rq) |
41 | { | 40 | { |
42 | struct rq_qos *rqos; | 41 | do { |
43 | |||
44 | for (rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
45 | if (rqos->ops->done) | 42 | if (rqos->ops->done) |
46 | rqos->ops->done(rqos, rq); | 43 | rqos->ops->done(rqos, rq); |
47 | } | 44 | rqos = rqos->next; |
45 | } while (rqos); | ||
48 | } | 46 | } |
49 | 47 | ||
50 | void rq_qos_issue(struct request_queue *q, struct request *rq) | 48 | void __rq_qos_issue(struct rq_qos *rqos, struct request *rq) |
51 | { | 49 | { |
52 | struct rq_qos *rqos; | 50 | do { |
53 | |||
54 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
55 | if (rqos->ops->issue) | 51 | if (rqos->ops->issue) |
56 | rqos->ops->issue(rqos, rq); | 52 | rqos->ops->issue(rqos, rq); |
57 | } | 53 | rqos = rqos->next; |
54 | } while (rqos); | ||
58 | } | 55 | } |
59 | 56 | ||
60 | void rq_qos_requeue(struct request_queue *q, struct request *rq) | 57 | void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq) |
61 | { | 58 | { |
62 | struct rq_qos *rqos; | 59 | do { |
63 | |||
64 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
65 | if (rqos->ops->requeue) | 60 | if (rqos->ops->requeue) |
66 | rqos->ops->requeue(rqos, rq); | 61 | rqos->ops->requeue(rqos, rq); |
67 | } | 62 | rqos = rqos->next; |
63 | } while (rqos); | ||
68 | } | 64 | } |
69 | 65 | ||
70 | void rq_qos_throttle(struct request_queue *q, struct bio *bio, | 66 | void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio) |
71 | spinlock_t *lock) | ||
72 | { | 67 | { |
73 | struct rq_qos *rqos; | 68 | do { |
74 | |||
75 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
76 | if (rqos->ops->throttle) | 69 | if (rqos->ops->throttle) |
77 | rqos->ops->throttle(rqos, bio, lock); | 70 | rqos->ops->throttle(rqos, bio); |
78 | } | 71 | rqos = rqos->next; |
72 | } while (rqos); | ||
79 | } | 73 | } |
80 | 74 | ||
81 | void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) | 75 | void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) |
82 | { | 76 | { |
83 | struct rq_qos *rqos; | 77 | do { |
84 | |||
85 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
86 | if (rqos->ops->track) | 78 | if (rqos->ops->track) |
87 | rqos->ops->track(rqos, rq, bio); | 79 | rqos->ops->track(rqos, rq, bio); |
88 | } | 80 | rqos = rqos->next; |
81 | } while (rqos); | ||
89 | } | 82 | } |
90 | 83 | ||
91 | void rq_qos_done_bio(struct request_queue *q, struct bio *bio) | 84 | void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) |
92 | { | 85 | { |
93 | struct rq_qos *rqos; | 86 | do { |
94 | |||
95 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
96 | if (rqos->ops->done_bio) | 87 | if (rqos->ops->done_bio) |
97 | rqos->ops->done_bio(rqos, bio); | 88 | rqos->ops->done_bio(rqos, bio); |
98 | } | 89 | rqos = rqos->next; |
90 | } while (rqos); | ||
99 | } | 91 | } |
100 | 92 | ||
101 | /* | 93 | /* |
@@ -184,8 +176,96 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) | |||
184 | rq_depth_calc_max_depth(rqd); | 176 | rq_depth_calc_max_depth(rqd); |
185 | } | 177 | } |
186 | 178 | ||
179 | struct rq_qos_wait_data { | ||
180 | struct wait_queue_entry wq; | ||
181 | struct task_struct *task; | ||
182 | struct rq_wait *rqw; | ||
183 | acquire_inflight_cb_t *cb; | ||
184 | void *private_data; | ||
185 | bool got_token; | ||
186 | }; | ||
187 | |||
188 | static int rq_qos_wake_function(struct wait_queue_entry *curr, | ||
189 | unsigned int mode, int wake_flags, void *key) | ||
190 | { | ||
191 | struct rq_qos_wait_data *data = container_of(curr, | ||
192 | struct rq_qos_wait_data, | ||
193 | wq); | ||
194 | |||
195 | /* | ||
196 | * If we fail to get a budget, return -1 to interrupt the wake up loop | ||
197 | * in __wake_up_common. | ||
198 | */ | ||
199 | if (!data->cb(data->rqw, data->private_data)) | ||
200 | return -1; | ||
201 | |||
202 | data->got_token = true; | ||
203 | list_del_init(&curr->entry); | ||
204 | wake_up_process(data->task); | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * rq_qos_wait - throttle on a rqw if we need to | ||
210 | * @private_data: caller provided specific data | ||
211 | * @acquire_inflight_cb: inc the rqw->inflight counter if we can | ||
212 | * @cleanup_cb: the callback to clean up in case we race with a waker | ||
213 | * | ||
214 | * This provides a uniform place for the rq_qos users to do their throttling. | ||
215 | * Since you can end up with a lot of things sleeping at once, this manages the | ||
216 | * waking up based on the resources available. The acquire_inflight_cb should | ||
217 | * increment rqw->inflight if it is able to do so; otherwise it must return false, | ||
218 | * in which case we will sleep until room becomes available. | ||
219 | * | ||
220 | * cleanup_cb is used in case we race with a waker and need to clean up the | ||
221 | * inflight count accordingly. | ||
222 | */ | ||
223 | void rq_qos_wait(struct rq_wait *rqw, void *private_data, | ||
224 | acquire_inflight_cb_t *acquire_inflight_cb, | ||
225 | cleanup_cb_t *cleanup_cb) | ||
226 | { | ||
227 | struct rq_qos_wait_data data = { | ||
228 | .wq = { | ||
229 | .func = rq_qos_wake_function, | ||
230 | .entry = LIST_HEAD_INIT(data.wq.entry), | ||
231 | }, | ||
232 | .task = current, | ||
233 | .rqw = rqw, | ||
234 | .cb = acquire_inflight_cb, | ||
235 | .private_data = private_data, | ||
236 | }; | ||
237 | bool has_sleeper; | ||
238 | |||
239 | has_sleeper = wq_has_sleeper(&rqw->wait); | ||
240 | if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) | ||
241 | return; | ||
242 | |||
243 | prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); | ||
244 | do { | ||
245 | if (data.got_token) | ||
246 | break; | ||
247 | if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { | ||
248 | finish_wait(&rqw->wait, &data.wq); | ||
249 | |||
250 | /* | ||
251 | * We raced with rq_qos_wake_function() getting a token, | ||
252 | * which means we now have two. Put our local token | ||
253 | * and wake anyone else potentially waiting for one. | ||
254 | */ | ||
255 | if (data.got_token) | ||
256 | cleanup_cb(rqw, private_data); | ||
257 | break; | ||
258 | } | ||
259 | io_schedule(); | ||
260 | has_sleeper = false; | ||
261 | } while (1); | ||
262 | finish_wait(&rqw->wait, &data.wq); | ||
263 | } | ||
264 | |||
187 | void rq_qos_exit(struct request_queue *q) | 265 | void rq_qos_exit(struct request_queue *q) |
188 | { | 266 | { |
267 | blk_mq_debugfs_unregister_queue_rqos(q); | ||
268 | |||
189 | while (q->rq_qos) { | 269 | while (q->rq_qos) { |
190 | struct rq_qos *rqos = q->rq_qos; | 270 | struct rq_qos *rqos = q->rq_qos; |
191 | q->rq_qos = rqos->next; | 271 | q->rq_qos = rqos->next; |
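
rq_qos_wait() factors the sleep/wake pattern out of wbt, so a policy only has to provide the two callbacks. A minimal sketch, assuming a hypothetical policy that caps inflight requests at a fixed limit (the mypol names are not from this patch):

struct mypol_data {
        struct rq_qos rqos;
        struct rq_wait rqw;
        unsigned int limit;
};

static bool mypol_inflight_cb(struct rq_wait *rqw, void *private_data)
{
        struct mypol_data *pd = private_data;

        /* take a token only while below the cap */
        return rq_wait_inc_below(rqw, pd->limit);
}

static void mypol_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
        /* drop the extra token taken while racing with a waker */
        atomic_dec(&rqw->inflight);
}

static void mypol_throttle(struct rq_qos *rqos, struct bio *bio)
{
        struct mypol_data *pd = container_of(rqos, struct mypol_data, rqos);

        rq_qos_wait(&pd->rqw, pd, mypol_inflight_cb, mypol_cleanup_cb);
}

static void mypol_done(struct rq_qos *rqos, struct request *rq)
{
        struct mypol_data *pd = container_of(rqos, struct mypol_data, rqos);

        atomic_dec(&pd->rqw.inflight);
        if (wq_has_sleeper(&pd->rqw.wait))
                wake_up_all(&pd->rqw.wait);
}
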
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 32b02efbfa66..564851889550 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h | |||
@@ -7,6 +7,10 @@ | |||
7 | #include <linux/atomic.h> | 7 | #include <linux/atomic.h> |
8 | #include <linux/wait.h> | 8 | #include <linux/wait.h> |
9 | 9 | ||
10 | #include "blk-mq-debugfs.h" | ||
11 | |||
12 | struct blk_mq_debugfs_attr; | ||
13 | |||
10 | enum rq_qos_id { | 14 | enum rq_qos_id { |
11 | RQ_QOS_WBT, | 15 | RQ_QOS_WBT, |
12 | RQ_QOS_CGROUP, | 16 | RQ_QOS_CGROUP, |
@@ -22,10 +26,13 @@ struct rq_qos { | |||
22 | struct request_queue *q; | 26 | struct request_queue *q; |
23 | enum rq_qos_id id; | 27 | enum rq_qos_id id; |
24 | struct rq_qos *next; | 28 | struct rq_qos *next; |
29 | #ifdef CONFIG_BLK_DEBUG_FS | ||
30 | struct dentry *debugfs_dir; | ||
31 | #endif | ||
25 | }; | 32 | }; |
26 | 33 | ||
27 | struct rq_qos_ops { | 34 | struct rq_qos_ops { |
28 | void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); | 35 | void (*throttle)(struct rq_qos *, struct bio *); |
29 | void (*track)(struct rq_qos *, struct request *, struct bio *); | 36 | void (*track)(struct rq_qos *, struct request *, struct bio *); |
30 | void (*issue)(struct rq_qos *, struct request *); | 37 | void (*issue)(struct rq_qos *, struct request *); |
31 | void (*requeue)(struct rq_qos *, struct request *); | 38 | void (*requeue)(struct rq_qos *, struct request *); |
@@ -33,6 +40,7 @@ struct rq_qos_ops { | |||
33 | void (*done_bio)(struct rq_qos *, struct bio *); | 40 | void (*done_bio)(struct rq_qos *, struct bio *); |
34 | void (*cleanup)(struct rq_qos *, struct bio *); | 41 | void (*cleanup)(struct rq_qos *, struct bio *); |
35 | void (*exit)(struct rq_qos *); | 42 | void (*exit)(struct rq_qos *); |
43 | const struct blk_mq_debugfs_attr *debugfs_attrs; | ||
36 | }; | 44 | }; |
37 | 45 | ||
38 | struct rq_depth { | 46 | struct rq_depth { |
@@ -66,6 +74,17 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) | |||
66 | return rq_qos_id(q, RQ_QOS_CGROUP); | 74 | return rq_qos_id(q, RQ_QOS_CGROUP); |
67 | } | 75 | } |
68 | 76 | ||
77 | static inline const char *rq_qos_id_to_name(enum rq_qos_id id) | ||
78 | { | ||
79 | switch (id) { | ||
80 | case RQ_QOS_WBT: | ||
81 | return "wbt"; | ||
82 | case RQ_QOS_CGROUP: | ||
83 | return "cgroup"; | ||
84 | } | ||
85 | return "unknown"; | ||
86 | } | ||
87 | |||
69 | static inline void rq_wait_init(struct rq_wait *rq_wait) | 88 | static inline void rq_wait_init(struct rq_wait *rq_wait) |
70 | { | 89 | { |
71 | atomic_set(&rq_wait->inflight, 0); | 90 | atomic_set(&rq_wait->inflight, 0); |
@@ -76,6 +95,9 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) | |||
76 | { | 95 | { |
77 | rqos->next = q->rq_qos; | 96 | rqos->next = q->rq_qos; |
78 | q->rq_qos = rqos; | 97 | q->rq_qos = rqos; |
98 | |||
99 | if (rqos->ops->debugfs_attrs) | ||
100 | blk_mq_debugfs_register_rqos(rqos); | ||
79 | } | 101 | } |
80 | 102 | ||
81 | static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) | 103 | static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) |
@@ -91,19 +113,77 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) | |||
91 | } | 113 | } |
92 | prev = cur; | 114 | prev = cur; |
93 | } | 115 | } |
116 | |||
117 | blk_mq_debugfs_unregister_rqos(rqos); | ||
94 | } | 118 | } |
95 | 119 | ||
120 | typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); | ||
121 | typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); | ||
122 | |||
123 | void rq_qos_wait(struct rq_wait *rqw, void *private_data, | ||
124 | acquire_inflight_cb_t *acquire_inflight_cb, | ||
125 | cleanup_cb_t *cleanup_cb); | ||
96 | bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); | 126 | bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); |
97 | void rq_depth_scale_up(struct rq_depth *rqd); | 127 | void rq_depth_scale_up(struct rq_depth *rqd); |
98 | void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); | 128 | void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); |
99 | bool rq_depth_calc_max_depth(struct rq_depth *rqd); | 129 | bool rq_depth_calc_max_depth(struct rq_depth *rqd); |
100 | 130 | ||
101 | void rq_qos_cleanup(struct request_queue *, struct bio *); | 131 | void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); |
102 | void rq_qos_done(struct request_queue *, struct request *); | 132 | void __rq_qos_done(struct rq_qos *rqos, struct request *rq); |
103 | void rq_qos_issue(struct request_queue *, struct request *); | 133 | void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); |
104 | void rq_qos_requeue(struct request_queue *, struct request *); | 134 | void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); |
105 | void rq_qos_done_bio(struct request_queue *q, struct bio *bio); | 135 | void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); |
106 | void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); | 136 | void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); |
107 | void rq_qos_track(struct request_queue *q, struct request *, struct bio *); | 137 | void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); |
138 | |||
139 | static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) | ||
140 | { | ||
141 | if (q->rq_qos) | ||
142 | __rq_qos_cleanup(q->rq_qos, bio); | ||
143 | } | ||
144 | |||
145 | static inline void rq_qos_done(struct request_queue *q, struct request *rq) | ||
146 | { | ||
147 | if (q->rq_qos) | ||
148 | __rq_qos_done(q->rq_qos, rq); | ||
149 | } | ||
150 | |||
151 | static inline void rq_qos_issue(struct request_queue *q, struct request *rq) | ||
152 | { | ||
153 | if (q->rq_qos) | ||
154 | __rq_qos_issue(q->rq_qos, rq); | ||
155 | } | ||
156 | |||
157 | static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) | ||
158 | { | ||
159 | if (q->rq_qos) | ||
160 | __rq_qos_requeue(q->rq_qos, rq); | ||
161 | } | ||
162 | |||
163 | static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) | ||
164 | { | ||
165 | if (q->rq_qos) | ||
166 | __rq_qos_done_bio(q->rq_qos, bio); | ||
167 | } | ||
168 | |||
169 | static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) | ||
170 | { | ||
171 | /* | ||
172 | * BIO_TRACKED lets controllers know that a bio went through the | ||
173 | * normal rq_qos path. | ||
174 | */ | ||
175 | bio_set_flag(bio, BIO_TRACKED); | ||
176 | if (q->rq_qos) | ||
177 | __rq_qos_throttle(q->rq_qos, bio); | ||
178 | } | ||
179 | |||
180 | static inline void rq_qos_track(struct request_queue *q, struct request *rq, | ||
181 | struct bio *bio) | ||
182 | { | ||
183 | if (q->rq_qos) | ||
184 | __rq_qos_track(q->rq_qos, rq, bio); | ||
185 | } | ||
186 | |||
108 | void rq_qos_exit(struct request_queue *); | 187 | void rq_qos_exit(struct request_queue *); |
188 | |||
109 | #endif | 189 | #endif |
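
Attaching such a policy goes through rq_qos_add(), which now also registers the optional debugfs attributes. A sketch of the wiring, continuing the hypothetical mypol example above (a real policy would add its own enum rq_qos_id value instead of reusing one):

static void mypol_exit(struct rq_qos *rqos)
{
        kfree(container_of(rqos, struct mypol_data, rqos));
}

static struct rq_qos_ops mypol_ops = {
        .throttle       = mypol_throttle,       /* note: no spinlock argument any more */
        .done           = mypol_done,
        .exit           = mypol_exit,
        /* .debugfs_attrs = mypol_debugfs_attrs, picked up by rq_qos_add() */
};

static int mypol_init(struct request_queue *q)
{
        struct mypol_data *pd;

        pd = kzalloc(sizeof(*pd), GFP_KERNEL);
        if (!pd)
                return -ENOMEM;

        rq_wait_init(&pd->rqw);
        pd->limit = 64;
        pd->rqos.ops = &mypol_ops;
        pd->rqos.q = q;
        pd->rqos.id = RQ_QOS_WBT;       /* placeholder; a real policy gets its own id */
        rq_qos_add(q, &pd->rqos);
        return 0;
}
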
diff --git a/block/blk-settings.c b/block/blk-settings.c index 9c8b62f8c180..3e7038e475ee 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -20,65 +20,12 @@ EXPORT_SYMBOL(blk_max_low_pfn); | |||
20 | 20 | ||
21 | unsigned long blk_max_pfn; | 21 | unsigned long blk_max_pfn; |
22 | 22 | ||
23 | /** | ||
24 | * blk_queue_prep_rq - set a prepare_request function for queue | ||
25 | * @q: queue | ||
26 | * @pfn: prepare_request function | ||
27 | * | ||
28 | * It's possible for a queue to register a prepare_request callback which | ||
29 | * is invoked before the request is handed to the request_fn. The goal of | ||
30 | * the function is to prepare a request for I/O, it can be used to build a | ||
31 | * cdb from the request data for instance. | ||
32 | * | ||
33 | */ | ||
34 | void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) | ||
35 | { | ||
36 | q->prep_rq_fn = pfn; | ||
37 | } | ||
38 | EXPORT_SYMBOL(blk_queue_prep_rq); | ||
39 | |||
40 | /** | ||
41 | * blk_queue_unprep_rq - set an unprepare_request function for queue | ||
42 | * @q: queue | ||
43 | * @ufn: unprepare_request function | ||
44 | * | ||
45 | * It's possible for a queue to register an unprepare_request callback | ||
46 | * which is invoked before the request is finally completed. The goal | ||
47 | * of the function is to deallocate any data that was allocated in the | ||
48 | * prepare_request callback. | ||
49 | * | ||
50 | */ | ||
51 | void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) | ||
52 | { | ||
53 | q->unprep_rq_fn = ufn; | ||
54 | } | ||
55 | EXPORT_SYMBOL(blk_queue_unprep_rq); | ||
56 | |||
57 | void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) | ||
58 | { | ||
59 | q->softirq_done_fn = fn; | ||
60 | } | ||
61 | EXPORT_SYMBOL(blk_queue_softirq_done); | ||
62 | |||
63 | void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) | 23 | void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) |
64 | { | 24 | { |
65 | q->rq_timeout = timeout; | 25 | q->rq_timeout = timeout; |
66 | } | 26 | } |
67 | EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); | 27 | EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); |
68 | 28 | ||
69 | void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn) | ||
70 | { | ||
71 | WARN_ON_ONCE(q->mq_ops); | ||
72 | q->rq_timed_out_fn = fn; | ||
73 | } | ||
74 | EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out); | ||
75 | |||
76 | void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn) | ||
77 | { | ||
78 | q->lld_busy_fn = fn; | ||
79 | } | ||
80 | EXPORT_SYMBOL_GPL(blk_queue_lld_busy); | ||
81 | |||
82 | /** | 29 | /** |
83 | * blk_set_default_limits - reset limits to default values | 30 | * blk_set_default_limits - reset limits to default values |
84 | * @lim: the queue_limits structure to reset | 31 | * @lim: the queue_limits structure to reset |
@@ -168,8 +115,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) | |||
168 | 115 | ||
169 | q->make_request_fn = mfn; | 116 | q->make_request_fn = mfn; |
170 | blk_queue_dma_alignment(q, 511); | 117 | blk_queue_dma_alignment(q, 511); |
171 | blk_queue_congestion_threshold(q); | ||
172 | q->nr_batching = BLK_BATCH_REQ; | ||
173 | 118 | ||
174 | blk_set_default_limits(&q->limits); | 119 | blk_set_default_limits(&q->limits); |
175 | } | 120 | } |
@@ -886,16 +831,14 @@ EXPORT_SYMBOL(blk_set_queue_depth); | |||
886 | */ | 831 | */ |
887 | void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) | 832 | void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) |
888 | { | 833 | { |
889 | spin_lock_irq(q->queue_lock); | ||
890 | if (wc) | 834 | if (wc) |
891 | queue_flag_set(QUEUE_FLAG_WC, q); | 835 | blk_queue_flag_set(QUEUE_FLAG_WC, q); |
892 | else | 836 | else |
893 | queue_flag_clear(QUEUE_FLAG_WC, q); | 837 | blk_queue_flag_clear(QUEUE_FLAG_WC, q); |
894 | if (fua) | 838 | if (fua) |
895 | queue_flag_set(QUEUE_FLAG_FUA, q); | 839 | blk_queue_flag_set(QUEUE_FLAG_FUA, q); |
896 | else | 840 | else |
897 | queue_flag_clear(QUEUE_FLAG_FUA, q); | 841 | blk_queue_flag_clear(QUEUE_FLAG_FUA, q); |
898 | spin_unlock_irq(q->queue_lock); | ||
899 | 842 | ||
900 | wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); | 843 | wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); |
901 | } | 844 | } |
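
With the queue_lock round trip gone, blk_queue_write_cache() is a plain call from a driver's probe or revalidate path, for example:

        /* device reports a volatile write cache and supports FUA writes */
        blk_queue_write_cache(q, true, true);

        /* write-through device: neither flush nor FUA will be issued */
        blk_queue_write_cache(q, false, false);
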
diff --git a/block/blk-softirq.c b/block/blk-softirq.c index e47a2f751884..457d9ba3eb20 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c | |||
@@ -34,7 +34,7 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) | |||
34 | 34 | ||
35 | rq = list_entry(local_list.next, struct request, ipi_list); | 35 | rq = list_entry(local_list.next, struct request, ipi_list); |
36 | list_del_init(&rq->ipi_list); | 36 | list_del_init(&rq->ipi_list); |
37 | rq->q->softirq_done_fn(rq); | 37 | rq->q->mq_ops->complete(rq); |
38 | } | 38 | } |
39 | } | 39 | } |
40 | 40 | ||
@@ -98,11 +98,11 @@ static int blk_softirq_cpu_dead(unsigned int cpu) | |||
98 | void __blk_complete_request(struct request *req) | 98 | void __blk_complete_request(struct request *req) |
99 | { | 99 | { |
100 | struct request_queue *q = req->q; | 100 | struct request_queue *q = req->q; |
101 | int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu; | 101 | int cpu, ccpu = req->mq_ctx->cpu; |
102 | unsigned long flags; | 102 | unsigned long flags; |
103 | bool shared = false; | 103 | bool shared = false; |
104 | 104 | ||
105 | BUG_ON(!q->softirq_done_fn); | 105 | BUG_ON(!q->mq_ops->complete); |
106 | 106 | ||
107 | local_irq_save(flags); | 107 | local_irq_save(flags); |
108 | cpu = smp_processor_id(); | 108 | cpu = smp_processor_id(); |
@@ -143,27 +143,6 @@ do_local: | |||
143 | 143 | ||
144 | local_irq_restore(flags); | 144 | local_irq_restore(flags); |
145 | } | 145 | } |
146 | EXPORT_SYMBOL(__blk_complete_request); | ||
147 | |||
148 | /** | ||
149 | * blk_complete_request - end I/O on a request | ||
150 | * @req: the request being processed | ||
151 | * | ||
152 | * Description: | ||
153 | * Ends all I/O on a request. It does not handle partial completions, | ||
154 | * unless the driver actually implements this in its completion callback | ||
155 | * through requeueing. The actual completion happens out-of-order, | ||
156 | * through a softirq handler. The user must have registered a completion | ||
157 | * callback through blk_queue_softirq_done(). | ||
158 | **/ | ||
159 | void blk_complete_request(struct request *req) | ||
160 | { | ||
161 | if (unlikely(blk_should_fake_timeout(req->q))) | ||
162 | return; | ||
163 | if (!blk_mark_rq_complete(req)) | ||
164 | __blk_complete_request(req); | ||
165 | } | ||
166 | EXPORT_SYMBOL(blk_complete_request); | ||
167 | 146 | ||
168 | static __init int blk_softirq_init(void) | 147 | static __init int blk_softirq_init(void) |
169 | { | 148 | { |
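
The softirq path now calls q->mq_ops->complete() directly, so every blk-mq driver must provide it. A sketch of the resulting completion flow; the mydrv helpers are hypothetical and the status handling is simplified:

static irqreturn_t mydrv_irq(int irq, void *data)
{
        struct request *rq = mydrv_fetch_completed(data);      /* hypothetical helper */

        /* ends up in q->mq_ops->complete(rq), possibly via the softirq above */
        blk_mq_complete_request(rq);
        return IRQ_HANDLED;
}

static void mydrv_complete_rq(struct request *rq)
{
        blk_mq_end_request(rq, BLK_STS_OK);     /* real drivers map their own status */
}

static const struct blk_mq_ops mydrv_mq_ops = {
        /* .queue_rq = ..., submission side elided from this sketch */
        .complete       = mydrv_complete_rq,    /* now required: softirq calls it directly */
};
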
diff --git a/block/blk-stat.c b/block/blk-stat.c index 90561af85a62..696a04176e4d 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c | |||
@@ -130,7 +130,6 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), | |||
130 | 130 | ||
131 | return cb; | 131 | return cb; |
132 | } | 132 | } |
133 | EXPORT_SYMBOL_GPL(blk_stat_alloc_callback); | ||
134 | 133 | ||
135 | void blk_stat_add_callback(struct request_queue *q, | 134 | void blk_stat_add_callback(struct request_queue *q, |
136 | struct blk_stat_callback *cb) | 135 | struct blk_stat_callback *cb) |
@@ -151,7 +150,6 @@ void blk_stat_add_callback(struct request_queue *q, | |||
151 | blk_queue_flag_set(QUEUE_FLAG_STATS, q); | 150 | blk_queue_flag_set(QUEUE_FLAG_STATS, q); |
152 | spin_unlock(&q->stats->lock); | 151 | spin_unlock(&q->stats->lock); |
153 | } | 152 | } |
154 | EXPORT_SYMBOL_GPL(blk_stat_add_callback); | ||
155 | 153 | ||
156 | void blk_stat_remove_callback(struct request_queue *q, | 154 | void blk_stat_remove_callback(struct request_queue *q, |
157 | struct blk_stat_callback *cb) | 155 | struct blk_stat_callback *cb) |
@@ -164,7 +162,6 @@ void blk_stat_remove_callback(struct request_queue *q, | |||
164 | 162 | ||
165 | del_timer_sync(&cb->timer); | 163 | del_timer_sync(&cb->timer); |
166 | } | 164 | } |
167 | EXPORT_SYMBOL_GPL(blk_stat_remove_callback); | ||
168 | 165 | ||
169 | static void blk_stat_free_callback_rcu(struct rcu_head *head) | 166 | static void blk_stat_free_callback_rcu(struct rcu_head *head) |
170 | { | 167 | { |
@@ -181,7 +178,6 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) | |||
181 | if (cb) | 178 | if (cb) |
182 | call_rcu(&cb->rcu, blk_stat_free_callback_rcu); | 179 | call_rcu(&cb->rcu, blk_stat_free_callback_rcu); |
183 | } | 180 | } |
184 | EXPORT_SYMBOL_GPL(blk_stat_free_callback); | ||
185 | 181 | ||
186 | void blk_stat_enable_accounting(struct request_queue *q) | 182 | void blk_stat_enable_accounting(struct request_queue *q) |
187 | { | 183 | { |
diff --git a/block/blk-stat.h b/block/blk-stat.h index f4a1568e81a4..17b47a86eefb 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h | |||
@@ -145,6 +145,11 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, | |||
145 | mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); | 145 | mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); |
146 | } | 146 | } |
147 | 147 | ||
148 | static inline void blk_stat_deactivate(struct blk_stat_callback *cb) | ||
149 | { | ||
150 | del_timer_sync(&cb->timer); | ||
151 | } | ||
152 | |||
148 | /** | 153 | /** |
149 | * blk_stat_activate_msecs() - Gather block statistics during a time window in | 154 | * blk_stat_activate_msecs() - Gather block statistics during a time window in |
150 | * milliseconds. | 155 | * milliseconds. |
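
blk_stat_deactivate() lets a callback owner cancel a pending sampling window instead of waiting for the timer to fire; the intended pairing is roughly (cb and the enable condition supplied by the caller):

        if (enable)
                blk_stat_activate_msecs(cb, 100);       /* sample for the next 100ms */
        else
                blk_stat_deactivate(cb);                /* cancel a pending window */
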
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5144707f25ea..590d1ef2f961 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
68 | unsigned long nr; | 68 | unsigned long nr; |
69 | int ret, err; | 69 | int ret, err; |
70 | 70 | ||
71 | if (!q->request_fn && !q->mq_ops) | 71 | if (!queue_is_mq(q)) |
72 | return -EINVAL; | 72 | return -EINVAL; |
73 | 73 | ||
74 | ret = queue_var_store(&nr, page, count); | 74 | ret = queue_var_store(&nr, page, count); |
@@ -78,11 +78,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
78 | if (nr < BLKDEV_MIN_RQ) | 78 | if (nr < BLKDEV_MIN_RQ) |
79 | nr = BLKDEV_MIN_RQ; | 79 | nr = BLKDEV_MIN_RQ; |
80 | 80 | ||
81 | if (q->request_fn) | 81 | err = blk_mq_update_nr_requests(q, nr); |
82 | err = blk_update_nr_requests(q, nr); | ||
83 | else | ||
84 | err = blk_mq_update_nr_requests(q, nr); | ||
85 | |||
86 | if (err) | 82 | if (err) |
87 | return err; | 83 | return err; |
88 | 84 | ||
@@ -239,10 +235,10 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) | |||
239 | if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) | 235 | if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) |
240 | return -EINVAL; | 236 | return -EINVAL; |
241 | 237 | ||
242 | spin_lock_irq(q->queue_lock); | 238 | spin_lock_irq(&q->queue_lock); |
243 | q->limits.max_sectors = max_sectors_kb << 1; | 239 | q->limits.max_sectors = max_sectors_kb << 1; |
244 | q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); | 240 | q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); |
245 | spin_unlock_irq(q->queue_lock); | 241 | spin_unlock_irq(&q->queue_lock); |
246 | 242 | ||
247 | return ret; | 243 | return ret; |
248 | } | 244 | } |
@@ -317,14 +313,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, | |||
317 | if (ret < 0) | 313 | if (ret < 0) |
318 | return ret; | 314 | return ret; |
319 | 315 | ||
320 | spin_lock_irq(q->queue_lock); | 316 | blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); |
321 | queue_flag_clear(QUEUE_FLAG_NOMERGES, q); | 317 | blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); |
322 | queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); | ||
323 | if (nm == 2) | 318 | if (nm == 2) |
324 | queue_flag_set(QUEUE_FLAG_NOMERGES, q); | 319 | blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); |
325 | else if (nm) | 320 | else if (nm) |
326 | queue_flag_set(QUEUE_FLAG_NOXMERGES, q); | 321 | blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); |
327 | spin_unlock_irq(q->queue_lock); | ||
328 | 322 | ||
329 | return ret; | 323 | return ret; |
330 | } | 324 | } |
@@ -348,18 +342,16 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) | |||
348 | if (ret < 0) | 342 | if (ret < 0) |
349 | return ret; | 343 | return ret; |
350 | 344 | ||
351 | spin_lock_irq(q->queue_lock); | ||
352 | if (val == 2) { | 345 | if (val == 2) { |
353 | queue_flag_set(QUEUE_FLAG_SAME_COMP, q); | 346 | blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); |
354 | queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); | 347 | blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); |
355 | } else if (val == 1) { | 348 | } else if (val == 1) { |
356 | queue_flag_set(QUEUE_FLAG_SAME_COMP, q); | 349 | blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); |
357 | queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); | 350 | blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); |
358 | } else if (val == 0) { | 351 | } else if (val == 0) { |
359 | queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); | 352 | blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); |
360 | queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); | 353 | blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); |
361 | } | 354 | } |
362 | spin_unlock_irq(q->queue_lock); | ||
363 | #endif | 355 | #endif |
364 | return ret; | 356 | return ret; |
365 | } | 357 | } |
@@ -407,7 +399,8 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, | |||
407 | unsigned long poll_on; | 399 | unsigned long poll_on; |
408 | ssize_t ret; | 400 | ssize_t ret; |
409 | 401 | ||
410 | if (!q->mq_ops || !q->mq_ops->poll) | 402 | if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL || |
403 | !q->tag_set->map[HCTX_TYPE_POLL].nr_queues) | ||
411 | return -EINVAL; | 404 | return -EINVAL; |
412 | 405 | ||
413 | ret = queue_var_store(&poll_on, page, count); | 406 | ret = queue_var_store(&poll_on, page, count); |
@@ -422,6 +415,26 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, | |||
422 | return ret; | 415 | return ret; |
423 | } | 416 | } |
424 | 417 | ||
418 | static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) | ||
419 | { | ||
420 | return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout)); | ||
421 | } | ||
422 | |||
423 | static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, | ||
424 | size_t count) | ||
425 | { | ||
426 | unsigned int val; | ||
427 | int err; | ||
428 | |||
429 | err = kstrtou32(page, 10, &val); | ||
430 | if (err || val == 0) | ||
431 | return -EINVAL; | ||
432 | |||
433 | blk_queue_rq_timeout(q, msecs_to_jiffies(val)); | ||
434 | |||
435 | return count; | ||
436 | } | ||
437 | |||
425 | static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) | 438 | static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) |
426 | { | 439 | { |
427 | if (!wbt_rq_qos(q)) | 440 | if (!wbt_rq_qos(q)) |
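
The new io_timeout attribute added above feeds straight into blk_queue_rq_timeout(), so a driver only sets a default that the attribute can later override, e.g.:

        /* driver default; /sys/block/<dev>/queue/io_timeout can override it */
        blk_queue_rq_timeout(q, 30 * HZ);
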
@@ -460,20 +473,14 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, | |||
460 | * ends up either enabling or disabling wbt completely. We can't | 473 | * ends up either enabling or disabling wbt completely. We can't |
461 | * have IO inflight if that happens. | 474 | * have IO inflight if that happens. |
462 | */ | 475 | */ |
463 | if (q->mq_ops) { | 476 | blk_mq_freeze_queue(q); |
464 | blk_mq_freeze_queue(q); | 477 | blk_mq_quiesce_queue(q); |
465 | blk_mq_quiesce_queue(q); | ||
466 | } else | ||
467 | blk_queue_bypass_start(q); | ||
468 | 478 | ||
469 | wbt_set_min_lat(q, val); | 479 | wbt_set_min_lat(q, val); |
470 | wbt_update_limits(q); | 480 | wbt_update_limits(q); |
471 | 481 | ||
472 | if (q->mq_ops) { | 482 | blk_mq_unquiesce_queue(q); |
473 | blk_mq_unquiesce_queue(q); | 483 | blk_mq_unfreeze_queue(q); |
474 | blk_mq_unfreeze_queue(q); | ||
475 | } else | ||
476 | blk_queue_bypass_end(q); | ||
477 | 484 | ||
478 | return count; | 485 | return count; |
479 | } | 486 | } |
@@ -696,6 +703,12 @@ static struct queue_sysfs_entry queue_dax_entry = { | |||
696 | .show = queue_dax_show, | 703 | .show = queue_dax_show, |
697 | }; | 704 | }; |
698 | 705 | ||
706 | static struct queue_sysfs_entry queue_io_timeout_entry = { | ||
707 | .attr = {.name = "io_timeout", .mode = 0644 }, | ||
708 | .show = queue_io_timeout_show, | ||
709 | .store = queue_io_timeout_store, | ||
710 | }; | ||
711 | |||
699 | static struct queue_sysfs_entry queue_wb_lat_entry = { | 712 | static struct queue_sysfs_entry queue_wb_lat_entry = { |
700 | .attr = {.name = "wbt_lat_usec", .mode = 0644 }, | 713 | .attr = {.name = "wbt_lat_usec", .mode = 0644 }, |
701 | .show = queue_wb_lat_show, | 714 | .show = queue_wb_lat_show, |
@@ -745,6 +758,7 @@ static struct attribute *default_attrs[] = { | |||
745 | &queue_dax_entry.attr, | 758 | &queue_dax_entry.attr, |
746 | &queue_wb_lat_entry.attr, | 759 | &queue_wb_lat_entry.attr, |
747 | &queue_poll_delay_entry.attr, | 760 | &queue_poll_delay_entry.attr, |
761 | &queue_io_timeout_entry.attr, | ||
748 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | 762 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW |
749 | &throtl_sample_time_entry.attr, | 763 | &throtl_sample_time_entry.attr, |
750 | #endif | 764 | #endif |
@@ -844,24 +858,14 @@ static void __blk_release_queue(struct work_struct *work) | |||
844 | 858 | ||
845 | blk_free_queue_stats(q->stats); | 859 | blk_free_queue_stats(q->stats); |
846 | 860 | ||
847 | blk_exit_rl(q, &q->root_rl); | ||
848 | |||
849 | if (q->queue_tags) | ||
850 | __blk_queue_free_tags(q); | ||
851 | |||
852 | blk_queue_free_zone_bitmaps(q); | 861 | blk_queue_free_zone_bitmaps(q); |
853 | 862 | ||
854 | if (!q->mq_ops) { | 863 | if (queue_is_mq(q)) |
855 | if (q->exit_rq_fn) | ||
856 | q->exit_rq_fn(q, q->fq->flush_rq); | ||
857 | blk_free_flush_queue(q->fq); | ||
858 | } else { | ||
859 | blk_mq_release(q); | 864 | blk_mq_release(q); |
860 | } | ||
861 | 865 | ||
862 | blk_trace_shutdown(q); | 866 | blk_trace_shutdown(q); |
863 | 867 | ||
864 | if (q->mq_ops) | 868 | if (queue_is_mq(q)) |
865 | blk_mq_debugfs_unregister(q); | 869 | blk_mq_debugfs_unregister(q); |
866 | 870 | ||
867 | bioset_exit(&q->bio_split); | 871 | bioset_exit(&q->bio_split); |
@@ -906,7 +910,7 @@ int blk_register_queue(struct gendisk *disk) | |||
906 | WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), | 910 | WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), |
907 | "%s is registering an already registered queue\n", | 911 | "%s is registering an already registered queue\n", |
908 | kobject_name(&dev->kobj)); | 912 | kobject_name(&dev->kobj)); |
909 | queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); | 913 | blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); |
910 | 914 | ||
911 | /* | 915 | /* |
912 | * SCSI probing may synchronously create and destroy a lot of | 916 | * SCSI probing may synchronously create and destroy a lot of |
@@ -918,9 +922,8 @@ int blk_register_queue(struct gendisk *disk) | |||
918 | * request_queues for non-existent devices never get registered. | 922 | * request_queues for non-existent devices never get registered. |
919 | */ | 923 | */ |
920 | if (!blk_queue_init_done(q)) { | 924 | if (!blk_queue_init_done(q)) { |
921 | queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); | 925 | blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); |
922 | percpu_ref_switch_to_percpu(&q->q_usage_counter); | 926 | percpu_ref_switch_to_percpu(&q->q_usage_counter); |
923 | blk_queue_bypass_end(q); | ||
924 | } | 927 | } |
925 | 928 | ||
926 | ret = blk_trace_init_sysfs(dev); | 929 | ret = blk_trace_init_sysfs(dev); |
@@ -936,7 +939,7 @@ int blk_register_queue(struct gendisk *disk) | |||
936 | goto unlock; | 939 | goto unlock; |
937 | } | 940 | } |
938 | 941 | ||
939 | if (q->mq_ops) { | 942 | if (queue_is_mq(q)) { |
940 | __blk_mq_register_dev(dev, q); | 943 | __blk_mq_register_dev(dev, q); |
941 | blk_mq_debugfs_register(q); | 944 | blk_mq_debugfs_register(q); |
942 | } | 945 | } |
@@ -947,7 +950,7 @@ int blk_register_queue(struct gendisk *disk) | |||
947 | 950 | ||
948 | blk_throtl_register_queue(q); | 951 | blk_throtl_register_queue(q); |
949 | 952 | ||
950 | if (q->request_fn || (q->mq_ops && q->elevator)) { | 953 | if (q->elevator) { |
951 | ret = elv_register_queue(q); | 954 | ret = elv_register_queue(q); |
952 | if (ret) { | 955 | if (ret) { |
953 | mutex_unlock(&q->sysfs_lock); | 956 | mutex_unlock(&q->sysfs_lock); |
@@ -996,7 +999,7 @@ void blk_unregister_queue(struct gendisk *disk) | |||
996 | * Remove the sysfs attributes before unregistering the queue data | 999 | * Remove the sysfs attributes before unregistering the queue data |
997 | * structures that can be modified through sysfs. | 1000 | * structures that can be modified through sysfs. |
998 | */ | 1001 | */ |
999 | if (q->mq_ops) | 1002 | if (queue_is_mq(q)) |
1000 | blk_mq_unregister_dev(disk_to_dev(disk), q); | 1003 | blk_mq_unregister_dev(disk_to_dev(disk), q); |
1001 | mutex_unlock(&q->sysfs_lock); | 1004 | mutex_unlock(&q->sysfs_lock); |
1002 | 1005 | ||
@@ -1005,7 +1008,7 @@ void blk_unregister_queue(struct gendisk *disk) | |||
1005 | blk_trace_remove_sysfs(disk_to_dev(disk)); | 1008 | blk_trace_remove_sysfs(disk_to_dev(disk)); |
1006 | 1009 | ||
1007 | mutex_lock(&q->sysfs_lock); | 1010 | mutex_lock(&q->sysfs_lock); |
1008 | if (q->request_fn || (q->mq_ops && q->elevator)) | 1011 | if (q->elevator) |
1009 | elv_unregister_queue(q); | 1012 | elv_unregister_queue(q); |
1010 | mutex_unlock(&q->sysfs_lock); | 1013 | mutex_unlock(&q->sysfs_lock); |
1011 | 1014 | ||
diff --git a/block/blk-tag.c b/block/blk-tag.c deleted file mode 100644 index fbc153aef166..000000000000 --- a/block/blk-tag.c +++ /dev/null | |||
@@ -1,378 +0,0 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Functions related to tagged command queuing | ||
4 | */ | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/bio.h> | ||
8 | #include <linux/blkdev.h> | ||
9 | #include <linux/slab.h> | ||
10 | |||
11 | #include "blk.h" | ||
12 | |||
13 | /** | ||
14 | * blk_queue_find_tag - find a request by its tag and queue | ||
15 | * @q: The request queue for the device | ||
16 | * @tag: The tag of the request | ||
17 | * | ||
18 | * Notes: | ||
19 | * Should be used when a device returns a tag and you want to match | ||
20 | * it with a request. | ||
21 | * | ||
22 | * no locks need be held. | ||
23 | **/ | ||
24 | struct request *blk_queue_find_tag(struct request_queue *q, int tag) | ||
25 | { | ||
26 | return blk_map_queue_find_tag(q->queue_tags, tag); | ||
27 | } | ||
28 | EXPORT_SYMBOL(blk_queue_find_tag); | ||
29 | |||
30 | /** | ||
31 | * blk_free_tags - release a given set of tag maintenance info | ||
32 | * @bqt: the tag map to free | ||
33 | * | ||
34 | * Drop the reference count on @bqt and frees it when the last reference | ||
35 | * is dropped. | ||
36 | */ | ||
37 | void blk_free_tags(struct blk_queue_tag *bqt) | ||
38 | { | ||
39 | if (atomic_dec_and_test(&bqt->refcnt)) { | ||
40 | BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) < | ||
41 | bqt->max_depth); | ||
42 | |||
43 | kfree(bqt->tag_index); | ||
44 | bqt->tag_index = NULL; | ||
45 | |||
46 | kfree(bqt->tag_map); | ||
47 | bqt->tag_map = NULL; | ||
48 | |||
49 | kfree(bqt); | ||
50 | } | ||
51 | } | ||
52 | EXPORT_SYMBOL(blk_free_tags); | ||
53 | |||
54 | /** | ||
55 | * __blk_queue_free_tags - release tag maintenance info | ||
56 | * @q: the request queue for the device | ||
57 | * | ||
58 | * Notes: | ||
59 | * blk_cleanup_queue() will take care of calling this function, if tagging | ||
60 | * has been used. So there's no need to call this directly. | ||
61 | **/ | ||
62 | void __blk_queue_free_tags(struct request_queue *q) | ||
63 | { | ||
64 | struct blk_queue_tag *bqt = q->queue_tags; | ||
65 | |||
66 | if (!bqt) | ||
67 | return; | ||
68 | |||
69 | blk_free_tags(bqt); | ||
70 | |||
71 | q->queue_tags = NULL; | ||
72 | queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); | ||
73 | } | ||
74 | |||
75 | /** | ||
76 | * blk_queue_free_tags - release tag maintenance info | ||
77 | * @q: the request queue for the device | ||
78 | * | ||
79 | * Notes: | ||
80 | * This is used to disable tagged queuing to a device, yet leave | ||
81 | * queue in function. | ||
82 | **/ | ||
83 | void blk_queue_free_tags(struct request_queue *q) | ||
84 | { | ||
85 | queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); | ||
86 | } | ||
87 | EXPORT_SYMBOL(blk_queue_free_tags); | ||
88 | |||
89 | static int | ||
90 | init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) | ||
91 | { | ||
92 | struct request **tag_index; | ||
93 | unsigned long *tag_map; | ||
94 | int nr_ulongs; | ||
95 | |||
96 | if (q && depth > q->nr_requests * 2) { | ||
97 | depth = q->nr_requests * 2; | ||
98 | printk(KERN_ERR "%s: adjusted depth to %d\n", | ||
99 | __func__, depth); | ||
100 | } | ||
101 | |||
102 | tag_index = kcalloc(depth, sizeof(struct request *), GFP_ATOMIC); | ||
103 | if (!tag_index) | ||
104 | goto fail; | ||
105 | |||
106 | nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; | ||
107 | tag_map = kcalloc(nr_ulongs, sizeof(unsigned long), GFP_ATOMIC); | ||
108 | if (!tag_map) | ||
109 | goto fail; | ||
110 | |||
111 | tags->real_max_depth = depth; | ||
112 | tags->max_depth = depth; | ||
113 | tags->tag_index = tag_index; | ||
114 | tags->tag_map = tag_map; | ||
115 | |||
116 | return 0; | ||
117 | fail: | ||
118 | kfree(tag_index); | ||
119 | return -ENOMEM; | ||
120 | } | ||
121 | |||
122 | static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, | ||
123 | int depth, int alloc_policy) | ||
124 | { | ||
125 | struct blk_queue_tag *tags; | ||
126 | |||
127 | tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); | ||
128 | if (!tags) | ||
129 | goto fail; | ||
130 | |||
131 | if (init_tag_map(q, tags, depth)) | ||
132 | goto fail; | ||
133 | |||
134 | atomic_set(&tags->refcnt, 1); | ||
135 | tags->alloc_policy = alloc_policy; | ||
136 | tags->next_tag = 0; | ||
137 | return tags; | ||
138 | fail: | ||
139 | kfree(tags); | ||
140 | return NULL; | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * blk_init_tags - initialize the tag info for an external tag map | ||
145 | * @depth: the maximum queue depth supported | ||
146 | * @alloc_policy: tag allocation policy | ||
147 | **/ | ||
148 | struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) | ||
149 | { | ||
150 | return __blk_queue_init_tags(NULL, depth, alloc_policy); | ||
151 | } | ||
152 | EXPORT_SYMBOL(blk_init_tags); | ||
153 | |||
154 | /** | ||
155 | * blk_queue_init_tags - initialize the queue tag info | ||
156 | * @q: the request queue for the device | ||
157 | * @depth: the maximum queue depth supported | ||
158 | * @tags: the tag to use | ||
159 | * @alloc_policy: tag allocation policy | ||
160 | * | ||
161 | * Queue lock must be held here if the function is called to resize an | ||
162 | * existing map. | ||
163 | **/ | ||
164 | int blk_queue_init_tags(struct request_queue *q, int depth, | ||
165 | struct blk_queue_tag *tags, int alloc_policy) | ||
166 | { | ||
167 | int rc; | ||
168 | |||
169 | BUG_ON(tags && q->queue_tags && tags != q->queue_tags); | ||
170 | |||
171 | if (!tags && !q->queue_tags) { | ||
172 | tags = __blk_queue_init_tags(q, depth, alloc_policy); | ||
173 | |||
174 | if (!tags) | ||
175 | return -ENOMEM; | ||
176 | |||
177 | } else if (q->queue_tags) { | ||
178 | rc = blk_queue_resize_tags(q, depth); | ||
179 | if (rc) | ||
180 | return rc; | ||
181 | queue_flag_set(QUEUE_FLAG_QUEUED, q); | ||
182 | return 0; | ||
183 | } else | ||
184 | atomic_inc(&tags->refcnt); | ||
185 | |||
186 | /* | ||
187 | * assign it, all done | ||
188 | */ | ||
189 | q->queue_tags = tags; | ||
190 | queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); | ||
191 | return 0; | ||
192 | } | ||
193 | EXPORT_SYMBOL(blk_queue_init_tags); | ||
194 | |||
195 | /** | ||
196 | * blk_queue_resize_tags - change the queueing depth | ||
197 | * @q: the request queue for the device | ||
198 | * @new_depth: the new max command queueing depth | ||
199 | * | ||
200 | * Notes: | ||
201 | * Must be called with the queue lock held. | ||
202 | **/ | ||
203 | int blk_queue_resize_tags(struct request_queue *q, int new_depth) | ||
204 | { | ||
205 | struct blk_queue_tag *bqt = q->queue_tags; | ||
206 | struct request **tag_index; | ||
207 | unsigned long *tag_map; | ||
208 | int max_depth, nr_ulongs; | ||
209 | |||
210 | if (!bqt) | ||
211 | return -ENXIO; | ||
212 | |||
213 | /* | ||
214 | * if we already have large enough real_max_depth. just | ||
215 | * adjust max_depth. *NOTE* as requests with tag value | ||
216 | * between new_depth and real_max_depth can be in-flight, tag | ||
217 | * map can not be shrunk blindly here. | ||
218 | */ | ||
219 | if (new_depth <= bqt->real_max_depth) { | ||
220 | bqt->max_depth = new_depth; | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | /* | ||
225 | * Currently cannot replace a shared tag map with a new | ||
226 | * one, so error out if this is the case | ||
227 | */ | ||
228 | if (atomic_read(&bqt->refcnt) != 1) | ||
229 | return -EBUSY; | ||
230 | |||
231 | /* | ||
232 | * save the old state info, so we can copy it back | ||
233 | */ | ||
234 | tag_index = bqt->tag_index; | ||
235 | tag_map = bqt->tag_map; | ||
236 | max_depth = bqt->real_max_depth; | ||
237 | |||
238 | if (init_tag_map(q, bqt, new_depth)) | ||
239 | return -ENOMEM; | ||
240 | |||
241 | memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); | ||
242 | nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; | ||
243 | memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); | ||
244 | |||
245 | kfree(tag_index); | ||
246 | kfree(tag_map); | ||
247 | return 0; | ||
248 | } | ||
249 | EXPORT_SYMBOL(blk_queue_resize_tags); | ||
250 | |||
251 | /** | ||
252 | * blk_queue_end_tag - end tag operations for a request | ||
253 | * @q: the request queue for the device | ||
254 | * @rq: the request that has completed | ||
255 | * | ||
256 | * Description: | ||
257 | * Typically called when end_that_request_first() returns %0, meaning | ||
258 | * all transfers have been done for a request. It's important to call | ||
259 | * this function before end_that_request_last(), as that will put the | ||
260 | * request back on the free list thus corrupting the internal tag list. | ||
261 | **/ | ||
262 | void blk_queue_end_tag(struct request_queue *q, struct request *rq) | ||
263 | { | ||
264 | struct blk_queue_tag *bqt = q->queue_tags; | ||
265 | unsigned tag = rq->tag; /* negative tags invalid */ | ||
266 | |||
267 | lockdep_assert_held(q->queue_lock); | ||
268 | |||
269 | BUG_ON(tag >= bqt->real_max_depth); | ||
270 | |||
271 | list_del_init(&rq->queuelist); | ||
272 | rq->rq_flags &= ~RQF_QUEUED; | ||
273 | rq->tag = -1; | ||
274 | rq->internal_tag = -1; | ||
275 | |||
276 | if (unlikely(bqt->tag_index[tag] == NULL)) | ||
277 | printk(KERN_ERR "%s: tag %d is missing\n", | ||
278 | __func__, tag); | ||
279 | |||
280 | bqt->tag_index[tag] = NULL; | ||
281 | |||
282 | if (unlikely(!test_bit(tag, bqt->tag_map))) { | ||
283 | printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", | ||
284 | __func__, tag); | ||
285 | return; | ||
286 | } | ||
287 | /* | ||
288 | * The tag_map bit acts as a lock for tag_index[bit], so we need | ||
289 | * unlock memory barrier semantics. | ||
290 | */ | ||
291 | clear_bit_unlock(tag, bqt->tag_map); | ||
292 | } | ||
293 | |||
294 | /** | ||
295 | * blk_queue_start_tag - find a free tag and assign it | ||
296 | * @q: the request queue for the device | ||
297 | * @rq: the block request that needs tagging | ||
298 | * | ||
299 | * Description: | ||
300 | * This can either be used as a stand-alone helper, or possibly be | ||
301 | * assigned as the queue &prep_rq_fn (in which case &struct request | ||
302 | * automagically gets a tag assigned). Note that this function | ||
303 | * assumes that any type of request can be queued! if this is not | ||
304 | * true for your device, you must check the request type before | ||
305 | * calling this function. The request will also be removed from | ||
306 | * the request queue, so it's the driver's responsibility to re-add | ||
307 | * it if it should need to be restarted for some reason. | ||
308 | **/ | ||
309 | int blk_queue_start_tag(struct request_queue *q, struct request *rq) | ||
310 | { | ||
311 | struct blk_queue_tag *bqt = q->queue_tags; | ||
312 | unsigned max_depth; | ||
313 | int tag; | ||
314 | |||
315 | lockdep_assert_held(q->queue_lock); | ||
316 | |||
317 | if (unlikely((rq->rq_flags & RQF_QUEUED))) { | ||
318 | printk(KERN_ERR | ||
319 | "%s: request %p for device [%s] already tagged %d", | ||
320 | __func__, rq, | ||
321 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); | ||
322 | BUG(); | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Protect against shared tag maps, as we may not have exclusive | ||
327 | * access to the tag map. | ||
328 | * | ||
329 | * We reserve a few tags just for sync IO, since we don't want | ||
330 | * to starve sync IO on behalf of flooding async IO. | ||
331 | */ | ||
332 | max_depth = bqt->max_depth; | ||
333 | if (!rq_is_sync(rq) && max_depth > 1) { | ||
334 | switch (max_depth) { | ||
335 | case 2: | ||
336 | max_depth = 1; | ||
337 | break; | ||
338 | case 3: | ||
339 | max_depth = 2; | ||
340 | break; | ||
341 | default: | ||
342 | max_depth -= 2; | ||
343 | } | ||
344 | if (q->in_flight[BLK_RW_ASYNC] > max_depth) | ||
345 | return 1; | ||
346 | } | ||
347 | |||
348 | do { | ||
349 | if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { | ||
350 | tag = find_first_zero_bit(bqt->tag_map, max_depth); | ||
351 | if (tag >= max_depth) | ||
352 | return 1; | ||
353 | } else { | ||
354 | int start = bqt->next_tag; | ||
355 | int size = min_t(int, bqt->max_depth, max_depth + start); | ||
356 | tag = find_next_zero_bit(bqt->tag_map, size, start); | ||
357 | if (tag >= size && start + size > bqt->max_depth) { | ||
358 | size = start + size - bqt->max_depth; | ||
359 | tag = find_first_zero_bit(bqt->tag_map, size); | ||
360 | } | ||
361 | if (tag >= size) | ||
362 | return 1; | ||
363 | } | ||
364 | |||
365 | } while (test_and_set_bit_lock(tag, bqt->tag_map)); | ||
366 | /* | ||
367 | * We need lock ordering semantics given by test_and_set_bit_lock. | ||
368 | * See blk_queue_end_tag for details. | ||
369 | */ | ||
370 | |||
371 | bqt->next_tag = (tag + 1) % bqt->max_depth; | ||
372 | rq->rq_flags |= RQF_QUEUED; | ||
373 | rq->tag = tag; | ||
374 | bqt->tag_index[tag] = rq; | ||
375 | blk_start_request(rq); | ||
376 | return 0; | ||
377 | } | ||
378 | EXPORT_SYMBOL(blk_queue_start_tag); | ||
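The whole of blk-tag.c goes away with the legacy request path: blk-mq tracks tags in a per-tag-set sbitmap, so the hand-rolled bitmap above (find a zero bit below the depth limit, claim it with test_and_set_bit_lock(), release it with clear_bit_unlock(), the bit doubling as a lock for tag_index[tag]) has no remaining users. For reference, the allocation pattern it implemented looks like this as a standalone userspace sketch, using C11 atomics in place of the kernel bitops (names and depth are made up):

	#include <stdatomic.h>
	#include <stdio.h>

	#define TAG_DEPTH 32

	static atomic_uint tag_map;	/* one bit per tag, like bqt->tag_map */

	/* Claim the lowest free tag below 'depth', or return -1 (cf. blk_queue_start_tag). */
	static int start_tag(unsigned int depth)
	{
		for (;;) {
			unsigned int map = atomic_load(&tag_map);
			unsigned int tag;

			for (tag = 0; tag < depth; tag++)
				if (!(map & (1u << tag)))
					break;
			if (tag >= depth)
				return -1;	/* no free tag in the window */
			/* compare-and-swap stands in for test_and_set_bit_lock() */
			if (atomic_compare_exchange_weak(&tag_map, &map, map | (1u << tag)))
				return (int)tag;
		}
	}

	/* Release a tag so it can be reused (cf. blk_queue_end_tag). */
	static void end_tag(int tag)
	{
		atomic_fetch_and(&tag_map, ~(1u << tag));
	}

	int main(void)
	{
		int a = start_tag(TAG_DEPTH), b = start_tag(TAG_DEPTH);

		printf("got tags %d and %d\n", a, b);	/* 0 and 1 */
		end_tag(a);
		printf("reused tag %d\n", start_tag(TAG_DEPTH));	/* 0 again */
		return 0;
	}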
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index db1a3a2ae006..1b97a73d2fb1 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -1243,7 +1243,7 @@ static void throtl_pending_timer_fn(struct timer_list *t) | |||
1243 | bool dispatched; | 1243 | bool dispatched; |
1244 | int ret; | 1244 | int ret; |
1245 | 1245 | ||
1246 | spin_lock_irq(q->queue_lock); | 1246 | spin_lock_irq(&q->queue_lock); |
1247 | if (throtl_can_upgrade(td, NULL)) | 1247 | if (throtl_can_upgrade(td, NULL)) |
1248 | throtl_upgrade_state(td); | 1248 | throtl_upgrade_state(td); |
1249 | 1249 | ||
@@ -1266,9 +1266,9 @@ again: | |||
1266 | break; | 1266 | break; |
1267 | 1267 | ||
1268 | /* this dispatch window is still open, relax and repeat */ | 1268 | /* this dispatch window is still open, relax and repeat */ |
1269 | spin_unlock_irq(q->queue_lock); | 1269 | spin_unlock_irq(&q->queue_lock); |
1270 | cpu_relax(); | 1270 | cpu_relax(); |
1271 | spin_lock_irq(q->queue_lock); | 1271 | spin_lock_irq(&q->queue_lock); |
1272 | } | 1272 | } |
1273 | 1273 | ||
1274 | if (!dispatched) | 1274 | if (!dispatched) |
@@ -1290,7 +1290,7 @@ again: | |||
1290 | queue_work(kthrotld_workqueue, &td->dispatch_work); | 1290 | queue_work(kthrotld_workqueue, &td->dispatch_work); |
1291 | } | 1291 | } |
1292 | out_unlock: | 1292 | out_unlock: |
1293 | spin_unlock_irq(q->queue_lock); | 1293 | spin_unlock_irq(&q->queue_lock); |
1294 | } | 1294 | } |
1295 | 1295 | ||
1296 | /** | 1296 | /** |
@@ -1314,11 +1314,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) | |||
1314 | 1314 | ||
1315 | bio_list_init(&bio_list_on_stack); | 1315 | bio_list_init(&bio_list_on_stack); |
1316 | 1316 | ||
1317 | spin_lock_irq(q->queue_lock); | 1317 | spin_lock_irq(&q->queue_lock); |
1318 | for (rw = READ; rw <= WRITE; rw++) | 1318 | for (rw = READ; rw <= WRITE; rw++) |
1319 | while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) | 1319 | while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) |
1320 | bio_list_add(&bio_list_on_stack, bio); | 1320 | bio_list_add(&bio_list_on_stack, bio); |
1321 | spin_unlock_irq(q->queue_lock); | 1321 | spin_unlock_irq(&q->queue_lock); |
1322 | 1322 | ||
1323 | if (!bio_list_empty(&bio_list_on_stack)) { | 1323 | if (!bio_list_empty(&bio_list_on_stack)) { |
1324 | blk_start_plug(&plug); | 1324 | blk_start_plug(&plug); |
@@ -2115,16 +2115,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) | |||
2115 | } | 2115 | } |
2116 | #endif | 2116 | #endif |
2117 | 2117 | ||
2118 | static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) | ||
2119 | { | ||
2120 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
2121 | /* fallback to root_blkg if we fail to get a blkg ref */ | ||
2122 | if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) | ||
2123 | bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); | ||
2124 | bio_issue_init(&bio->bi_issue, bio_sectors(bio)); | ||
2125 | #endif | ||
2126 | } | ||
2127 | |||
2128 | bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | 2118 | bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, |
2129 | struct bio *bio) | 2119 | struct bio *bio) |
2130 | { | 2120 | { |
@@ -2141,14 +2131,10 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | |||
2141 | if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) | 2131 | if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) |
2142 | goto out; | 2132 | goto out; |
2143 | 2133 | ||
2144 | spin_lock_irq(q->queue_lock); | 2134 | spin_lock_irq(&q->queue_lock); |
2145 | 2135 | ||
2146 | throtl_update_latency_buckets(td); | 2136 | throtl_update_latency_buckets(td); |
2147 | 2137 | ||
2148 | if (unlikely(blk_queue_bypass(q))) | ||
2149 | goto out_unlock; | ||
2150 | |||
2151 | blk_throtl_assoc_bio(tg, bio); | ||
2152 | blk_throtl_update_idletime(tg); | 2138 | blk_throtl_update_idletime(tg); |
2153 | 2139 | ||
2154 | sq = &tg->service_queue; | 2140 | sq = &tg->service_queue; |
@@ -2227,7 +2213,7 @@ again: | |||
2227 | } | 2213 | } |
2228 | 2214 | ||
2229 | out_unlock: | 2215 | out_unlock: |
2230 | spin_unlock_irq(q->queue_lock); | 2216 | spin_unlock_irq(&q->queue_lock); |
2231 | out: | 2217 | out: |
2232 | bio_set_flag(bio, BIO_THROTTLED); | 2218 | bio_set_flag(bio, BIO_THROTTLED); |
2233 | 2219 | ||
@@ -2348,7 +2334,7 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq) | |||
2348 | * Dispatch all currently throttled bios on @q through ->make_request_fn(). | 2334 | * Dispatch all currently throttled bios on @q through ->make_request_fn(). |
2349 | */ | 2335 | */ |
2350 | void blk_throtl_drain(struct request_queue *q) | 2336 | void blk_throtl_drain(struct request_queue *q) |
2351 | __releases(q->queue_lock) __acquires(q->queue_lock) | 2337 | __releases(&q->queue_lock) __acquires(&q->queue_lock) |
2352 | { | 2338 | { |
2353 | struct throtl_data *td = q->td; | 2339 | struct throtl_data *td = q->td; |
2354 | struct blkcg_gq *blkg; | 2340 | struct blkcg_gq *blkg; |
@@ -2356,7 +2342,6 @@ void blk_throtl_drain(struct request_queue *q) | |||
2356 | struct bio *bio; | 2342 | struct bio *bio; |
2357 | int rw; | 2343 | int rw; |
2358 | 2344 | ||
2359 | queue_lockdep_assert_held(q); | ||
2360 | rcu_read_lock(); | 2345 | rcu_read_lock(); |
2361 | 2346 | ||
2362 | /* | 2347 | /* |
@@ -2372,7 +2357,7 @@ void blk_throtl_drain(struct request_queue *q) | |||
2372 | tg_drain_bios(&td->service_queue); | 2357 | tg_drain_bios(&td->service_queue); |
2373 | 2358 | ||
2374 | rcu_read_unlock(); | 2359 | rcu_read_unlock(); |
2375 | spin_unlock_irq(q->queue_lock); | 2360 | spin_unlock_irq(&q->queue_lock); |
2376 | 2361 | ||
2377 | /* all bios now should be in td->service_queue, issue them */ | 2362 | /* all bios now should be in td->service_queue, issue them */ |
2378 | for (rw = READ; rw <= WRITE; rw++) | 2363 | for (rw = READ; rw <= WRITE; rw++) |
@@ -2380,7 +2365,7 @@ void blk_throtl_drain(struct request_queue *q) | |||
2380 | NULL))) | 2365 | NULL))) |
2381 | generic_make_request(bio); | 2366 | generic_make_request(bio); |
2382 | 2367 | ||
2383 | spin_lock_irq(q->queue_lock); | 2368 | spin_lock_irq(&q->queue_lock); |
2384 | } | 2369 | } |
2385 | 2370 | ||
2386 | int blk_throtl_init(struct request_queue *q) | 2371 | int blk_throtl_init(struct request_queue *q) |
@@ -2460,7 +2445,7 @@ void blk_throtl_register_queue(struct request_queue *q) | |||
2460 | td->throtl_slice = DFL_THROTL_SLICE_HD; | 2445 | td->throtl_slice = DFL_THROTL_SLICE_HD; |
2461 | #endif | 2446 | #endif |
2462 | 2447 | ||
2463 | td->track_bio_latency = !queue_is_rq_based(q); | 2448 | td->track_bio_latency = !queue_is_mq(q); |
2464 | if (!td->track_bio_latency) | 2449 | if (!td->track_bio_latency) |
2465 | blk_stat_enable_accounting(q); | 2450 | blk_stat_enable_accounting(q); |
2466 | } | 2451 | } |
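The blk-throttle changes above are mostly mechanical: q->queue_lock stops being a pointer (which legacy drivers could aim at their own lock) and becomes a spinlock embedded in struct request_queue, so every locking site now takes &q->queue_lock; the bypass check and blk_throtl_assoc_bio() disappear because bios are associated with a blkg earlier in the submission path; and queue_is_rq_based() becomes queue_is_mq(). The locking change follows this shape (hedged sketch with a stand-in struct, not the real request_queue layout):

	struct request_queue_sketch {
		spinlock_t queue_lock;		/* was: spinlock_t *queue_lock */
		/* ... */
	};

	static void example(struct request_queue_sketch *q)
	{
		spin_lock_irq(&q->queue_lock);	/* callers now take the member's address */
		/* ... protected section ... */
		spin_unlock_irq(&q->queue_lock);
	}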
diff --git a/block/blk-timeout.c b/block/blk-timeout.c index f2cfd56e1606..124c26128bf6 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c | |||
@@ -68,80 +68,6 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, | |||
68 | 68 | ||
69 | #endif /* CONFIG_FAIL_IO_TIMEOUT */ | 69 | #endif /* CONFIG_FAIL_IO_TIMEOUT */ |
70 | 70 | ||
71 | /* | ||
72 | * blk_delete_timer - Delete/cancel timer for a given function. | ||
73 | * @req: request that we are canceling timer for | ||
74 | * | ||
75 | */ | ||
76 | void blk_delete_timer(struct request *req) | ||
77 | { | ||
78 | list_del_init(&req->timeout_list); | ||
79 | } | ||
80 | |||
81 | static void blk_rq_timed_out(struct request *req) | ||
82 | { | ||
83 | struct request_queue *q = req->q; | ||
84 | enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; | ||
85 | |||
86 | if (q->rq_timed_out_fn) | ||
87 | ret = q->rq_timed_out_fn(req); | ||
88 | switch (ret) { | ||
89 | case BLK_EH_RESET_TIMER: | ||
90 | blk_add_timer(req); | ||
91 | blk_clear_rq_complete(req); | ||
92 | break; | ||
93 | case BLK_EH_DONE: | ||
94 | /* | ||
95 | * LLD handles this for now but in the future | ||
96 | * we can send a request msg to abort the command | ||
97 | * and we can move more of the generic scsi eh code to | ||
98 | * the blk layer. | ||
99 | */ | ||
100 | break; | ||
101 | default: | ||
102 | printk(KERN_ERR "block: bad eh return: %d\n", ret); | ||
103 | break; | ||
104 | } | ||
105 | } | ||
106 | |||
107 | static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, | ||
108 | unsigned int *next_set) | ||
109 | { | ||
110 | const unsigned long deadline = blk_rq_deadline(rq); | ||
111 | |||
112 | if (time_after_eq(jiffies, deadline)) { | ||
113 | list_del_init(&rq->timeout_list); | ||
114 | |||
115 | /* | ||
116 | * Check if we raced with end io completion | ||
117 | */ | ||
118 | if (!blk_mark_rq_complete(rq)) | ||
119 | blk_rq_timed_out(rq); | ||
120 | } else if (!*next_set || time_after(*next_timeout, deadline)) { | ||
121 | *next_timeout = deadline; | ||
122 | *next_set = 1; | ||
123 | } | ||
124 | } | ||
125 | |||
126 | void blk_timeout_work(struct work_struct *work) | ||
127 | { | ||
128 | struct request_queue *q = | ||
129 | container_of(work, struct request_queue, timeout_work); | ||
130 | unsigned long flags, next = 0; | ||
131 | struct request *rq, *tmp; | ||
132 | int next_set = 0; | ||
133 | |||
134 | spin_lock_irqsave(q->queue_lock, flags); | ||
135 | |||
136 | list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) | ||
137 | blk_rq_check_expired(rq, &next, &next_set); | ||
138 | |||
139 | if (next_set) | ||
140 | mod_timer(&q->timeout, round_jiffies_up(next)); | ||
141 | |||
142 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
143 | } | ||
144 | |||
145 | /** | 71 | /** |
146 | * blk_abort_request -- Request request recovery for the specified command | 72 | * blk_abort_request -- Request request recovery for the specified command |
147 | * @req: pointer to the request of interest | 73 | * @req: pointer to the request of interest |
@@ -149,24 +75,17 @@ void blk_timeout_work(struct work_struct *work) | |||
149 | * This function requests that the block layer start recovery for the | 75 | * This function requests that the block layer start recovery for the |
150 | * request by deleting the timer and calling the q's timeout function. | 76 | * request by deleting the timer and calling the q's timeout function. |
151 | * LLDDs who implement their own error recovery MAY ignore the timeout | 77 | * LLDDs who implement their own error recovery MAY ignore the timeout |
152 | * event if they generated blk_abort_req. Must hold queue lock. | 78 | * event if they generated blk_abort_request. |
153 | */ | 79 | */ |
154 | void blk_abort_request(struct request *req) | 80 | void blk_abort_request(struct request *req) |
155 | { | 81 | { |
156 | if (req->q->mq_ops) { | 82 | /* |
157 | /* | 83 | * All we need to ensure is that timeout scan takes place |
158 | * All we need to ensure is that timeout scan takes place | 84 | * immediately and that scan sees the new timeout value. |
159 | * immediately and that scan sees the new timeout value. | 85 | * No need for fancy synchronizations. |
160 | * No need for fancy synchronizations. | 86 | */ |
161 | */ | 87 | WRITE_ONCE(req->deadline, jiffies); |
162 | blk_rq_set_deadline(req, jiffies); | 88 | kblockd_schedule_work(&req->q->timeout_work); |
163 | kblockd_schedule_work(&req->q->timeout_work); | ||
164 | } else { | ||
165 | if (blk_mark_rq_complete(req)) | ||
166 | return; | ||
167 | blk_delete_timer(req); | ||
168 | blk_rq_timed_out(req); | ||
169 | } | ||
170 | } | 89 | } |
171 | EXPORT_SYMBOL_GPL(blk_abort_request); | 90 | EXPORT_SYMBOL_GPL(blk_abort_request); |
172 | 91 | ||
@@ -194,15 +113,6 @@ void blk_add_timer(struct request *req) | |||
194 | struct request_queue *q = req->q; | 113 | struct request_queue *q = req->q; |
195 | unsigned long expiry; | 114 | unsigned long expiry; |
196 | 115 | ||
197 | if (!q->mq_ops) | ||
198 | lockdep_assert_held(q->queue_lock); | ||
199 | |||
200 | /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ | ||
201 | if (!q->mq_ops && !q->rq_timed_out_fn) | ||
202 | return; | ||
203 | |||
204 | BUG_ON(!list_empty(&req->timeout_list)); | ||
205 | |||
206 | /* | 116 | /* |
207 | * Some LLDs, like scsi, peek at the timeout to prevent a | 117 | * Some LLDs, like scsi, peek at the timeout to prevent a |
208 | * command from being retried forever. | 118 | * command from being retried forever. |
@@ -211,21 +121,16 @@ void blk_add_timer(struct request *req) | |||
211 | req->timeout = q->rq_timeout; | 121 | req->timeout = q->rq_timeout; |
212 | 122 | ||
213 | req->rq_flags &= ~RQF_TIMED_OUT; | 123 | req->rq_flags &= ~RQF_TIMED_OUT; |
214 | blk_rq_set_deadline(req, jiffies + req->timeout); | ||
215 | 124 | ||
216 | /* | 125 | expiry = jiffies + req->timeout; |
217 | * Only the non-mq case needs to add the request to a protected list. | 126 | WRITE_ONCE(req->deadline, expiry); |
218 | * For the mq case we simply scan the tag map. | ||
219 | */ | ||
220 | if (!q->mq_ops) | ||
221 | list_add_tail(&req->timeout_list, &req->q->timeout_list); | ||
222 | 127 | ||
223 | /* | 128 | /* |
224 | * If the timer isn't already pending or this timeout is earlier | 129 | * If the timer isn't already pending or this timeout is earlier |
225 | * than an existing one, modify the timer. Round up to next nearest | 130 | * than an existing one, modify the timer. Round up to next nearest |
226 | * second. | 131 | * second. |
227 | */ | 132 | */ |
228 | expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); | 133 | expiry = blk_rq_timeout(round_jiffies_up(expiry)); |
229 | 134 | ||
230 | if (!timer_pending(&q->timeout) || | 135 | if (!timer_pending(&q->timeout) || |
231 | time_before(expiry, q->timeout.expires)) { | 136 | time_before(expiry, q->timeout.expires)) { |
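With the legacy timeout machinery gone (the per-request timeout_list, the "complete" marker stolen from the bottom bit of __deadline, and the blk_rq_set_deadline()/blk_rq_deadline() helpers that masked it off), a request's deadline is just a jiffies value accessed with WRITE_ONCE()/READ_ONCE(), and one per-queue timer plus the blk-mq tag scan do the rest. Condensed, the arming logic kept in blk_add_timer() is (a simplified restatement of the code above, not a drop-in):

	expiry = jiffies + req->timeout;
	WRITE_ONCE(req->deadline, expiry);	/* plain store; the scan may read it racily */

	expiry = blk_rq_timeout(round_jiffies_up(expiry));	/* clamp and round up to a second */
	if (!timer_pending(&q->timeout) ||
	    time_before(expiry, q->timeout.expires))
		mod_timer(&q->timeout, expiry);	/* one timer covers every outstanding request */

and blk_abort_request() simply rewinds the deadline to "now" and kicks q->timeout_work so the scan runs immediately.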
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 8ac93fcbaa2e..f0c56649775f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c | |||
@@ -489,31 +489,21 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) | |||
489 | } | 489 | } |
490 | 490 | ||
491 | struct wbt_wait_data { | 491 | struct wbt_wait_data { |
492 | struct wait_queue_entry wq; | ||
493 | struct task_struct *task; | ||
494 | struct rq_wb *rwb; | 492 | struct rq_wb *rwb; |
495 | struct rq_wait *rqw; | 493 | enum wbt_flags wb_acct; |
496 | unsigned long rw; | 494 | unsigned long rw; |
497 | bool got_token; | ||
498 | }; | 495 | }; |
499 | 496 | ||
500 | static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, | 497 | static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) |
501 | int wake_flags, void *key) | ||
502 | { | 498 | { |
503 | struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, | 499 | struct wbt_wait_data *data = private_data; |
504 | wq); | 500 | return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw)); |
505 | 501 | } | |
506 | /* | ||
507 | * If we fail to get a budget, return -1 to interrupt the wake up | ||
508 | * loop in __wake_up_common. | ||
509 | */ | ||
510 | if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw))) | ||
511 | return -1; | ||
512 | 502 | ||
513 | data->got_token = true; | 503 | static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) |
514 | list_del_init(&curr->entry); | 504 | { |
515 | wake_up_process(data->task); | 505 | struct wbt_wait_data *data = private_data; |
516 | return 1; | 506 | wbt_rqw_done(data->rwb, rqw, data->wb_acct); |
517 | } | 507 | } |
518 | 508 | ||
519 | /* | 509 | /* |
@@ -521,57 +511,16 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, | |||
521 | * the timer to kick off queuing again. | 511 | * the timer to kick off queuing again. |
522 | */ | 512 | */ |
523 | static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, | 513 | static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, |
524 | unsigned long rw, spinlock_t *lock) | 514 | unsigned long rw) |
525 | __releases(lock) | ||
526 | __acquires(lock) | ||
527 | { | 515 | { |
528 | struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); | 516 | struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); |
529 | struct wbt_wait_data data = { | 517 | struct wbt_wait_data data = { |
530 | .wq = { | ||
531 | .func = wbt_wake_function, | ||
532 | .entry = LIST_HEAD_INIT(data.wq.entry), | ||
533 | }, | ||
534 | .task = current, | ||
535 | .rwb = rwb, | 518 | .rwb = rwb, |
536 | .rqw = rqw, | 519 | .wb_acct = wb_acct, |
537 | .rw = rw, | 520 | .rw = rw, |
538 | }; | 521 | }; |
539 | bool has_sleeper; | ||
540 | |||
541 | has_sleeper = wq_has_sleeper(&rqw->wait); | ||
542 | if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) | ||
543 | return; | ||
544 | 522 | ||
545 | prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); | 523 | rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); |
546 | do { | ||
547 | if (data.got_token) | ||
548 | break; | ||
549 | |||
550 | if (!has_sleeper && | ||
551 | rq_wait_inc_below(rqw, get_limit(rwb, rw))) { | ||
552 | finish_wait(&rqw->wait, &data.wq); | ||
553 | |||
554 | /* | ||
555 | * We raced with wbt_wake_function() getting a token, | ||
556 | * which means we now have two. Put our local token | ||
557 | * and wake anyone else potentially waiting for one. | ||
558 | */ | ||
559 | if (data.got_token) | ||
560 | wbt_rqw_done(rwb, rqw, wb_acct); | ||
561 | break; | ||
562 | } | ||
563 | |||
564 | if (lock) { | ||
565 | spin_unlock_irq(lock); | ||
566 | io_schedule(); | ||
567 | spin_lock_irq(lock); | ||
568 | } else | ||
569 | io_schedule(); | ||
570 | |||
571 | has_sleeper = false; | ||
572 | } while (1); | ||
573 | |||
574 | finish_wait(&rqw->wait, &data.wq); | ||
575 | } | 524 | } |
576 | 525 | ||
577 | static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) | 526 | static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) |
@@ -624,7 +573,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) | |||
624 | * in an irq held spinlock, if it holds one when calling this function. | 573 | * in an irq held spinlock, if it holds one when calling this function. |
625 | * If we do sleep, we'll release and re-grab it. | 574 | * If we do sleep, we'll release and re-grab it. |
626 | */ | 575 | */ |
627 | static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) | 576 | static void wbt_wait(struct rq_qos *rqos, struct bio *bio) |
628 | { | 577 | { |
629 | struct rq_wb *rwb = RQWB(rqos); | 578 | struct rq_wb *rwb = RQWB(rqos); |
630 | enum wbt_flags flags; | 579 | enum wbt_flags flags; |
@@ -636,7 +585,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) | |||
636 | return; | 585 | return; |
637 | } | 586 | } |
638 | 587 | ||
639 | __wbt_wait(rwb, flags, bio->bi_opf, lock); | 588 | __wbt_wait(rwb, flags, bio->bi_opf); |
640 | 589 | ||
641 | if (!blk_stat_is_active(rwb->cb)) | 590 | if (!blk_stat_is_active(rwb->cb)) |
642 | rwb_arm_timer(rwb); | 591 | rwb_arm_timer(rwb); |
@@ -709,8 +658,7 @@ void wbt_enable_default(struct request_queue *q) | |||
709 | if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) | 658 | if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) |
710 | return; | 659 | return; |
711 | 660 | ||
712 | if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || | 661 | if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) |
713 | (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ))) | ||
714 | wbt_init(q); | 662 | wbt_init(q); |
715 | } | 663 | } |
716 | EXPORT_SYMBOL_GPL(wbt_enable_default); | 664 | EXPORT_SYMBOL_GPL(wbt_enable_default); |
@@ -760,11 +708,100 @@ void wbt_disable_default(struct request_queue *q) | |||
760 | if (!rqos) | 708 | if (!rqos) |
761 | return; | 709 | return; |
762 | rwb = RQWB(rqos); | 710 | rwb = RQWB(rqos); |
763 | if (rwb->enable_state == WBT_STATE_ON_DEFAULT) | 711 | if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { |
712 | blk_stat_deactivate(rwb->cb); | ||
764 | rwb->wb_normal = 0; | 713 | rwb->wb_normal = 0; |
714 | } | ||
765 | } | 715 | } |
766 | EXPORT_SYMBOL_GPL(wbt_disable_default); | 716 | EXPORT_SYMBOL_GPL(wbt_disable_default); |
767 | 717 | ||
718 | #ifdef CONFIG_BLK_DEBUG_FS | ||
719 | static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) | ||
720 | { | ||
721 | struct rq_qos *rqos = data; | ||
722 | struct rq_wb *rwb = RQWB(rqos); | ||
723 | |||
724 | seq_printf(m, "%llu\n", rwb->cur_win_nsec); | ||
725 | return 0; | ||
726 | } | ||
727 | |||
728 | static int wbt_enabled_show(void *data, struct seq_file *m) | ||
729 | { | ||
730 | struct rq_qos *rqos = data; | ||
731 | struct rq_wb *rwb = RQWB(rqos); | ||
732 | |||
733 | seq_printf(m, "%d\n", rwb->enable_state); | ||
734 | return 0; | ||
735 | } | ||
736 | |||
737 | static int wbt_id_show(void *data, struct seq_file *m) | ||
738 | { | ||
739 | struct rq_qos *rqos = data; | ||
740 | |||
741 | seq_printf(m, "%u\n", rqos->id); | ||
742 | return 0; | ||
743 | } | ||
744 | |||
745 | static int wbt_inflight_show(void *data, struct seq_file *m) | ||
746 | { | ||
747 | struct rq_qos *rqos = data; | ||
748 | struct rq_wb *rwb = RQWB(rqos); | ||
749 | int i; | ||
750 | |||
751 | for (i = 0; i < WBT_NUM_RWQ; i++) | ||
752 | seq_printf(m, "%d: inflight %d\n", i, | ||
753 | atomic_read(&rwb->rq_wait[i].inflight)); | ||
754 | return 0; | ||
755 | } | ||
756 | |||
757 | static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) | ||
758 | { | ||
759 | struct rq_qos *rqos = data; | ||
760 | struct rq_wb *rwb = RQWB(rqos); | ||
761 | |||
762 | seq_printf(m, "%lu\n", rwb->min_lat_nsec); | ||
763 | return 0; | ||
764 | } | ||
765 | |||
766 | static int wbt_unknown_cnt_show(void *data, struct seq_file *m) | ||
767 | { | ||
768 | struct rq_qos *rqos = data; | ||
769 | struct rq_wb *rwb = RQWB(rqos); | ||
770 | |||
771 | seq_printf(m, "%u\n", rwb->unknown_cnt); | ||
772 | return 0; | ||
773 | } | ||
774 | |||
775 | static int wbt_normal_show(void *data, struct seq_file *m) | ||
776 | { | ||
777 | struct rq_qos *rqos = data; | ||
778 | struct rq_wb *rwb = RQWB(rqos); | ||
779 | |||
780 | seq_printf(m, "%u\n", rwb->wb_normal); | ||
781 | return 0; | ||
782 | } | ||
783 | |||
784 | static int wbt_background_show(void *data, struct seq_file *m) | ||
785 | { | ||
786 | struct rq_qos *rqos = data; | ||
787 | struct rq_wb *rwb = RQWB(rqos); | ||
788 | |||
789 | seq_printf(m, "%u\n", rwb->wb_background); | ||
790 | return 0; | ||
791 | } | ||
792 | |||
793 | static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { | ||
794 | {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, | ||
795 | {"enabled", 0400, wbt_enabled_show}, | ||
796 | {"id", 0400, wbt_id_show}, | ||
797 | {"inflight", 0400, wbt_inflight_show}, | ||
798 | {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, | ||
799 | {"unknown_cnt", 0400, wbt_unknown_cnt_show}, | ||
800 | {"wb_normal", 0400, wbt_normal_show}, | ||
801 | {"wb_background", 0400, wbt_background_show}, | ||
802 | {}, | ||
803 | }; | ||
804 | #endif | ||
768 | 805 | ||
769 | static struct rq_qos_ops wbt_rqos_ops = { | 806 | static struct rq_qos_ops wbt_rqos_ops = { |
770 | .throttle = wbt_wait, | 807 | .throttle = wbt_wait, |
@@ -774,6 +811,9 @@ static struct rq_qos_ops wbt_rqos_ops = { | |||
774 | .done = wbt_done, | 811 | .done = wbt_done, |
775 | .cleanup = wbt_cleanup, | 812 | .cleanup = wbt_cleanup, |
776 | .exit = wbt_exit, | 813 | .exit = wbt_exit, |
814 | #ifdef CONFIG_BLK_DEBUG_FS | ||
815 | .debugfs_attrs = wbt_debugfs_attrs, | ||
816 | #endif | ||
777 | }; | 817 | }; |
778 | 818 | ||
779 | int wbt_init(struct request_queue *q) | 819 | int wbt_init(struct request_queue *q) |
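The writeback-throttling changes follow the same theme: the open-coded wait loop (prepare_to_wait_exclusive() plus a custom wake function handing off the "token") moves into the shared rq_qos_wait() helper, with wbt supplying only two callbacks — one that tries to take an in-flight slot below the current limit, one that gives the slot back if the waiter is cleaned up — and the new debugfs attributes hang off rq_qos_ops so every rq_qos policy can export its state the same way. A hedged sketch of how another policy might use the same helper (everything except rq_qos_wait(), rq_wait_inc_below() and struct rq_wait is made up for illustration):

	struct my_wait_data {			/* illustrative private data */
		unsigned int limit;
	};

	static bool my_inflight_cb(struct rq_wait *rqw, void *private_data)
	{
		struct my_wait_data *d = private_data;

		/* Try to take one in-flight slot; false means "keep waiting". */
		return rq_wait_inc_below(rqw, d->limit);
	}

	static void my_cleanup_cb(struct rq_wait *rqw, void *private_data)
	{
		/* Give the slot back if the waiter bails out after acquiring it. */
		atomic_dec(&rqw->inflight);
		wake_up(&rqw->wait);
	}

	static void my_throttle(struct rq_wait *rqw, unsigned int limit)
	{
		struct my_wait_data data = { .limit = limit };

		rq_qos_wait(rqw, &data, my_inflight_cb, my_cleanup_cb);
	}

wbt's own cleanup callback is wbt_rqw_done(), as shown above, which folds the released slot back into its accounting.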
diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 13ba2011a306..2d98803faec2 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c | |||
@@ -378,7 +378,7 @@ static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) | |||
378 | struct page *page; | 378 | struct page *page; |
379 | int order; | 379 | int order; |
380 | 380 | ||
381 | for (order = get_order(size); order > 0; order--) { | 381 | for (order = get_order(size); order >= 0; order--) { |
382 | page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); | 382 | page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); |
383 | if (page) { | 383 | if (page) { |
384 | *nr_zones = min_t(unsigned int, *nr_zones, | 384 | *nr_zones = min_t(unsigned int, *nr_zones, |
@@ -421,7 +421,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) | |||
421 | * BIO based queues do not use a scheduler so only q->nr_zones | 421 | * BIO based queues do not use a scheduler so only q->nr_zones |
422 | * needs to be updated so that the sysfs exposed value is correct. | 422 | * needs to be updated so that the sysfs exposed value is correct. |
423 | */ | 423 | */ |
424 | if (!queue_is_rq_based(q)) { | 424 | if (!queue_is_mq(q)) { |
425 | q->nr_zones = nr_zones; | 425 | q->nr_zones = nr_zones; |
426 | return 0; | 426 | return 0; |
427 | } | 427 | } |
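The one-character change in blk_alloc_zones() is a real fix rather than churn: the descending-order fallback loop previously stopped before trying order 0, so on a fragmented node the zone report buffer allocation could fail even though a single page was available. With order >= 0 an order-0 page is the final fallback and *nr_zones is trimmed to whatever fits in it; assuming 4 KiB pages and a 64-byte struct blk_zone, that still leaves room for 4096 / 64 = 64 zones per report. The queue_is_rq_based() to queue_is_mq() change in blk_revalidate_disk_zones() is the same rename seen throughout this series.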
diff --git a/block/blk.h b/block/blk.h index a1841b8ff129..848278c52030 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -7,12 +7,6 @@ | |||
7 | #include <xen/xen.h> | 7 | #include <xen/xen.h> |
8 | #include "blk-mq.h" | 8 | #include "blk-mq.h" |
9 | 9 | ||
10 | /* Amount of time in which a process may batch requests */ | ||
11 | #define BLK_BATCH_TIME (HZ/50UL) | ||
12 | |||
13 | /* Number of requests a "batching" process may submit */ | ||
14 | #define BLK_BATCH_REQ 32 | ||
15 | |||
16 | /* Max future timer expiry for timeouts */ | 10 | /* Max future timer expiry for timeouts */ |
17 | #define BLK_MAX_TIMEOUT (5 * HZ) | 11 | #define BLK_MAX_TIMEOUT (5 * HZ) |
18 | 12 | ||
@@ -38,85 +32,13 @@ struct blk_flush_queue { | |||
38 | }; | 32 | }; |
39 | 33 | ||
40 | extern struct kmem_cache *blk_requestq_cachep; | 34 | extern struct kmem_cache *blk_requestq_cachep; |
41 | extern struct kmem_cache *request_cachep; | ||
42 | extern struct kobj_type blk_queue_ktype; | 35 | extern struct kobj_type blk_queue_ktype; |
43 | extern struct ida blk_queue_ida; | 36 | extern struct ida blk_queue_ida; |
44 | 37 | ||
45 | /* | 38 | static inline struct blk_flush_queue * |
46 | * @q->queue_lock is set while a queue is being initialized. Since we know | 39 | blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) |
47 | * that no other threads access the queue object before @q->queue_lock has | ||
48 | * been set, it is safe to manipulate queue flags without holding the | ||
49 | * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and | ||
50 | * blk_init_allocated_queue(). | ||
51 | */ | ||
52 | static inline void queue_lockdep_assert_held(struct request_queue *q) | ||
53 | { | ||
54 | if (q->queue_lock) | ||
55 | lockdep_assert_held(q->queue_lock); | ||
56 | } | ||
57 | |||
58 | static inline void queue_flag_set_unlocked(unsigned int flag, | ||
59 | struct request_queue *q) | ||
60 | { | ||
61 | if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && | ||
62 | kref_read(&q->kobj.kref)) | ||
63 | lockdep_assert_held(q->queue_lock); | ||
64 | __set_bit(flag, &q->queue_flags); | ||
65 | } | ||
66 | |||
67 | static inline void queue_flag_clear_unlocked(unsigned int flag, | ||
68 | struct request_queue *q) | ||
69 | { | ||
70 | if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && | ||
71 | kref_read(&q->kobj.kref)) | ||
72 | lockdep_assert_held(q->queue_lock); | ||
73 | __clear_bit(flag, &q->queue_flags); | ||
74 | } | ||
75 | |||
76 | static inline int queue_flag_test_and_clear(unsigned int flag, | ||
77 | struct request_queue *q) | ||
78 | { | ||
79 | queue_lockdep_assert_held(q); | ||
80 | |||
81 | if (test_bit(flag, &q->queue_flags)) { | ||
82 | __clear_bit(flag, &q->queue_flags); | ||
83 | return 1; | ||
84 | } | ||
85 | |||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | static inline int queue_flag_test_and_set(unsigned int flag, | ||
90 | struct request_queue *q) | ||
91 | { | 40 | { |
92 | queue_lockdep_assert_held(q); | 41 | return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq; |
93 | |||
94 | if (!test_bit(flag, &q->queue_flags)) { | ||
95 | __set_bit(flag, &q->queue_flags); | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | static inline void queue_flag_set(unsigned int flag, struct request_queue *q) | ||
103 | { | ||
104 | queue_lockdep_assert_held(q); | ||
105 | __set_bit(flag, &q->queue_flags); | ||
106 | } | ||
107 | |||
108 | static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) | ||
109 | { | ||
110 | queue_lockdep_assert_held(q); | ||
111 | __clear_bit(flag, &q->queue_flags); | ||
112 | } | ||
113 | |||
114 | static inline struct blk_flush_queue *blk_get_flush_queue( | ||
115 | struct request_queue *q, struct blk_mq_ctx *ctx) | ||
116 | { | ||
117 | if (q->mq_ops) | ||
118 | return blk_mq_map_queue(q, ctx->cpu)->fq; | ||
119 | return q->fq; | ||
120 | } | 42 | } |
121 | 43 | ||
122 | static inline void __blk_get_queue(struct request_queue *q) | 44 | static inline void __blk_get_queue(struct request_queue *q) |
@@ -128,15 +50,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, | |||
128 | int node, int cmd_size, gfp_t flags); | 50 | int node, int cmd_size, gfp_t flags); |
129 | void blk_free_flush_queue(struct blk_flush_queue *q); | 51 | void blk_free_flush_queue(struct blk_flush_queue *q); |
130 | 52 | ||
131 | int blk_init_rl(struct request_list *rl, struct request_queue *q, | ||
132 | gfp_t gfp_mask); | ||
133 | void blk_exit_rl(struct request_queue *q, struct request_list *rl); | ||
134 | void blk_exit_queue(struct request_queue *q); | 53 | void blk_exit_queue(struct request_queue *q); |
135 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 54 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
136 | struct bio *bio); | 55 | struct bio *bio); |
137 | void blk_queue_bypass_start(struct request_queue *q); | ||
138 | void blk_queue_bypass_end(struct request_queue *q); | ||
139 | void __blk_queue_free_tags(struct request_queue *q); | ||
140 | void blk_freeze_queue(struct request_queue *q); | 56 | void blk_freeze_queue(struct request_queue *q); |
141 | 57 | ||
142 | static inline void blk_queue_enter_live(struct request_queue *q) | 58 | static inline void blk_queue_enter_live(struct request_queue *q) |
@@ -169,7 +85,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, | |||
169 | static inline bool __bvec_gap_to_prev(struct request_queue *q, | 85 | static inline bool __bvec_gap_to_prev(struct request_queue *q, |
170 | struct bio_vec *bprv, unsigned int offset) | 86 | struct bio_vec *bprv, unsigned int offset) |
171 | { | 87 | { |
172 | return offset || | 88 | return (offset & queue_virt_boundary(q)) || |
173 | ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); | 89 | ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); |
174 | } | 90 | } |
175 | 91 | ||
@@ -235,11 +151,8 @@ static inline bool bio_integrity_endio(struct bio *bio) | |||
235 | } | 151 | } |
236 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ | 152 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
237 | 153 | ||
238 | void blk_timeout_work(struct work_struct *work); | ||
239 | unsigned long blk_rq_timeout(unsigned long timeout); | 154 | unsigned long blk_rq_timeout(unsigned long timeout); |
240 | void blk_add_timer(struct request *req); | 155 | void blk_add_timer(struct request *req); |
241 | void blk_delete_timer(struct request *); | ||
242 | |||
243 | 156 | ||
244 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, | 157 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, |
245 | struct bio *bio); | 158 | struct bio *bio); |
@@ -248,58 +161,19 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | |||
248 | bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, | 161 | bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, |
249 | struct bio *bio); | 162 | struct bio *bio); |
250 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | 163 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, |
251 | unsigned int *request_count, | ||
252 | struct request **same_queue_rq); | 164 | struct request **same_queue_rq); |
253 | unsigned int blk_plug_queued_count(struct request_queue *q); | ||
254 | 165 | ||
255 | void blk_account_io_start(struct request *req, bool new_io); | 166 | void blk_account_io_start(struct request *req, bool new_io); |
256 | void blk_account_io_completion(struct request *req, unsigned int bytes); | 167 | void blk_account_io_completion(struct request *req, unsigned int bytes); |
257 | void blk_account_io_done(struct request *req, u64 now); | 168 | void blk_account_io_done(struct request *req, u64 now); |
258 | 169 | ||
259 | /* | 170 | /* |
260 | * EH timer and IO completion will both attempt to 'grab' the request, make | ||
261 | * sure that only one of them succeeds. Steal the bottom bit of the | ||
262 | * __deadline field for this. | ||
263 | */ | ||
264 | static inline int blk_mark_rq_complete(struct request *rq) | ||
265 | { | ||
266 | return test_and_set_bit(0, &rq->__deadline); | ||
267 | } | ||
268 | |||
269 | static inline void blk_clear_rq_complete(struct request *rq) | ||
270 | { | ||
271 | clear_bit(0, &rq->__deadline); | ||
272 | } | ||
273 | |||
274 | static inline bool blk_rq_is_complete(struct request *rq) | ||
275 | { | ||
276 | return test_bit(0, &rq->__deadline); | ||
277 | } | ||
278 | |||
279 | /* | ||
280 | * Internal elevator interface | 171 | * Internal elevator interface |
281 | */ | 172 | */ |
282 | #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) | 173 | #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) |
283 | 174 | ||
284 | void blk_insert_flush(struct request *rq); | 175 | void blk_insert_flush(struct request *rq); |
285 | 176 | ||
286 | static inline void elv_activate_rq(struct request_queue *q, struct request *rq) | ||
287 | { | ||
288 | struct elevator_queue *e = q->elevator; | ||
289 | |||
290 | if (e->type->ops.sq.elevator_activate_req_fn) | ||
291 | e->type->ops.sq.elevator_activate_req_fn(q, rq); | ||
292 | } | ||
293 | |||
294 | static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) | ||
295 | { | ||
296 | struct elevator_queue *e = q->elevator; | ||
297 | |||
298 | if (e->type->ops.sq.elevator_deactivate_req_fn) | ||
299 | e->type->ops.sq.elevator_deactivate_req_fn(q, rq); | ||
300 | } | ||
301 | |||
302 | int elevator_init(struct request_queue *); | ||
303 | int elevator_init_mq(struct request_queue *q); | 177 | int elevator_init_mq(struct request_queue *q); |
304 | int elevator_switch_mq(struct request_queue *q, | 178 | int elevator_switch_mq(struct request_queue *q, |
305 | struct elevator_type *new_e); | 179 | struct elevator_type *new_e); |
@@ -334,31 +208,8 @@ void blk_rq_set_mixed_merge(struct request *rq); | |||
334 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio); | 208 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio); |
335 | enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); | 209 | enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); |
336 | 210 | ||
337 | void blk_queue_congestion_threshold(struct request_queue *q); | ||
338 | |||
339 | int blk_dev_init(void); | 211 | int blk_dev_init(void); |
340 | 212 | ||
341 | |||
342 | /* | ||
343 | * Return the threshold (number of used requests) at which the queue is | ||
344 | * considered to be congested. It includes a little hysteresis to keep the | ||
345 | * context switch rate down. | ||
346 | */ | ||
347 | static inline int queue_congestion_on_threshold(struct request_queue *q) | ||
348 | { | ||
349 | return q->nr_congestion_on; | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * The threshold at which a queue is considered to be uncongested | ||
354 | */ | ||
355 | static inline int queue_congestion_off_threshold(struct request_queue *q) | ||
356 | { | ||
357 | return q->nr_congestion_off; | ||
358 | } | ||
359 | |||
360 | extern int blk_update_nr_requests(struct request_queue *, unsigned int); | ||
361 | |||
362 | /* | 213 | /* |
363 | * Contribute to IO statistics IFF: | 214 | * Contribute to IO statistics IFF: |
364 | * | 215 | * |
@@ -381,18 +232,13 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) | |||
381 | } | 232 | } |
382 | 233 | ||
383 | /* | 234 | /* |
384 | * Steal a bit from this field for legacy IO path atomic IO marking. Note that | 235 | * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size |
385 | * setting the deadline clears the bottom bit, potentially clearing the | 236 | * is defined as 'unsigned int'; meanwhile it has to be aligned to the logical |
386 | * completed bit. The user has to be OK with this (current ones are fine). | 237 | * block size, which is the minimum unit accepted by hardware. |
387 | */ | 238 | */ |
388 | static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) | 239 | static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) |
389 | { | ||
390 | rq->__deadline = time & ~0x1UL; | ||
391 | } | ||
392 | |||
393 | static inline unsigned long blk_rq_deadline(struct request *rq) | ||
394 | { | 240 | { |
395 | return rq->__deadline & ~0x1UL; | 241 | return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; |
396 | } | 242 | } |
397 | 243 | ||
398 | /* | 244 | /* |
@@ -407,22 +253,6 @@ void ioc_clear_queue(struct request_queue *q); | |||
407 | int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); | 253 | int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); |
408 | 254 | ||
409 | /** | 255 | /** |
410 | * rq_ioc - determine io_context for request allocation | ||
411 | * @bio: request being allocated is for this bio (can be %NULL) | ||
412 | * | ||
413 | * Determine io_context to use for request allocation for @bio. May return | ||
414 | * %NULL if %current->io_context doesn't exist. | ||
415 | */ | ||
416 | static inline struct io_context *rq_ioc(struct bio *bio) | ||
417 | { | ||
418 | #ifdef CONFIG_BLK_CGROUP | ||
419 | if (bio && bio->bi_ioc) | ||
420 | return bio->bi_ioc; | ||
421 | #endif | ||
422 | return current->io_context; | ||
423 | } | ||
424 | |||
425 | /** | ||
426 | * create_io_context - try to create task->io_context | 256 | * create_io_context - try to create task->io_context |
427 | * @gfp_mask: allocation mask | 257 | * @gfp_mask: allocation mask |
428 | * @node: allocation node | 258 | * @node: allocation node |
@@ -480,8 +310,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) | |||
480 | } | 310 | } |
481 | #endif /* CONFIG_BOUNCE */ | 311 | #endif /* CONFIG_BOUNCE */ |
482 | 312 | ||
483 | extern void blk_drain_queue(struct request_queue *q); | ||
484 | |||
485 | #ifdef CONFIG_BLK_CGROUP_IOLATENCY | 313 | #ifdef CONFIG_BLK_CGROUP_IOLATENCY |
486 | extern int blk_iolatency_init(struct request_queue *q); | 314 | extern int blk_iolatency_init(struct request_queue *q); |
487 | #else | 315 | #else |
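Two of the blk.h changes are worth calling out. __bvec_gap_to_prev() now masks the offset with queue_virt_boundary(q), so an offset that is aligned to the device's virt boundary no longer counts as a gap (the old "return offset ||" treated any non-zero offset as one). And the new bio_allowed_max_sectors() helper caps a single bio at UINT_MAX bytes rounded down to the logical block size, since bvec_iter.bi_size is an unsigned int. A standalone check of that arithmetic, assuming a 4096-byte logical block size for the example:

	#include <limits.h>
	#include <stdio.h>

	/* round_down() for a power-of-two alignment, as the kernel macro does. */
	static unsigned int round_down_pow2(unsigned int x, unsigned int align)
	{
		return x & ~(align - 1);
	}

	int main(void)
	{
		unsigned int lbs = 4096;	/* example logical block size */
		unsigned int max_bytes = round_down_pow2(UINT_MAX, lbs);

		printf("max bio: %u bytes = %u sectors\n", max_bytes, max_bytes >> 9);
		/* prints: max bio: 4294963200 bytes = 8388600 sectors */
		return 0;
	}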
diff --git a/block/bounce.c b/block/bounce.c index 36869afc258c..ffb9e9ecfa7e 100644 --- a/block/bounce.c +++ b/block/bounce.c | |||
@@ -248,6 +248,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, | |||
248 | return NULL; | 248 | return NULL; |
249 | bio->bi_disk = bio_src->bi_disk; | 249 | bio->bi_disk = bio_src->bi_disk; |
250 | bio->bi_opf = bio_src->bi_opf; | 250 | bio->bi_opf = bio_src->bi_opf; |
251 | bio->bi_ioprio = bio_src->bi_ioprio; | ||
251 | bio->bi_write_hint = bio_src->bi_write_hint; | 252 | bio->bi_write_hint = bio_src->bi_write_hint; |
252 | bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; | 253 | bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; |
253 | bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; | 254 | bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; |
@@ -276,7 +277,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, | |||
276 | } | 277 | } |
277 | } | 278 | } |
278 | 279 | ||
279 | bio_clone_blkcg_association(bio, bio_src); | 280 | bio_clone_blkg_association(bio, bio_src); |
281 | blkcg_bio_issue_init(bio); | ||
280 | 282 | ||
281 | return bio; | 283 | return bio; |
282 | } | 284 | } |
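The bounce path clones a bio by hand rather than via bio_clone_fast(), so every field that matters for accounting has to be copied explicitly; this hunk adds the I/O priority and switches the cgroup association to the blkg-based helper. A condensed view of the copy, gathering the same calls shown above for readability (not a drop-in function):

	bio->bi_disk           = bio_src->bi_disk;
	bio->bi_opf            = bio_src->bi_opf;
	bio->bi_ioprio         = bio_src->bi_ioprio;	/* newly propagated */
	bio->bi_write_hint     = bio_src->bi_write_hint;
	bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
	bio->bi_iter.bi_size   = bio_src->bi_iter.bi_size;

	bio_clone_blkg_association(bio, bio_src);	/* replaces bio_clone_blkcg_association() */
	blkcg_bio_issue_init(bio);			/* set up issue stats for the clone */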
diff --git a/block/bsg-lib.c b/block/bsg-lib.c index f3501cdaf1a6..192129856342 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c | |||
@@ -21,7 +21,7 @@ | |||
21 | * | 21 | * |
22 | */ | 22 | */ |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/blkdev.h> | 24 | #include <linux/blk-mq.h> |
25 | #include <linux/delay.h> | 25 | #include <linux/delay.h> |
26 | #include <linux/scatterlist.h> | 26 | #include <linux/scatterlist.h> |
27 | #include <linux/bsg-lib.h> | 27 | #include <linux/bsg-lib.h> |
@@ -31,6 +31,12 @@ | |||
31 | 31 | ||
32 | #define uptr64(val) ((void __user *)(uintptr_t)(val)) | 32 | #define uptr64(val) ((void __user *)(uintptr_t)(val)) |
33 | 33 | ||
34 | struct bsg_set { | ||
35 | struct blk_mq_tag_set tag_set; | ||
36 | bsg_job_fn *job_fn; | ||
37 | bsg_timeout_fn *timeout_fn; | ||
38 | }; | ||
39 | |||
34 | static int bsg_transport_check_proto(struct sg_io_v4 *hdr) | 40 | static int bsg_transport_check_proto(struct sg_io_v4 *hdr) |
35 | { | 41 | { |
36 | if (hdr->protocol != BSG_PROTOCOL_SCSI || | 42 | if (hdr->protocol != BSG_PROTOCOL_SCSI || |
@@ -129,7 +135,7 @@ static void bsg_teardown_job(struct kref *kref) | |||
129 | kfree(job->request_payload.sg_list); | 135 | kfree(job->request_payload.sg_list); |
130 | kfree(job->reply_payload.sg_list); | 136 | kfree(job->reply_payload.sg_list); |
131 | 137 | ||
132 | blk_end_request_all(rq, BLK_STS_OK); | 138 | blk_mq_end_request(rq, BLK_STS_OK); |
133 | } | 139 | } |
134 | 140 | ||
135 | void bsg_job_put(struct bsg_job *job) | 141 | void bsg_job_put(struct bsg_job *job) |
@@ -157,15 +163,15 @@ void bsg_job_done(struct bsg_job *job, int result, | |||
157 | { | 163 | { |
158 | job->result = result; | 164 | job->result = result; |
159 | job->reply_payload_rcv_len = reply_payload_rcv_len; | 165 | job->reply_payload_rcv_len = reply_payload_rcv_len; |
160 | blk_complete_request(blk_mq_rq_from_pdu(job)); | 166 | blk_mq_complete_request(blk_mq_rq_from_pdu(job)); |
161 | } | 167 | } |
162 | EXPORT_SYMBOL_GPL(bsg_job_done); | 168 | EXPORT_SYMBOL_GPL(bsg_job_done); |
163 | 169 | ||
164 | /** | 170 | /** |
165 | * bsg_softirq_done - softirq done routine for destroying the bsg requests | 171 | * bsg_complete - softirq done routine for destroying the bsg requests |
166 | * @rq: BSG request that holds the job to be destroyed | 172 | * @rq: BSG request that holds the job to be destroyed |
167 | */ | 173 | */ |
168 | static void bsg_softirq_done(struct request *rq) | 174 | static void bsg_complete(struct request *rq) |
169 | { | 175 | { |
170 | struct bsg_job *job = blk_mq_rq_to_pdu(rq); | 176 | struct bsg_job *job = blk_mq_rq_to_pdu(rq); |
171 | 177 | ||
@@ -224,54 +230,48 @@ failjob_rls_job: | |||
224 | } | 230 | } |
225 | 231 | ||
226 | /** | 232 | /** |
227 | * bsg_request_fn - generic handler for bsg requests | 233 | * bsg_queue_rq - generic handler for bsg requests |
228 | * @q: request queue to manage | 234 | * @hctx: hardware queue |
235 | * @bd: queue data | ||
229 | * | 236 | * |
230 | * On error the create_bsg_job function should return a -Exyz error value | 237 | * On error the create_bsg_job function should return a -Exyz error value |
231 | * that will be set to ->result. | 238 | * that will be set to ->result. |
232 | * | 239 | * |
233 | * Drivers/subsys should pass this to the queue init function. | 240 | * Drivers/subsys should pass this to the queue init function. |
234 | */ | 241 | */ |
235 | static void bsg_request_fn(struct request_queue *q) | 242 | static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, |
236 | __releases(q->queue_lock) | 243 | const struct blk_mq_queue_data *bd) |
237 | __acquires(q->queue_lock) | ||
238 | { | 244 | { |
245 | struct request_queue *q = hctx->queue; | ||
239 | struct device *dev = q->queuedata; | 246 | struct device *dev = q->queuedata; |
240 | struct request *req; | 247 | struct request *req = bd->rq; |
248 | struct bsg_set *bset = | ||
249 | container_of(q->tag_set, struct bsg_set, tag_set); | ||
241 | int ret; | 250 | int ret; |
242 | 251 | ||
252 | blk_mq_start_request(req); | ||
253 | |||
243 | if (!get_device(dev)) | 254 | if (!get_device(dev)) |
244 | return; | 255 | return BLK_STS_IOERR; |
245 | 256 | ||
246 | while (1) { | 257 | if (!bsg_prepare_job(dev, req)) |
247 | req = blk_fetch_request(q); | 258 | return BLK_STS_IOERR; |
248 | if (!req) | 259 | |
249 | break; | 260 | ret = bset->job_fn(blk_mq_rq_to_pdu(req)); |
250 | spin_unlock_irq(q->queue_lock); | 261 | if (ret) |
251 | 262 | return BLK_STS_IOERR; | |
252 | if (!bsg_prepare_job(dev, req)) { | ||
253 | blk_end_request_all(req, BLK_STS_OK); | ||
254 | spin_lock_irq(q->queue_lock); | ||
255 | continue; | ||
256 | } | ||
257 | |||
258 | ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req)); | ||
259 | spin_lock_irq(q->queue_lock); | ||
260 | if (ret) | ||
261 | break; | ||
262 | } | ||
263 | 263 | ||
264 | spin_unlock_irq(q->queue_lock); | ||
265 | put_device(dev); | 264 | put_device(dev); |
266 | spin_lock_irq(q->queue_lock); | 265 | return BLK_STS_OK; |
267 | } | 266 | } |
268 | 267 | ||
269 | /* called right after the request is allocated for the request_queue */ | 268 | /* called right after the request is allocated for the request_queue */ |
270 | static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) | 269 | static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, |
270 | unsigned int hctx_idx, unsigned int numa_node) | ||
271 | { | 271 | { |
272 | struct bsg_job *job = blk_mq_rq_to_pdu(req); | 272 | struct bsg_job *job = blk_mq_rq_to_pdu(req); |
273 | 273 | ||
274 | job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); | 274 | job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); |
275 | if (!job->reply) | 275 | if (!job->reply) |
276 | return -ENOMEM; | 276 | return -ENOMEM; |
277 | return 0; | 277 | return 0; |
@@ -289,13 +289,47 @@ static void bsg_initialize_rq(struct request *req) | |||
289 | job->dd_data = job + 1; | 289 | job->dd_data = job + 1; |
290 | } | 290 | } |
291 | 291 | ||
292 | static void bsg_exit_rq(struct request_queue *q, struct request *req) | 292 | static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, |
293 | unsigned int hctx_idx) | ||
293 | { | 294 | { |
294 | struct bsg_job *job = blk_mq_rq_to_pdu(req); | 295 | struct bsg_job *job = blk_mq_rq_to_pdu(req); |
295 | 296 | ||
296 | kfree(job->reply); | 297 | kfree(job->reply); |
297 | } | 298 | } |
298 | 299 | ||
300 | void bsg_remove_queue(struct request_queue *q) | ||
301 | { | ||
302 | if (q) { | ||
303 | struct bsg_set *bset = | ||
304 | container_of(q->tag_set, struct bsg_set, tag_set); | ||
305 | |||
306 | bsg_unregister_queue(q); | ||
307 | blk_cleanup_queue(q); | ||
308 | blk_mq_free_tag_set(&bset->tag_set); | ||
309 | kfree(bset); | ||
310 | } | ||
311 | } | ||
312 | EXPORT_SYMBOL_GPL(bsg_remove_queue); | ||
313 | |||
314 | static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved) | ||
315 | { | ||
316 | struct bsg_set *bset = | ||
317 | container_of(rq->q->tag_set, struct bsg_set, tag_set); | ||
318 | |||
319 | if (!bset->timeout_fn) | ||
320 | return BLK_EH_DONE; | ||
321 | return bset->timeout_fn(rq); | ||
322 | } | ||
323 | |||
324 | static const struct blk_mq_ops bsg_mq_ops = { | ||
325 | .queue_rq = bsg_queue_rq, | ||
326 | .init_request = bsg_init_rq, | ||
327 | .exit_request = bsg_exit_rq, | ||
328 | .initialize_rq_fn = bsg_initialize_rq, | ||
329 | .complete = bsg_complete, | ||
330 | .timeout = bsg_timeout, | ||
331 | }; | ||
332 | |||
299 | /** | 333 | /** |
300 | * bsg_setup_queue - Create and add the bsg hooks so we can receive requests | 334 | * bsg_setup_queue - Create and add the bsg hooks so we can receive requests |
301 | * @dev: device to attach bsg device to | 335 | * @dev: device to attach bsg device to |
@@ -304,28 +338,38 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) | |||
304 | * @dd_job_size: size of LLD data needed for each job | 338 | * @dd_job_size: size of LLD data needed for each job |
305 | */ | 339 | */ |
306 | struct request_queue *bsg_setup_queue(struct device *dev, const char *name, | 340 | struct request_queue *bsg_setup_queue(struct device *dev, const char *name, |
307 | bsg_job_fn *job_fn, int dd_job_size) | 341 | bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size) |
308 | { | 342 | { |
343 | struct bsg_set *bset; | ||
344 | struct blk_mq_tag_set *set; | ||
309 | struct request_queue *q; | 345 | struct request_queue *q; |
310 | int ret; | 346 | int ret = -ENOMEM; |
311 | 347 | ||
312 | q = blk_alloc_queue(GFP_KERNEL); | 348 | bset = kzalloc(sizeof(*bset), GFP_KERNEL); |
313 | if (!q) | 349 | if (!bset) |
314 | return ERR_PTR(-ENOMEM); | 350 | return ERR_PTR(-ENOMEM); |
315 | q->cmd_size = sizeof(struct bsg_job) + dd_job_size; | ||
316 | q->init_rq_fn = bsg_init_rq; | ||
317 | q->exit_rq_fn = bsg_exit_rq; | ||
318 | q->initialize_rq_fn = bsg_initialize_rq; | ||
319 | q->request_fn = bsg_request_fn; | ||
320 | 351 | ||
321 | ret = blk_init_allocated_queue(q); | 352 | bset->job_fn = job_fn; |
322 | if (ret) | 353 | bset->timeout_fn = timeout; |
323 | goto out_cleanup_queue; | 354 | |
355 | set = &bset->tag_set; | ||
356 | set->ops = &bsg_mq_ops; | ||
357 | set->nr_hw_queues = 1; | ||
358 | set->queue_depth = 128; | ||
359 | set->numa_node = NUMA_NO_NODE; | ||
360 | set->cmd_size = sizeof(struct bsg_job) + dd_job_size; | ||
361 | set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING; | ||
362 | if (blk_mq_alloc_tag_set(set)) | ||
363 | goto out_tag_set; | ||
364 | |||
365 | q = blk_mq_init_queue(set); | ||
366 | if (IS_ERR(q)) { | ||
367 | ret = PTR_ERR(q); | ||
368 | goto out_queue; | ||
369 | } | ||
324 | 370 | ||
325 | q->queuedata = dev; | 371 | q->queuedata = dev; |
326 | q->bsg_job_fn = job_fn; | ||
327 | blk_queue_flag_set(QUEUE_FLAG_BIDI, q); | 372 | blk_queue_flag_set(QUEUE_FLAG_BIDI, q); |
328 | blk_queue_softirq_done(q, bsg_softirq_done); | ||
329 | blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); | 373 | blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); |
330 | 374 | ||
331 | ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); | 375 | ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); |
@@ -338,6 +382,10 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, | |||
338 | return q; | 382 | return q; |
339 | out_cleanup_queue: | 383 | out_cleanup_queue: |
340 | blk_cleanup_queue(q); | 384 | blk_cleanup_queue(q); |
385 | out_queue: | ||
386 | blk_mq_free_tag_set(set); | ||
387 | out_tag_set: | ||
388 | kfree(bset); | ||
341 | return ERR_PTR(ret); | 389 | return ERR_PTR(ret); |
342 | } | 390 | } |
343 | EXPORT_SYMBOL_GPL(bsg_setup_queue); | 391 | EXPORT_SYMBOL_GPL(bsg_setup_queue); |
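
The bsg-lib.c hunks above change the driver-facing contract: the timeout handler is now passed to bsg_setup_queue() (and kept in the new struct bsg_set alongside the tag set) instead of being poked into the request_queue, and teardown goes through the new bsg_remove_queue(). A minimal sketch of how a transport driver might use the updated API follows; the foo_* names and the foo_priv payload are made up for illustration, and the bsg_job_fn/bsg_timeout_fn prototypes are inferred from how the hunk above calls them, not quoted from the headers.

/* Sketch only -- foo_* identifiers are hypothetical. */
#include <linux/types.h>
#include <linux/err.h>
#include <linux/device.h>
#include <linux/blk-mq.h>
#include <linux/bsg-lib.h>

struct foo_priv {			/* per-job LLD data, sized via dd_job_size */
	u32 tag;
};

static int foo_bsg_job_fn(struct bsg_job *job)
{
	/* hand job->request_payload / job->reply_payload to the hardware;
	 * the completion path later calls bsg_job_done(job, result, reply_len) */
	return 0;
}

static enum blk_eh_timer_return foo_bsg_timeout(struct request *rq)
{
	/* abort the command backing @rq */
	return BLK_EH_DONE;
}

static int foo_attach_bsg(struct device *dev, struct request_queue **bsg_q)
{
	struct request_queue *q;

	/* the timeout handler is now a bsg_setup_queue() argument */
	q = bsg_setup_queue(dev, dev_name(dev), foo_bsg_job_fn,
			    foo_bsg_timeout, sizeof(struct foo_priv));
	if (IS_ERR(q))
		return PTR_ERR(q);
	*bsg_q = q;
	return 0;
}

static void foo_detach_bsg(struct request_queue *q)
{
	bsg_remove_queue(q);	/* unregisters, cleans up the queue, frees the tag set */
}

Note that the tag set is created with BLK_MQ_F_BLOCKING, so ->queue_rq (and therefore job_fn) is allowed to sleep.
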
diff --git a/block/bsg.c b/block/bsg.c index 9a442c23a715..44f6028b9567 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
@@ -471,7 +471,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, | |||
471 | /* | 471 | /* |
472 | * we need a proper transport to send commands, not a stacked device | 472 | * we need a proper transport to send commands, not a stacked device |
473 | */ | 473 | */ |
474 | if (!queue_is_rq_based(q)) | 474 | if (!queue_is_mq(q)) |
475 | return 0; | 475 | return 0; |
476 | 476 | ||
477 | bcd = &q->bsg_dev; | 477 | bcd = &q->bsg_dev; |
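
The bsg.c change above only swaps the predicate: with the legacy request_fn path gone, "request based" and "blk-mq" mean the same thing, so the registration check becomes queue_is_mq(). As a rough sketch (not the actual header definition, which lives in the block headers), the test amounts to:

/* sketch: is this queue managed by blk-mq rather than being bio-based? */
static inline bool foo_queue_is_mq(struct request_queue *q)
{
	return q->mq_ops != NULL;
}
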
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c deleted file mode 100644 index ed41aa978c4a..000000000000 --- a/block/cfq-iosched.c +++ /dev/null | |||
@@ -1,4916 +0,0 @@ | |||
1 | /* | ||
2 | * CFQ, or complete fairness queueing, disk scheduler. | ||
3 | * | ||
4 | * Based on ideas from a previously unfinished io | ||
5 | * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. | ||
6 | * | ||
7 | * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> | ||
8 | */ | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <linux/sched/clock.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/elevator.h> | ||
14 | #include <linux/ktime.h> | ||
15 | #include <linux/rbtree.h> | ||
16 | #include <linux/ioprio.h> | ||
17 | #include <linux/blktrace_api.h> | ||
18 | #include <linux/blk-cgroup.h> | ||
19 | #include "blk.h" | ||
20 | #include "blk-wbt.h" | ||
21 | |||
22 | /* | ||
23 | * tunables | ||
24 | */ | ||
25 | /* max queue in one round of service */ | ||
26 | static const int cfq_quantum = 8; | ||
27 | static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; | ||
28 | /* maximum backwards seek, in KiB */ | ||
29 | static const int cfq_back_max = 16 * 1024; | ||
30 | /* penalty of a backwards seek */ | ||
31 | static const int cfq_back_penalty = 2; | ||
32 | static const u64 cfq_slice_sync = NSEC_PER_SEC / 10; | ||
33 | static u64 cfq_slice_async = NSEC_PER_SEC / 25; | ||
34 | static const int cfq_slice_async_rq = 2; | ||
35 | static u64 cfq_slice_idle = NSEC_PER_SEC / 125; | ||
36 | static u64 cfq_group_idle = NSEC_PER_SEC / 125; | ||
37 | static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */ | ||
38 | static const int cfq_hist_divisor = 4; | ||
39 | |||
40 | /* | ||
41 | * offset from end of queue service tree for idle class | ||
42 | */ | ||
43 | #define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5) | ||
44 | /* offset from end of group service tree under time slice mode */ | ||
45 | #define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5) | ||
46 | /* offset from end of group service under IOPS mode */ | ||
47 | #define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5) | ||
48 | |||
49 | /* | ||
50 | * below this threshold, we consider thinktime immediate | ||
51 | */ | ||
52 | #define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ) | ||
53 | |||
54 | #define CFQ_SLICE_SCALE (5) | ||
55 | #define CFQ_HW_QUEUE_MIN (5) | ||
56 | #define CFQ_SERVICE_SHIFT 12 | ||
57 | |||
58 | #define CFQQ_SEEK_THR (sector_t)(8 * 100) | ||
59 | #define CFQQ_CLOSE_THR (sector_t)(8 * 1024) | ||
60 | #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) | ||
61 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) | ||
62 | |||
63 | #define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) | ||
64 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) | ||
65 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) | ||
66 | |||
67 | static struct kmem_cache *cfq_pool; | ||
68 | |||
69 | #define CFQ_PRIO_LISTS IOPRIO_BE_NR | ||
70 | #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) | ||
71 | #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) | ||
72 | |||
73 | #define sample_valid(samples) ((samples) > 80) | ||
74 | #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) | ||
75 | |||
76 | /* blkio-related constants */ | ||
77 | #define CFQ_WEIGHT_LEGACY_MIN 10 | ||
78 | #define CFQ_WEIGHT_LEGACY_DFL 500 | ||
79 | #define CFQ_WEIGHT_LEGACY_MAX 1000 | ||
80 | |||
81 | struct cfq_ttime { | ||
82 | u64 last_end_request; | ||
83 | |||
84 | u64 ttime_total; | ||
85 | u64 ttime_mean; | ||
86 | unsigned long ttime_samples; | ||
87 | }; | ||
88 | |||
89 | /* | ||
90 | * Most of our rbtree usage is for sorting with min extraction, so | ||
91 | * if we cache the leftmost node we don't have to walk down the tree | ||
92 | * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should | ||
93 | * move this into the elevator for the rq sorting as well. | ||
94 | */ | ||
95 | struct cfq_rb_root { | ||
96 | struct rb_root_cached rb; | ||
97 | struct rb_node *rb_rightmost; | ||
98 | unsigned count; | ||
99 | u64 min_vdisktime; | ||
100 | struct cfq_ttime ttime; | ||
101 | }; | ||
102 | #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \ | ||
103 | .rb_rightmost = NULL, \ | ||
104 | .ttime = {.last_end_request = ktime_get_ns(),},} | ||
105 | |||
106 | /* | ||
107 | * Per process-grouping structure | ||
108 | */ | ||
109 | struct cfq_queue { | ||
110 | /* reference count */ | ||
111 | int ref; | ||
112 | /* various state flags, see below */ | ||
113 | unsigned int flags; | ||
114 | /* parent cfq_data */ | ||
115 | struct cfq_data *cfqd; | ||
116 | /* service_tree member */ | ||
117 | struct rb_node rb_node; | ||
118 | /* service_tree key */ | ||
119 | u64 rb_key; | ||
120 | /* prio tree member */ | ||
121 | struct rb_node p_node; | ||
122 | /* prio tree root we belong to, if any */ | ||
123 | struct rb_root *p_root; | ||
124 | /* sorted list of pending requests */ | ||
125 | struct rb_root sort_list; | ||
126 | /* if fifo isn't expired, next request to serve */ | ||
127 | struct request *next_rq; | ||
128 | /* requests queued in sort_list */ | ||
129 | int queued[2]; | ||
130 | /* currently allocated requests */ | ||
131 | int allocated[2]; | ||
132 | /* fifo list of requests in sort_list */ | ||
133 | struct list_head fifo; | ||
134 | |||
135 | /* time when queue got scheduled in to dispatch first request. */ | ||
136 | u64 dispatch_start; | ||
137 | u64 allocated_slice; | ||
138 | u64 slice_dispatch; | ||
139 | /* time when first request from queue completed and slice started. */ | ||
140 | u64 slice_start; | ||
141 | u64 slice_end; | ||
142 | s64 slice_resid; | ||
143 | |||
144 | /* pending priority requests */ | ||
145 | int prio_pending; | ||
146 | /* number of requests that are on the dispatch list or inside driver */ | ||
147 | int dispatched; | ||
148 | |||
149 | /* io prio of this group */ | ||
150 | unsigned short ioprio, org_ioprio; | ||
151 | unsigned short ioprio_class, org_ioprio_class; | ||
152 | |||
153 | pid_t pid; | ||
154 | |||
155 | u32 seek_history; | ||
156 | sector_t last_request_pos; | ||
157 | |||
158 | struct cfq_rb_root *service_tree; | ||
159 | struct cfq_queue *new_cfqq; | ||
160 | struct cfq_group *cfqg; | ||
161 | /* Number of sectors dispatched from queue in single dispatch round */ | ||
162 | unsigned long nr_sectors; | ||
163 | }; | ||
164 | |||
165 | /* | ||
166 | * First index in the service_trees. | ||
167 | * IDLE is handled separately, so it has negative index | ||
168 | */ | ||
169 | enum wl_class_t { | ||
170 | BE_WORKLOAD = 0, | ||
171 | RT_WORKLOAD = 1, | ||
172 | IDLE_WORKLOAD = 2, | ||
173 | CFQ_PRIO_NR, | ||
174 | }; | ||
175 | |||
176 | /* | ||
177 | * Second index in the service_trees. | ||
178 | */ | ||
179 | enum wl_type_t { | ||
180 | ASYNC_WORKLOAD = 0, | ||
181 | SYNC_NOIDLE_WORKLOAD = 1, | ||
182 | SYNC_WORKLOAD = 2 | ||
183 | }; | ||
184 | |||
185 | struct cfqg_stats { | ||
186 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
187 | /* number of ios merged */ | ||
188 | struct blkg_rwstat merged; | ||
189 | /* total time spent on device in ns, may not be accurate w/ queueing */ | ||
190 | struct blkg_rwstat service_time; | ||
191 | /* total time spent waiting in scheduler queue in ns */ | ||
192 | struct blkg_rwstat wait_time; | ||
193 | /* number of IOs queued up */ | ||
194 | struct blkg_rwstat queued; | ||
195 | /* total disk time and nr sectors dispatched by this group */ | ||
196 | struct blkg_stat time; | ||
197 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
198 | /* time not charged to this cgroup */ | ||
199 | struct blkg_stat unaccounted_time; | ||
200 | /* sum of number of ios queued across all samples */ | ||
201 | struct blkg_stat avg_queue_size_sum; | ||
202 | /* count of samples taken for average */ | ||
203 | struct blkg_stat avg_queue_size_samples; | ||
204 | /* how many times this group has been removed from service tree */ | ||
205 | struct blkg_stat dequeue; | ||
206 | /* total time spent waiting for it to be assigned a timeslice. */ | ||
207 | struct blkg_stat group_wait_time; | ||
208 | /* time spent idling for this blkcg_gq */ | ||
209 | struct blkg_stat idle_time; | ||
210 | /* total time with empty current active q with other requests queued */ | ||
211 | struct blkg_stat empty_time; | ||
212 | /* fields after this shouldn't be cleared on stat reset */ | ||
213 | u64 start_group_wait_time; | ||
214 | u64 start_idle_time; | ||
215 | u64 start_empty_time; | ||
216 | uint16_t flags; | ||
217 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
218 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
219 | }; | ||
220 | |||
221 | /* Per-cgroup data */ | ||
222 | struct cfq_group_data { | ||
223 | /* must be the first member */ | ||
224 | struct blkcg_policy_data cpd; | ||
225 | |||
226 | unsigned int weight; | ||
227 | unsigned int leaf_weight; | ||
228 | }; | ||
229 | |||
230 | /* This is per cgroup per device grouping structure */ | ||
231 | struct cfq_group { | ||
232 | /* must be the first member */ | ||
233 | struct blkg_policy_data pd; | ||
234 | |||
235 | /* group service_tree member */ | ||
236 | struct rb_node rb_node; | ||
237 | |||
238 | /* group service_tree key */ | ||
239 | u64 vdisktime; | ||
240 | |||
241 | /* | ||
242 | * The number of active cfqgs and sum of their weights under this | ||
243 | * cfqg. This covers this cfqg's leaf_weight and all children's | ||
244 | * weights, but does not cover weights of further descendants. | ||
245 | * | ||
246 | * If a cfqg is on the service tree, it's active. An active cfqg | ||
247 | * also activates its parent and contributes to the children_weight | ||
248 | * of the parent. | ||
249 | */ | ||
250 | int nr_active; | ||
251 | unsigned int children_weight; | ||
252 | |||
253 | /* | ||
254 | * vfraction is the fraction of vdisktime that the tasks in this | ||
255 | * cfqg are entitled to. This is determined by compounding the | ||
256 | * ratios walking up from this cfqg to the root. | ||
257 | * | ||
258 | * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all | ||
259 | * vfractions on a service tree is approximately 1. The sum may | ||
260 | * deviate a bit due to rounding errors and fluctuations caused by | ||
261 | * cfqgs entering and leaving the service tree. | ||
262 | */ | ||
263 | unsigned int vfraction; | ||
264 | |||
265 | /* | ||
266 | * There are two weights - (internal) weight is the weight of this | ||
267 | * cfqg against the sibling cfqgs. leaf_weight is the weight of | ||
268 | * this cfqg against the child cfqgs. For the root cfqg, both | ||
269 | * weights are kept in sync for backward compatibility. | ||
270 | */ | ||
271 | unsigned int weight; | ||
272 | unsigned int new_weight; | ||
273 | unsigned int dev_weight; | ||
274 | |||
275 | unsigned int leaf_weight; | ||
276 | unsigned int new_leaf_weight; | ||
277 | unsigned int dev_leaf_weight; | ||
278 | |||
279 | /* number of cfqq currently on this group */ | ||
280 | int nr_cfqq; | ||
281 | |||
282 | /* | ||
283 | * Per group busy queues average. Useful for workload slice calc. We | ||
284 | * create the array for each prio class but at run time it is used | ||
285 | * only for RT and BE class and slot for IDLE class remains unused. | ||
286 | * This is primarily done to avoid confusion and a gcc warning. | ||
287 | */ | ||
288 | unsigned int busy_queues_avg[CFQ_PRIO_NR]; | ||
289 | /* | ||
290 | * rr lists of queues with requests. We maintain service trees for | ||
291 | * RT and BE classes. These trees are subdivided in subclasses | ||
292 | * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE | ||
293 | * class there is no subclassification and all the cfq queues go on | ||
294 | * a single tree service_tree_idle. | ||
295 | * Counts are embedded in the cfq_rb_root | ||
296 | */ | ||
297 | struct cfq_rb_root service_trees[2][3]; | ||
298 | struct cfq_rb_root service_tree_idle; | ||
299 | |||
300 | u64 saved_wl_slice; | ||
301 | enum wl_type_t saved_wl_type; | ||
302 | enum wl_class_t saved_wl_class; | ||
303 | |||
304 | /* number of requests that are on the dispatch list or inside driver */ | ||
305 | int dispatched; | ||
306 | struct cfq_ttime ttime; | ||
307 | struct cfqg_stats stats; /* stats for this cfqg */ | ||
308 | |||
309 | /* async queue for each priority case */ | ||
310 | struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; | ||
311 | struct cfq_queue *async_idle_cfqq; | ||
312 | |||
313 | }; | ||
314 | |||
315 | struct cfq_io_cq { | ||
316 | struct io_cq icq; /* must be the first member */ | ||
317 | struct cfq_queue *cfqq[2]; | ||
318 | struct cfq_ttime ttime; | ||
319 | int ioprio; /* the current ioprio */ | ||
320 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
321 | uint64_t blkcg_serial_nr; /* the current blkcg serial */ | ||
322 | #endif | ||
323 | }; | ||
324 | |||
325 | /* | ||
326 | * Per block device queue structure | ||
327 | */ | ||
328 | struct cfq_data { | ||
329 | struct request_queue *queue; | ||
330 | /* Root service tree for cfq_groups */ | ||
331 | struct cfq_rb_root grp_service_tree; | ||
332 | struct cfq_group *root_group; | ||
333 | |||
334 | /* | ||
335 | * The priority currently being served | ||
336 | */ | ||
337 | enum wl_class_t serving_wl_class; | ||
338 | enum wl_type_t serving_wl_type; | ||
339 | u64 workload_expires; | ||
340 | struct cfq_group *serving_group; | ||
341 | |||
342 | /* | ||
343 | * Each priority tree is sorted by next_request position. These | ||
344 | * trees are used when determining if two or more queues are | ||
345 | * interleaving requests (see cfq_close_cooperator). | ||
346 | */ | ||
347 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; | ||
348 | |||
349 | unsigned int busy_queues; | ||
350 | unsigned int busy_sync_queues; | ||
351 | |||
352 | int rq_in_driver; | ||
353 | int rq_in_flight[2]; | ||
354 | |||
355 | /* | ||
356 | * queue-depth detection | ||
357 | */ | ||
358 | int rq_queued; | ||
359 | int hw_tag; | ||
360 | /* | ||
361 | * hw_tag can be | ||
362 | * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection) | ||
363 | * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) | ||
364 | * 0 => no NCQ | ||
365 | */ | ||
366 | int hw_tag_est_depth; | ||
367 | unsigned int hw_tag_samples; | ||
368 | |||
369 | /* | ||
370 | * idle window management | ||
371 | */ | ||
372 | struct hrtimer idle_slice_timer; | ||
373 | struct work_struct unplug_work; | ||
374 | |||
375 | struct cfq_queue *active_queue; | ||
376 | struct cfq_io_cq *active_cic; | ||
377 | |||
378 | sector_t last_position; | ||
379 | |||
380 | /* | ||
381 | * tunables, see top of file | ||
382 | */ | ||
383 | unsigned int cfq_quantum; | ||
384 | unsigned int cfq_back_penalty; | ||
385 | unsigned int cfq_back_max; | ||
386 | unsigned int cfq_slice_async_rq; | ||
387 | unsigned int cfq_latency; | ||
388 | u64 cfq_fifo_expire[2]; | ||
389 | u64 cfq_slice[2]; | ||
390 | u64 cfq_slice_idle; | ||
391 | u64 cfq_group_idle; | ||
392 | u64 cfq_target_latency; | ||
393 | |||
394 | /* | ||
395 | * Fallback dummy cfqq for extreme OOM conditions | ||
396 | */ | ||
397 | struct cfq_queue oom_cfqq; | ||
398 | |||
399 | u64 last_delayed_sync; | ||
400 | }; | ||
401 | |||
402 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | ||
403 | static void cfq_put_queue(struct cfq_queue *cfqq); | ||
404 | |||
405 | static struct cfq_rb_root *st_for(struct cfq_group *cfqg, | ||
406 | enum wl_class_t class, | ||
407 | enum wl_type_t type) | ||
408 | { | ||
409 | if (!cfqg) | ||
410 | return NULL; | ||
411 | |||
412 | if (class == IDLE_WORKLOAD) | ||
413 | return &cfqg->service_tree_idle; | ||
414 | |||
415 | return &cfqg->service_trees[class][type]; | ||
416 | } | ||
417 | |||
418 | enum cfqq_state_flags { | ||
419 | CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ | ||
420 | CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ | ||
421 | CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ | ||
422 | CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ | ||
423 | CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ | ||
424 | CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ | ||
425 | CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ | ||
426 | CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ | ||
427 | CFQ_CFQQ_FLAG_sync, /* synchronous queue */ | ||
428 | CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ | ||
429 | CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be split */ | ||
430 | CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ | ||
431 | CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ | ||
432 | }; | ||
433 | |||
434 | #define CFQ_CFQQ_FNS(name) \ | ||
435 | static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ | ||
436 | { \ | ||
437 | (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ | ||
438 | } \ | ||
439 | static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ | ||
440 | { \ | ||
441 | (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ | ||
442 | } \ | ||
443 | static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ | ||
444 | { \ | ||
445 | return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ | ||
446 | } | ||
447 | |||
448 | CFQ_CFQQ_FNS(on_rr); | ||
449 | CFQ_CFQQ_FNS(wait_request); | ||
450 | CFQ_CFQQ_FNS(must_dispatch); | ||
451 | CFQ_CFQQ_FNS(must_alloc_slice); | ||
452 | CFQ_CFQQ_FNS(fifo_expire); | ||
453 | CFQ_CFQQ_FNS(idle_window); | ||
454 | CFQ_CFQQ_FNS(prio_changed); | ||
455 | CFQ_CFQQ_FNS(slice_new); | ||
456 | CFQ_CFQQ_FNS(sync); | ||
457 | CFQ_CFQQ_FNS(coop); | ||
458 | CFQ_CFQQ_FNS(split_coop); | ||
459 | CFQ_CFQQ_FNS(deep); | ||
460 | CFQ_CFQQ_FNS(wait_busy); | ||
461 | #undef CFQ_CFQQ_FNS | ||
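
For reference while reading the deleted code below, each CFQ_CFQQ_FNS(name) invocation above stamps out a mark/clear/test triple for one flag; CFQ_CFQQ_FNS(on_rr), for example, expands to roughly:

static inline void cfq_mark_cfqq_on_rr(struct cfq_queue *cfqq)
{
	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_on_rr);
}
static inline void cfq_clear_cfqq_on_rr(struct cfq_queue *cfqq)
{
	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_on_rr);
}
static inline int cfq_cfqq_on_rr(const struct cfq_queue *cfqq)
{
	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_on_rr)) != 0;
}
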
462 | |||
463 | #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
464 | |||
465 | /* cfqg stats flags */ | ||
466 | enum cfqg_stats_flags { | ||
467 | CFQG_stats_waiting = 0, | ||
468 | CFQG_stats_idling, | ||
469 | CFQG_stats_empty, | ||
470 | }; | ||
471 | |||
472 | #define CFQG_FLAG_FNS(name) \ | ||
473 | static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ | ||
474 | { \ | ||
475 | stats->flags |= (1 << CFQG_stats_##name); \ | ||
476 | } \ | ||
477 | static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ | ||
478 | { \ | ||
479 | stats->flags &= ~(1 << CFQG_stats_##name); \ | ||
480 | } \ | ||
481 | static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ | ||
482 | { \ | ||
483 | return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ | ||
484 | } \ | ||
485 | |||
486 | CFQG_FLAG_FNS(waiting) | ||
487 | CFQG_FLAG_FNS(idling) | ||
488 | CFQG_FLAG_FNS(empty) | ||
489 | #undef CFQG_FLAG_FNS | ||
490 | |||
491 | /* This should be called with the queue_lock held. */ | ||
492 | static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) | ||
493 | { | ||
494 | u64 now; | ||
495 | |||
496 | if (!cfqg_stats_waiting(stats)) | ||
497 | return; | ||
498 | |||
499 | now = ktime_get_ns(); | ||
500 | if (now > stats->start_group_wait_time) | ||
501 | blkg_stat_add(&stats->group_wait_time, | ||
502 | now - stats->start_group_wait_time); | ||
503 | cfqg_stats_clear_waiting(stats); | ||
504 | } | ||
505 | |||
506 | /* This should be called with the queue_lock held. */ | ||
507 | static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, | ||
508 | struct cfq_group *curr_cfqg) | ||
509 | { | ||
510 | struct cfqg_stats *stats = &cfqg->stats; | ||
511 | |||
512 | if (cfqg_stats_waiting(stats)) | ||
513 | return; | ||
514 | if (cfqg == curr_cfqg) | ||
515 | return; | ||
516 | stats->start_group_wait_time = ktime_get_ns(); | ||
517 | cfqg_stats_mark_waiting(stats); | ||
518 | } | ||
519 | |||
520 | /* This should be called with the queue_lock held. */ | ||
521 | static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) | ||
522 | { | ||
523 | u64 now; | ||
524 | |||
525 | if (!cfqg_stats_empty(stats)) | ||
526 | return; | ||
527 | |||
528 | now = ktime_get_ns(); | ||
529 | if (now > stats->start_empty_time) | ||
530 | blkg_stat_add(&stats->empty_time, | ||
531 | now - stats->start_empty_time); | ||
532 | cfqg_stats_clear_empty(stats); | ||
533 | } | ||
534 | |||
535 | static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) | ||
536 | { | ||
537 | blkg_stat_add(&cfqg->stats.dequeue, 1); | ||
538 | } | ||
539 | |||
540 | static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) | ||
541 | { | ||
542 | struct cfqg_stats *stats = &cfqg->stats; | ||
543 | |||
544 | if (blkg_rwstat_total(&stats->queued)) | ||
545 | return; | ||
546 | |||
547 | /* | ||
548 | * group is already marked empty. This can happen if cfqq got new | ||
549 | * request in parent group and moved to this group while being added | ||
550 | * to service tree. Just ignore the event and move on. | ||
551 | */ | ||
552 | if (cfqg_stats_empty(stats)) | ||
553 | return; | ||
554 | |||
555 | stats->start_empty_time = ktime_get_ns(); | ||
556 | cfqg_stats_mark_empty(stats); | ||
557 | } | ||
558 | |||
559 | static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) | ||
560 | { | ||
561 | struct cfqg_stats *stats = &cfqg->stats; | ||
562 | |||
563 | if (cfqg_stats_idling(stats)) { | ||
564 | u64 now = ktime_get_ns(); | ||
565 | |||
566 | if (now > stats->start_idle_time) | ||
567 | blkg_stat_add(&stats->idle_time, | ||
568 | now - stats->start_idle_time); | ||
569 | cfqg_stats_clear_idling(stats); | ||
570 | } | ||
571 | } | ||
572 | |||
573 | static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) | ||
574 | { | ||
575 | struct cfqg_stats *stats = &cfqg->stats; | ||
576 | |||
577 | BUG_ON(cfqg_stats_idling(stats)); | ||
578 | |||
579 | stats->start_idle_time = ktime_get_ns(); | ||
580 | cfqg_stats_mark_idling(stats); | ||
581 | } | ||
582 | |||
583 | static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) | ||
584 | { | ||
585 | struct cfqg_stats *stats = &cfqg->stats; | ||
586 | |||
587 | blkg_stat_add(&stats->avg_queue_size_sum, | ||
588 | blkg_rwstat_total(&stats->queued)); | ||
589 | blkg_stat_add(&stats->avg_queue_size_samples, 1); | ||
590 | cfqg_stats_update_group_wait_time(stats); | ||
591 | } | ||
592 | |||
593 | #else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
594 | |||
595 | static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } | ||
596 | static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } | ||
597 | static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } | ||
598 | static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } | ||
599 | static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } | ||
600 | static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } | ||
601 | static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } | ||
602 | |||
603 | #endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
604 | |||
605 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
606 | |||
607 | static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) | ||
608 | { | ||
609 | return pd ? container_of(pd, struct cfq_group, pd) : NULL; | ||
610 | } | ||
611 | |||
612 | static struct cfq_group_data | ||
613 | *cpd_to_cfqgd(struct blkcg_policy_data *cpd) | ||
614 | { | ||
615 | return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; | ||
616 | } | ||
617 | |||
618 | static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) | ||
619 | { | ||
620 | return pd_to_blkg(&cfqg->pd); | ||
621 | } | ||
622 | |||
623 | static struct blkcg_policy blkcg_policy_cfq; | ||
624 | |||
625 | static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) | ||
626 | { | ||
627 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); | ||
628 | } | ||
629 | |||
630 | static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) | ||
631 | { | ||
632 | return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); | ||
633 | } | ||
634 | |||
635 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) | ||
636 | { | ||
637 | struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; | ||
638 | |||
639 | return pblkg ? blkg_to_cfqg(pblkg) : NULL; | ||
640 | } | ||
641 | |||
642 | static inline bool cfqg_is_descendant(struct cfq_group *cfqg, | ||
643 | struct cfq_group *ancestor) | ||
644 | { | ||
645 | return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, | ||
646 | cfqg_to_blkg(ancestor)->blkcg->css.cgroup); | ||
647 | } | ||
648 | |||
649 | static inline void cfqg_get(struct cfq_group *cfqg) | ||
650 | { | ||
651 | return blkg_get(cfqg_to_blkg(cfqg)); | ||
652 | } | ||
653 | |||
654 | static inline void cfqg_put(struct cfq_group *cfqg) | ||
655 | { | ||
656 | return blkg_put(cfqg_to_blkg(cfqg)); | ||
657 | } | ||
658 | |||
659 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ | ||
660 | blk_add_cgroup_trace_msg((cfqd)->queue, \ | ||
661 | cfqg_to_blkg((cfqq)->cfqg)->blkcg, \ | ||
662 | "cfq%d%c%c " fmt, (cfqq)->pid, \ | ||
663 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | ||
664 | cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ | ||
665 | ##args); \ | ||
666 | } while (0) | ||
667 | |||
668 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ | ||
669 | blk_add_cgroup_trace_msg((cfqd)->queue, \ | ||
670 | cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \ | ||
671 | } while (0) | ||
672 | |||
673 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
674 | struct cfq_group *curr_cfqg, | ||
675 | unsigned int op) | ||
676 | { | ||
677 | blkg_rwstat_add(&cfqg->stats.queued, op, 1); | ||
678 | cfqg_stats_end_empty_time(&cfqg->stats); | ||
679 | cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); | ||
680 | } | ||
681 | |||
682 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | ||
683 | uint64_t time, unsigned long unaccounted_time) | ||
684 | { | ||
685 | blkg_stat_add(&cfqg->stats.time, time); | ||
686 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
687 | blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); | ||
688 | #endif | ||
689 | } | ||
690 | |||
691 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, | ||
692 | unsigned int op) | ||
693 | { | ||
694 | blkg_rwstat_add(&cfqg->stats.queued, op, -1); | ||
695 | } | ||
696 | |||
697 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, | ||
698 | unsigned int op) | ||
699 | { | ||
700 | blkg_rwstat_add(&cfqg->stats.merged, op, 1); | ||
701 | } | ||
702 | |||
703 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
704 | u64 start_time_ns, | ||
705 | u64 io_start_time_ns, | ||
706 | unsigned int op) | ||
707 | { | ||
708 | struct cfqg_stats *stats = &cfqg->stats; | ||
709 | u64 now = ktime_get_ns(); | ||
710 | |||
711 | if (now > io_start_time_ns) | ||
712 | blkg_rwstat_add(&stats->service_time, op, | ||
713 | now - io_start_time_ns); | ||
714 | if (io_start_time_ns > start_time_ns) | ||
715 | blkg_rwstat_add(&stats->wait_time, op, | ||
716 | io_start_time_ns - start_time_ns); | ||
717 | } | ||
718 | |||
719 | /* @stats = 0 */ | ||
720 | static void cfqg_stats_reset(struct cfqg_stats *stats) | ||
721 | { | ||
722 | /* queued stats shouldn't be cleared */ | ||
723 | blkg_rwstat_reset(&stats->merged); | ||
724 | blkg_rwstat_reset(&stats->service_time); | ||
725 | blkg_rwstat_reset(&stats->wait_time); | ||
726 | blkg_stat_reset(&stats->time); | ||
727 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
728 | blkg_stat_reset(&stats->unaccounted_time); | ||
729 | blkg_stat_reset(&stats->avg_queue_size_sum); | ||
730 | blkg_stat_reset(&stats->avg_queue_size_samples); | ||
731 | blkg_stat_reset(&stats->dequeue); | ||
732 | blkg_stat_reset(&stats->group_wait_time); | ||
733 | blkg_stat_reset(&stats->idle_time); | ||
734 | blkg_stat_reset(&stats->empty_time); | ||
735 | #endif | ||
736 | } | ||
737 | |||
738 | /* @to += @from */ | ||
739 | static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) | ||
740 | { | ||
741 | /* queued stats shouldn't be cleared */ | ||
742 | blkg_rwstat_add_aux(&to->merged, &from->merged); | ||
743 | blkg_rwstat_add_aux(&to->service_time, &from->service_time); | ||
744 | blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); | ||
745 | blkg_stat_add_aux(&from->time, &from->time); | ||
746 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
747 | blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); | ||
748 | blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); | ||
749 | blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); | ||
750 | blkg_stat_add_aux(&to->dequeue, &from->dequeue); | ||
751 | blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); | ||
752 | blkg_stat_add_aux(&to->idle_time, &from->idle_time); | ||
753 | blkg_stat_add_aux(&to->empty_time, &from->empty_time); | ||
754 | #endif | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' | ||
759 | * recursive stats can still account for the amount used by this cfqg after | ||
760 | * it's gone. | ||
761 | */ | ||
762 | static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) | ||
763 | { | ||
764 | struct cfq_group *parent = cfqg_parent(cfqg); | ||
765 | |||
766 | lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); | ||
767 | |||
768 | if (unlikely(!parent)) | ||
769 | return; | ||
770 | |||
771 | cfqg_stats_add_aux(&parent->stats, &cfqg->stats); | ||
772 | cfqg_stats_reset(&cfqg->stats); | ||
773 | } | ||
774 | |||
775 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
776 | |||
777 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } | ||
778 | static inline bool cfqg_is_descendant(struct cfq_group *cfqg, | ||
779 | struct cfq_group *ancestor) | ||
780 | { | ||
781 | return true; | ||
782 | } | ||
783 | static inline void cfqg_get(struct cfq_group *cfqg) { } | ||
784 | static inline void cfqg_put(struct cfq_group *cfqg) { } | ||
785 | |||
786 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | ||
787 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ | ||
788 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | ||
789 | cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ | ||
790 | ##args) | ||
791 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) | ||
792 | |||
793 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
794 | struct cfq_group *curr_cfqg, unsigned int op) { } | ||
795 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | ||
796 | uint64_t time, unsigned long unaccounted_time) { } | ||
797 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, | ||
798 | unsigned int op) { } | ||
799 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, | ||
800 | unsigned int op) { } | ||
801 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
802 | u64 start_time_ns, | ||
803 | u64 io_start_time_ns, | ||
804 | unsigned int op) { } | ||
805 | |||
806 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
807 | |||
808 | #define cfq_log(cfqd, fmt, args...) \ | ||
809 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) | ||
810 | |||
811 | /* Traverses through cfq group service trees */ | ||
812 | #define for_each_cfqg_st(cfqg, i, j, st) \ | ||
813 | for (i = 0; i <= IDLE_WORKLOAD; i++) \ | ||
814 | for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ | ||
815 | : &cfqg->service_tree_idle; \ | ||
816 | (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ | ||
817 | (i == IDLE_WORKLOAD && j == 0); \ | ||
818 | j++, st = i < IDLE_WORKLOAD ? \ | ||
819 | &cfqg->service_trees[i][j]: NULL) \ | ||
820 | |||
821 | static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, | ||
822 | struct cfq_ttime *ttime, bool group_idle) | ||
823 | { | ||
824 | u64 slice; | ||
825 | if (!sample_valid(ttime->ttime_samples)) | ||
826 | return false; | ||
827 | if (group_idle) | ||
828 | slice = cfqd->cfq_group_idle; | ||
829 | else | ||
830 | slice = cfqd->cfq_slice_idle; | ||
831 | return ttime->ttime_mean > slice; | ||
832 | } | ||
833 | |||
834 | static inline bool iops_mode(struct cfq_data *cfqd) | ||
835 | { | ||
836 | /* | ||
837 | * If we are not idling on queues and it is an NCQ drive, parallel | ||
838 | * execution of requests is on, and measuring time is not possible | ||
839 | * in most cases unless we drive shallower queue | ||
840 | * depths, which then becomes a performance bottleneck. In such cases | ||
841 | * switch to providing fairness in terms of number of IOs. | ||
842 | */ | ||
843 | if (!cfqd->cfq_slice_idle && cfqd->hw_tag) | ||
844 | return true; | ||
845 | else | ||
846 | return false; | ||
847 | } | ||
848 | |||
849 | static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) | ||
850 | { | ||
851 | if (cfq_class_idle(cfqq)) | ||
852 | return IDLE_WORKLOAD; | ||
853 | if (cfq_class_rt(cfqq)) | ||
854 | return RT_WORKLOAD; | ||
855 | return BE_WORKLOAD; | ||
856 | } | ||
857 | |||
858 | |||
859 | static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) | ||
860 | { | ||
861 | if (!cfq_cfqq_sync(cfqq)) | ||
862 | return ASYNC_WORKLOAD; | ||
863 | if (!cfq_cfqq_idle_window(cfqq)) | ||
864 | return SYNC_NOIDLE_WORKLOAD; | ||
865 | return SYNC_WORKLOAD; | ||
866 | } | ||
867 | |||
868 | static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, | ||
869 | struct cfq_data *cfqd, | ||
870 | struct cfq_group *cfqg) | ||
871 | { | ||
872 | if (wl_class == IDLE_WORKLOAD) | ||
873 | return cfqg->service_tree_idle.count; | ||
874 | |||
875 | return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + | ||
876 | cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + | ||
877 | cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; | ||
878 | } | ||
879 | |||
880 | static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, | ||
881 | struct cfq_group *cfqg) | ||
882 | { | ||
883 | return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + | ||
884 | cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; | ||
885 | } | ||
886 | |||
887 | static void cfq_dispatch_insert(struct request_queue *, struct request *); | ||
888 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, | ||
889 | struct cfq_io_cq *cic, struct bio *bio); | ||
890 | |||
891 | static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) | ||
892 | { | ||
893 | /* cic->icq is the first member, %NULL will convert to %NULL */ | ||
894 | return container_of(icq, struct cfq_io_cq, icq); | ||
895 | } | ||
896 | |||
897 | static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, | ||
898 | struct io_context *ioc) | ||
899 | { | ||
900 | if (ioc) | ||
901 | return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); | ||
902 | return NULL; | ||
903 | } | ||
904 | |||
905 | static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) | ||
906 | { | ||
907 | return cic->cfqq[is_sync]; | ||
908 | } | ||
909 | |||
910 | static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, | ||
911 | bool is_sync) | ||
912 | { | ||
913 | cic->cfqq[is_sync] = cfqq; | ||
914 | } | ||
915 | |||
916 | static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) | ||
917 | { | ||
918 | return cic->icq.q->elevator->elevator_data; | ||
919 | } | ||
920 | |||
921 | /* | ||
922 | * scheduler run of queue, if there are requests pending and no one in the | ||
923 | * driver that will restart queueing | ||
924 | */ | ||
925 | static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) | ||
926 | { | ||
927 | if (cfqd->busy_queues) { | ||
928 | cfq_log(cfqd, "schedule dispatch"); | ||
929 | kblockd_schedule_work(&cfqd->unplug_work); | ||
930 | } | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * Scale schedule slice based on io priority. Use the sync time slice only | ||
935 | * if a queue is marked sync and has sync io queued. A sync queue with async | ||
936 | * io only, should not get full sync slice length. | ||
937 | */ | ||
938 | static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync, | ||
939 | unsigned short prio) | ||
940 | { | ||
941 | u64 base_slice = cfqd->cfq_slice[sync]; | ||
942 | u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE); | ||
943 | |||
944 | WARN_ON(prio >= IOPRIO_BE_NR); | ||
945 | |||
946 | return base_slice + (slice * (4 - prio)); | ||
947 | } | ||
948 | |||
949 | static inline u64 | ||
950 | cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
951 | { | ||
952 | return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); | ||
953 | } | ||
954 | |||
955 | /** | ||
956 | * cfqg_scale_charge - scale disk time charge according to cfqg weight | ||
957 | * @charge: disk time being charged | ||
958 | * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT | ||
959 | * | ||
960 | * Scale @charge according to @vfraction, which is in range (0, 1]. The | ||
961 | * scaling is inversely proportional. | ||
962 | * | ||
963 | * scaled = charge / vfraction | ||
964 | * | ||
965 | * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. | ||
966 | */ | ||
967 | static inline u64 cfqg_scale_charge(u64 charge, | ||
968 | unsigned int vfraction) | ||
969 | { | ||
970 | u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ | ||
971 | |||
972 | /* charge / vfraction */ | ||
973 | c <<= CFQ_SERVICE_SHIFT; | ||
974 | return div_u64(c, vfraction); | ||
975 | } | ||
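
The double shift in cfqg_scale_charge() is what keeps the result in CFQ_SERVICE_SHIFT fixed point: one shift converts @charge to fixed point, the second cancels the fixed-point factor that @vfraction already carries, so the division leaves a fixed-point quotient. A quick check with illustrative values (mine, not from the source):

/* illustrative values only: a group owning half the fraction is charged double */
static u64 cfqg_scale_charge_example(void)
{
	u64 charge = 1000;
	unsigned int vfraction = 1 << (CFQ_SERVICE_SHIFT - 1);	/* 0.5 in fixed point */
	u64 c = charge << CFQ_SERVICE_SHIFT;	/* 1000.0 in fixed point */

	c <<= CFQ_SERVICE_SHIFT;		/* cancel vfraction's fixed-point factor */
	return div_u64(c, vfraction);		/* == 2000 << CFQ_SERVICE_SHIFT */
}
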
976 | |||
977 | static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) | ||
978 | { | ||
979 | s64 delta = (s64)(vdisktime - min_vdisktime); | ||
980 | if (delta > 0) | ||
981 | min_vdisktime = vdisktime; | ||
982 | |||
983 | return min_vdisktime; | ||
984 | } | ||
985 | |||
986 | static void update_min_vdisktime(struct cfq_rb_root *st) | ||
987 | { | ||
988 | if (!RB_EMPTY_ROOT(&st->rb.rb_root)) { | ||
989 | struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost); | ||
990 | |||
991 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, | ||
992 | cfqg->vdisktime); | ||
993 | } | ||
994 | } | ||
995 | |||
996 | /* | ||
997 | * get averaged number of queues of RT/BE priority. | ||
998 | * the average is updated with a formula that gives more weight to higher numbers, | ||
999 | * so it quickly follows sudden increases and decreases slowly | ||
1000 | */ | ||
1001 | |||
1002 | static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, | ||
1003 | struct cfq_group *cfqg, bool rt) | ||
1004 | { | ||
1005 | unsigned min_q, max_q; | ||
1006 | unsigned mult = cfq_hist_divisor - 1; | ||
1007 | unsigned round = cfq_hist_divisor / 2; | ||
1008 | unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); | ||
1009 | |||
1010 | min_q = min(cfqg->busy_queues_avg[rt], busy); | ||
1011 | max_q = max(cfqg->busy_queues_avg[rt], busy); | ||
1012 | cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / | ||
1013 | cfq_hist_divisor; | ||
1014 | return cfqg->busy_queues_avg[rt]; | ||
1015 | } | ||
1016 | |||
1017 | static inline u64 | ||
1018 | cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
1019 | { | ||
1020 | return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; | ||
1021 | } | ||
1022 | |||
1023 | static inline u64 | ||
1024 | cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
1025 | { | ||
1026 | u64 slice = cfq_prio_to_slice(cfqd, cfqq); | ||
1027 | if (cfqd->cfq_latency) { | ||
1028 | /* | ||
1029 | * interested queues (we consider only the ones with the same | ||
1030 | * priority class in the cfq group) | ||
1031 | */ | ||
1032 | unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, | ||
1033 | cfq_class_rt(cfqq)); | ||
1034 | u64 sync_slice = cfqd->cfq_slice[1]; | ||
1035 | u64 expect_latency = sync_slice * iq; | ||
1036 | u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg); | ||
1037 | |||
1038 | if (expect_latency > group_slice) { | ||
1039 | u64 base_low_slice = 2 * cfqd->cfq_slice_idle; | ||
1040 | u64 low_slice; | ||
1041 | |||
1042 | /* scale low_slice according to IO priority | ||
1043 | * and sync vs async */ | ||
1044 | low_slice = div64_u64(base_low_slice*slice, sync_slice); | ||
1045 | low_slice = min(slice, low_slice); | ||
1046 | /* the adapted slice value is scaled to fit all iqs | ||
1047 | * into the target latency */ | ||
1048 | slice = div64_u64(slice*group_slice, expect_latency); | ||
1049 | slice = max(slice, low_slice); | ||
1050 | } | ||
1051 | } | ||
1052 | return slice; | ||
1053 | } | ||
1054 | |||
1055 | static inline void | ||
1056 | cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
1057 | { | ||
1058 | u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
1059 | u64 now = ktime_get_ns(); | ||
1060 | |||
1061 | cfqq->slice_start = now; | ||
1062 | cfqq->slice_end = now + slice; | ||
1063 | cfqq->allocated_slice = slice; | ||
1064 | cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now); | ||
1065 | } | ||
1066 | |||
1067 | /* | ||
1068 | * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end | ||
1069 | * isn't valid until the first request from the dispatch is activated | ||
1070 | * and the slice time set. | ||
1071 | */ | ||
1072 | static inline bool cfq_slice_used(struct cfq_queue *cfqq) | ||
1073 | { | ||
1074 | if (cfq_cfqq_slice_new(cfqq)) | ||
1075 | return false; | ||
1076 | if (ktime_get_ns() < cfqq->slice_end) | ||
1077 | return false; | ||
1078 | |||
1079 | return true; | ||
1080 | } | ||
1081 | |||
1082 | /* | ||
1083 | * Lifted from AS - choose which of rq1 and rq2 that is best served now. | ||
1084 | * We choose the request that is closest to the head right now. Distance | ||
1085 | * behind the head is penalized and only allowed to a certain extent. | ||
1086 | */ | ||
1087 | static struct request * | ||
1088 | cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) | ||
1089 | { | ||
1090 | sector_t s1, s2, d1 = 0, d2 = 0; | ||
1091 | unsigned long back_max; | ||
1092 | #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ | ||
1093 | #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ | ||
1094 | unsigned wrap = 0; /* bit mask: requests behind the disk head? */ | ||
1095 | |||
1096 | if (rq1 == NULL || rq1 == rq2) | ||
1097 | return rq2; | ||
1098 | if (rq2 == NULL) | ||
1099 | return rq1; | ||
1100 | |||
1101 | if (rq_is_sync(rq1) != rq_is_sync(rq2)) | ||
1102 | return rq_is_sync(rq1) ? rq1 : rq2; | ||
1103 | |||
1104 | if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) | ||
1105 | return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; | ||
1106 | |||
1107 | s1 = blk_rq_pos(rq1); | ||
1108 | s2 = blk_rq_pos(rq2); | ||
1109 | |||
1110 | /* | ||
1111 | * by definition, 1KiB is 2 sectors | ||
1112 | */ | ||
1113 | back_max = cfqd->cfq_back_max * 2; | ||
1114 | |||
1115 | /* | ||
1116 | * Strict one way elevator _except_ in the case where we allow | ||
1117 | * short backward seeks which are biased as twice the cost of a | ||
1118 | * similar forward seek. | ||
1119 | */ | ||
1120 | if (s1 >= last) | ||
1121 | d1 = s1 - last; | ||
1122 | else if (s1 + back_max >= last) | ||
1123 | d1 = (last - s1) * cfqd->cfq_back_penalty; | ||
1124 | else | ||
1125 | wrap |= CFQ_RQ1_WRAP; | ||
1126 | |||
1127 | if (s2 >= last) | ||
1128 | d2 = s2 - last; | ||
1129 | else if (s2 + back_max >= last) | ||
1130 | d2 = (last - s2) * cfqd->cfq_back_penalty; | ||
1131 | else | ||
1132 | wrap |= CFQ_RQ2_WRAP; | ||
1133 | |||
1134 | /* Found required data */ | ||
1135 | |||
1136 | /* | ||
1137 | * By doing switch() on the bit mask "wrap" we avoid having to | ||
1138 | * check two variables for all permutations: --> faster! | ||
1139 | */ | ||
1140 | switch (wrap) { | ||
1141 | case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ | ||
1142 | if (d1 < d2) | ||
1143 | return rq1; | ||
1144 | else if (d2 < d1) | ||
1145 | return rq2; | ||
1146 | else { | ||
1147 | if (s1 >= s2) | ||
1148 | return rq1; | ||
1149 | else | ||
1150 | return rq2; | ||
1151 | } | ||
1152 | |||
1153 | case CFQ_RQ2_WRAP: | ||
1154 | return rq1; | ||
1155 | case CFQ_RQ1_WRAP: | ||
1156 | return rq2; | ||
1157 | case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ | ||
1158 | default: | ||
1159 | /* | ||
1160 | * Since both rqs are wrapped, | ||
1161 | * start with the one that's further behind head | ||
1162 | * (--> only *one* back seek required), | ||
1163 | * since back seek takes more time than forward. | ||
1164 | */ | ||
1165 | if (s1 <= s2) | ||
1166 | return rq1; | ||
1167 | else | ||
1168 | return rq2; | ||
1169 | } | ||
1170 | } | ||
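
A worked case for the backward-seek handling above, with illustrative numbers (not from the source): using the default cfq_back_max of 16384 KiB (32768 sectors) and cfq_back_penalty of 2, a request 1000 sectors behind the head gets an effective distance of 2000 sectors, so it only beats a forward candidate that is more than 2000 sectors ahead; anything further back than back_max is marked as wrapped and loses to any non-wrapped request.

/* illustrative numbers only */
static sector_t cfq_back_seek_example(void)
{
	sector_t last = 100000;			/* current head position */
	sector_t s1 = 99000;			/* candidate sits 1000 sectors behind */
	unsigned long back_max = 16 * 1024 * 2;	/* default cfq_back_max (KiB) in sectors */
	unsigned int back_penalty = 2;		/* default cfq_back_penalty */

	if (s1 + back_max >= last)
		return (last - s1) * back_penalty;	/* 2000: short back seek, penalized */
	return 0;	/* too far back: would be flagged CFQ_RQ1_WRAP instead */
}
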
1171 | |||
1172 | static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) | ||
1173 | { | ||
1174 | /* Service tree is empty */ | ||
1175 | if (!root->count) | ||
1176 | return NULL; | ||
1177 | |||
1178 | return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node); | ||
1179 | } | ||
1180 | |||
1181 | static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) | ||
1182 | { | ||
1183 | return rb_entry_cfqg(rb_first_cached(&root->rb)); | ||
1184 | } | ||
1185 | |||
1186 | static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) | ||
1187 | { | ||
1188 | if (root->rb_rightmost == n) | ||
1189 | root->rb_rightmost = rb_prev(n); | ||
1190 | |||
1191 | rb_erase_cached(n, &root->rb); | ||
1192 | RB_CLEAR_NODE(n); | ||
1193 | |||
1194 | --root->count; | ||
1195 | } | ||
1196 | |||
1197 | /* | ||
1198 | * would be nice to take fifo expire time into account as well | ||
1199 | */ | ||
1200 | static struct request * | ||
1201 | cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
1202 | struct request *last) | ||
1203 | { | ||
1204 | struct rb_node *rbnext = rb_next(&last->rb_node); | ||
1205 | struct rb_node *rbprev = rb_prev(&last->rb_node); | ||
1206 | struct request *next = NULL, *prev = NULL; | ||
1207 | |||
1208 | BUG_ON(RB_EMPTY_NODE(&last->rb_node)); | ||
1209 | |||
1210 | if (rbprev) | ||
1211 | prev = rb_entry_rq(rbprev); | ||
1212 | |||
1213 | if (rbnext) | ||
1214 | next = rb_entry_rq(rbnext); | ||
1215 | else { | ||
1216 | rbnext = rb_first(&cfqq->sort_list); | ||
1217 | if (rbnext && rbnext != &last->rb_node) | ||
1218 | next = rb_entry_rq(rbnext); | ||
1219 | } | ||
1220 | |||
1221 | return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); | ||
1222 | } | ||
1223 | |||
1224 | static u64 cfq_slice_offset(struct cfq_data *cfqd, | ||
1225 | struct cfq_queue *cfqq) | ||
1226 | { | ||
1227 | /* | ||
1228 | * just an approximation, should be ok. | ||
1229 | */ | ||
1230 | return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - | ||
1231 | cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); | ||
1232 | } | ||
1233 | |||
1234 | static inline s64 | ||
1235 | cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1236 | { | ||
1237 | return cfqg->vdisktime - st->min_vdisktime; | ||
1238 | } | ||
1239 | |||
1240 | static void | ||
1241 | __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1242 | { | ||
1243 | struct rb_node **node = &st->rb.rb_root.rb_node; | ||
1244 | struct rb_node *parent = NULL; | ||
1245 | struct cfq_group *__cfqg; | ||
1246 | s64 key = cfqg_key(st, cfqg); | ||
1247 | bool leftmost = true, rightmost = true; | ||
1248 | |||
1249 | while (*node != NULL) { | ||
1250 | parent = *node; | ||
1251 | __cfqg = rb_entry_cfqg(parent); | ||
1252 | |||
1253 | if (key < cfqg_key(st, __cfqg)) { | ||
1254 | node = &parent->rb_left; | ||
1255 | rightmost = false; | ||
1256 | } else { | ||
1257 | node = &parent->rb_right; | ||
1258 | leftmost = false; | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1262 | if (rightmost) | ||
1263 | st->rb_rightmost = &cfqg->rb_node; | ||
1264 | |||
1265 | rb_link_node(&cfqg->rb_node, parent, node); | ||
1266 | rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost); | ||
1267 | } | ||
1268 | |||
1269 | /* | ||
1270 | * This has to be called only on activation of cfqg | ||
1271 | */ | ||
1272 | static void | ||
1273 | cfq_update_group_weight(struct cfq_group *cfqg) | ||
1274 | { | ||
1275 | if (cfqg->new_weight) { | ||
1276 | cfqg->weight = cfqg->new_weight; | ||
1277 | cfqg->new_weight = 0; | ||
1278 | } | ||
1279 | } | ||
1280 | |||
1281 | static void | ||
1282 | cfq_update_group_leaf_weight(struct cfq_group *cfqg) | ||
1283 | { | ||
1284 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
1285 | |||
1286 | if (cfqg->new_leaf_weight) { | ||
1287 | cfqg->leaf_weight = cfqg->new_leaf_weight; | ||
1288 | cfqg->new_leaf_weight = 0; | ||
1289 | } | ||
1290 | } | ||
1291 | |||
1292 | static void | ||
1293 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1294 | { | ||
1295 | unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ | ||
1296 | struct cfq_group *pos = cfqg; | ||
1297 | struct cfq_group *parent; | ||
1298 | bool propagate; | ||
1299 | |||
1300 | /* add to the service tree */ | ||
1301 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
1302 | |||
1303 | /* | ||
1304 | * Update leaf_weight. We cannot update weight at this point | ||
1305 | * because cfqg might already have been activated and is | ||
1306 | * contributing its current weight to the parent's children_weight. | ||
1307 | */ | ||
1308 | cfq_update_group_leaf_weight(cfqg); | ||
1309 | __cfq_group_service_tree_add(st, cfqg); | ||
1310 | |||
1311 | /* | ||
1312 | * Activate @cfqg and calculate the portion of vfraction @cfqg is | ||
1313 | * entitled to. vfraction is calculated by walking the tree | ||
1314 | * towards the root calculating the fraction it has at each level. | ||
1315 | * The compounded ratio is how much vfraction @cfqg owns. | ||
1316 | * | ||
1317 | * Start with the proportion the tasks in this cfqg have against active | ||
1318 | * children cfqgs - its leaf_weight against children_weight. | ||
1319 | */ | ||
1320 | propagate = !pos->nr_active++; | ||
1321 | pos->children_weight += pos->leaf_weight; | ||
1322 | vfr = vfr * pos->leaf_weight / pos->children_weight; | ||
1323 | |||
1324 | /* | ||
1325 | * Compound ->weight walking up the tree. Both activation and | ||
1326 | * vfraction calculation are done in the same loop. Propagation | ||
1327 | * stops once an already activated node is met. vfraction | ||
1328 | * calculation should always continue to the root. | ||
1329 | */ | ||
1330 | while ((parent = cfqg_parent(pos))) { | ||
1331 | if (propagate) { | ||
1332 | cfq_update_group_weight(pos); | ||
1333 | propagate = !parent->nr_active++; | ||
1334 | parent->children_weight += pos->weight; | ||
1335 | } | ||
1336 | vfr = vfr * pos->weight / parent->children_weight; | ||
1337 | pos = parent; | ||
1338 | } | ||
1339 | |||
1340 | cfqg->vfraction = max_t(unsigned, vfr, 1); | ||
1341 | } | ||
1342 | |||
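A rough user-space sketch of the compounding above (illustrative only; the struct, field names and SERVICE_SHIFT value are stand-ins for the kernel's, and the children_weight sums are assumed to be already updated by activation): the vfraction a group ends up with is the product of weight/children_weight ratios from the leaf up to the root.

#include <stdio.h>

#define SERVICE_SHIFT 12                 /* assumed fixed-point shift (1.0 == 1 << 12) */

struct group {
	struct group *parent;
	unsigned int leaf_weight;        /* weight of the tasks in this group */
	unsigned int weight;             /* weight against sibling groups */
	unsigned int children_weight;    /* sum of active children/leaf weights */
};

/* Compound the ratios from @g up to the root, as cfq_group_service_tree_add() does. */
static unsigned int compound_vfraction(const struct group *g)
{
	unsigned int vfr = 1 << SERVICE_SHIFT;     /* start with 1 */

	/* proportion of this group's own tasks against its active children */
	vfr = vfr * g->leaf_weight / g->children_weight;

	/* then compound the group's weight against each ancestor's children */
	for (const struct group *pos = g; pos->parent; pos = pos->parent)
		vfr = vfr * pos->weight / pos->parent->children_weight;

	return vfr ? vfr : 1;                      /* never let the share hit zero */
}

int main(void)
{
	struct group root  = { NULL,  100, 100, 300 };  /* 200 (child) + 100 (own tasks) */
	struct group child = { &root, 100, 200, 100 };  /* only its own tasks are active */

	/* child owns (100/100) * (200/300) of the device, about 2/3 */
	printf("vfraction = %u / %u\n", compound_vfraction(&child),
	       (unsigned)(1 << SERVICE_SHIFT));
	return 0;
}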
1343 | static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd) | ||
1344 | { | ||
1345 | if (!iops_mode(cfqd)) | ||
1346 | return CFQ_SLICE_MODE_GROUP_DELAY; | ||
1347 | else | ||
1348 | return CFQ_IOPS_MODE_GROUP_DELAY; | ||
1349 | } | ||
1350 | |||
1351 | static void | ||
1352 | cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
1353 | { | ||
1354 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
1355 | struct cfq_group *__cfqg; | ||
1356 | struct rb_node *n; | ||
1357 | |||
1358 | cfqg->nr_cfqq++; | ||
1359 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
1360 | return; | ||
1361 | |||
1362 | /* | ||
1363 | * Currently put the group at the end. Later implement something | ||
1364 | * so that groups get a smaller vtime based on their weights, so that | ||
1365 | * a group does not lose everything if it was not continuously backlogged. | ||
1366 | */ | ||
1367 | n = st->rb_rightmost; | ||
1368 | if (n) { | ||
1369 | __cfqg = rb_entry_cfqg(n); | ||
1370 | cfqg->vdisktime = __cfqg->vdisktime + | ||
1371 | cfq_get_cfqg_vdisktime_delay(cfqd); | ||
1372 | } else | ||
1373 | cfqg->vdisktime = st->min_vdisktime; | ||
1374 | cfq_group_service_tree_add(st, cfqg); | ||
1375 | } | ||
1376 | |||
1377 | static void | ||
1378 | cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1379 | { | ||
1380 | struct cfq_group *pos = cfqg; | ||
1381 | bool propagate; | ||
1382 | |||
1383 | /* | ||
1384 | * Undo activation from cfq_group_service_tree_add(). Deactivate | ||
1385 | * @cfqg and propagate deactivation upwards. | ||
1386 | */ | ||
1387 | propagate = !--pos->nr_active; | ||
1388 | pos->children_weight -= pos->leaf_weight; | ||
1389 | |||
1390 | while (propagate) { | ||
1391 | struct cfq_group *parent = cfqg_parent(pos); | ||
1392 | |||
1393 | /* @pos has 0 nr_active at this point */ | ||
1394 | WARN_ON_ONCE(pos->children_weight); | ||
1395 | pos->vfraction = 0; | ||
1396 | |||
1397 | if (!parent) | ||
1398 | break; | ||
1399 | |||
1400 | propagate = !--parent->nr_active; | ||
1401 | parent->children_weight -= pos->weight; | ||
1402 | pos = parent; | ||
1403 | } | ||
1404 | |||
1405 | /* remove from the service tree */ | ||
1406 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
1407 | cfq_rb_erase(&cfqg->rb_node, st); | ||
1408 | } | ||
1409 | |||
1410 | static void | ||
1411 | cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
1412 | { | ||
1413 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
1414 | |||
1415 | BUG_ON(cfqg->nr_cfqq < 1); | ||
1416 | cfqg->nr_cfqq--; | ||
1417 | |||
1418 | /* If there are other cfq queues under this group, don't delete it */ | ||
1419 | if (cfqg->nr_cfqq) | ||
1420 | return; | ||
1421 | |||
1422 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | ||
1423 | cfq_group_service_tree_del(st, cfqg); | ||
1424 | cfqg->saved_wl_slice = 0; | ||
1425 | cfqg_stats_update_dequeue(cfqg); | ||
1426 | } | ||
1427 | |||
1428 | static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq, | ||
1429 | u64 *unaccounted_time) | ||
1430 | { | ||
1431 | u64 slice_used; | ||
1432 | u64 now = ktime_get_ns(); | ||
1433 | |||
1434 | /* | ||
1435 | * Queue got expired before even a single request completed or | ||
1436 | * got expired immediately after first request completion. | ||
1437 | */ | ||
1438 | if (!cfqq->slice_start || cfqq->slice_start == now) { | ||
1439 | /* | ||
1440 | * Also charge the seek time incurred to the group, otherwise | ||
1441 | * if there are multiple queues in the group, each can dispatch | ||
1442 | * a single request on seeky media and cause lots of seek time | ||
1443 | * and group will never know it. | ||
1444 | */ | ||
1445 | slice_used = max_t(u64, (now - cfqq->dispatch_start), | ||
1446 | jiffies_to_nsecs(1)); | ||
1447 | } else { | ||
1448 | slice_used = now - cfqq->slice_start; | ||
1449 | if (slice_used > cfqq->allocated_slice) { | ||
1450 | *unaccounted_time = slice_used - cfqq->allocated_slice; | ||
1451 | slice_used = cfqq->allocated_slice; | ||
1452 | } | ||
1453 | if (cfqq->slice_start > cfqq->dispatch_start) | ||
1454 | *unaccounted_time += cfqq->slice_start - | ||
1455 | cfqq->dispatch_start; | ||
1456 | } | ||
1457 | |||
1458 | return slice_used; | ||
1459 | } | ||
1460 | |||
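A stand-alone sketch of the accounting above, with made-up nanosecond values and a hypothetical 1 ms minimum charge standing in for jiffies_to_nsecs(1): a queue that never really started is charged its seek time, an overrun is capped at the allocation with the excess reported as unaccounted, and the gap between dispatch and slice start is unaccounted as well.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the three cases in cfq_cfqq_slice_usage(); 1 ms stands in for one tick. */
static uint64_t slice_usage(uint64_t now, uint64_t dispatch_start,
			    uint64_t slice_start, uint64_t allocated,
			    uint64_t *unaccounted)
{
	uint64_t used;

	*unaccounted = 0;
	if (!slice_start || slice_start == now) {
		/* expired around the first completion: charge the seek time */
		used = now - dispatch_start;
		if (used < 1000000)
			used = 1000000;             /* at least one assumed 1 ms tick */
	} else {
		used = now - slice_start;
		if (used > allocated) {             /* slice overran its allocation */
			*unaccounted = used - allocated;
			used = allocated;
		}
		if (slice_start > dispatch_start)   /* waited before the slice began */
			*unaccounted += slice_start - dispatch_start;
	}
	return used;
}

int main(void)
{
	uint64_t unacct;
	/* dispatched at t=0, slice started at 2 ms, allocated 10 ms, now 15 ms */
	uint64_t used = slice_usage(15000000, 0, 2000000, 10000000, &unacct);

	/* prints used=10000000 unaccounted=5000000 (3 ms overrun + 2 ms startup gap) */
	printf("used=%llu unaccounted=%llu\n",
	       (unsigned long long)used, (unsigned long long)unacct);
	return 0;
}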
1461 | static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | ||
1462 | struct cfq_queue *cfqq) | ||
1463 | { | ||
1464 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
1465 | u64 used_sl, charge, unaccounted_sl = 0; | ||
1466 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) | ||
1467 | - cfqg->service_tree_idle.count; | ||
1468 | unsigned int vfr; | ||
1469 | u64 now = ktime_get_ns(); | ||
1470 | |||
1471 | BUG_ON(nr_sync < 0); | ||
1472 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); | ||
1473 | |||
1474 | if (iops_mode(cfqd)) | ||
1475 | charge = cfqq->slice_dispatch; | ||
1476 | else if (!cfq_cfqq_sync(cfqq) && !nr_sync) | ||
1477 | charge = cfqq->allocated_slice; | ||
1478 | |||
1479 | /* | ||
1480 | * Can't update vdisktime while on service tree and cfqg->vfraction | ||
1481 | * is valid only while on it. Cache vfr, leave the service tree, | ||
1482 | * update vdisktime and go back on. The re-addition to the tree | ||
1483 | * will also update the weights as necessary. | ||
1484 | */ | ||
1485 | vfr = cfqg->vfraction; | ||
1486 | cfq_group_service_tree_del(st, cfqg); | ||
1487 | cfqg->vdisktime += cfqg_scale_charge(charge, vfr); | ||
1488 | cfq_group_service_tree_add(st, cfqg); | ||
1489 | |||
1490 | /* This group is being expired. Save the context */ | ||
1491 | if (cfqd->workload_expires > now) { | ||
1492 | cfqg->saved_wl_slice = cfqd->workload_expires - now; | ||
1493 | cfqg->saved_wl_type = cfqd->serving_wl_type; | ||
1494 | cfqg->saved_wl_class = cfqd->serving_wl_class; | ||
1495 | } else | ||
1496 | cfqg->saved_wl_slice = 0; | ||
1497 | |||
1498 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, | ||
1499 | st->min_vdisktime); | ||
1500 | cfq_log_cfqq(cfqq->cfqd, cfqq, | ||
1501 | "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu", | ||
1502 | used_sl, cfqq->slice_dispatch, charge, | ||
1503 | iops_mode(cfqd), cfqq->nr_sectors); | ||
1504 | cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); | ||
1505 | cfqg_stats_set_start_empty_time(cfqg); | ||
1506 | } | ||
1507 | |||
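In words, the charge chosen above is the number of requests dispatched in IOPS mode, the full allocated slice for an async queue when no sync queues are backlogged, and otherwise the time actually used. A tiny sketch of just that decision (hypothetical user-space types, not the kernel's):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t pick_charge(bool iops_mode, bool queue_is_sync, int nr_sync,
			    uint64_t used_time, uint64_t allocated_slice,
			    uint64_t nr_dispatched)
{
	if (iops_mode)                          /* charge by request count */
		return nr_dispatched;
	if (!queue_is_sync && !nr_sync)         /* lone async queue: charge full slice */
		return allocated_slice;
	return used_time;                       /* default: charge wall-clock usage */
}

int main(void)
{
	printf("iops:  %llu\n", (unsigned long long)pick_charge(true,  true,  1, 7, 10, 42));
	printf("async: %llu\n", (unsigned long long)pick_charge(false, false, 0, 7, 10, 42));
	printf("sync:  %llu\n", (unsigned long long)pick_charge(false, true,  1, 7, 10, 42));
	return 0;
}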
1508 | /** | ||
1509 | * cfq_init_cfqg_base - initialize base part of a cfq_group | ||
1510 | * @cfqg: cfq_group to initialize | ||
1511 | * | ||
1512 | * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED | ||
1513 | * is enabled or not. | ||
1514 | */ | ||
1515 | static void cfq_init_cfqg_base(struct cfq_group *cfqg) | ||
1516 | { | ||
1517 | struct cfq_rb_root *st; | ||
1518 | int i, j; | ||
1519 | |||
1520 | for_each_cfqg_st(cfqg, i, j, st) | ||
1521 | *st = CFQ_RB_ROOT; | ||
1522 | RB_CLEAR_NODE(&cfqg->rb_node); | ||
1523 | |||
1524 | cfqg->ttime.last_end_request = ktime_get_ns(); | ||
1525 | } | ||
1526 | |||
1527 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
1528 | static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, | ||
1529 | bool on_dfl, bool reset_dev, bool is_leaf_weight); | ||
1530 | |||
1531 | static void cfqg_stats_exit(struct cfqg_stats *stats) | ||
1532 | { | ||
1533 | blkg_rwstat_exit(&stats->merged); | ||
1534 | blkg_rwstat_exit(&stats->service_time); | ||
1535 | blkg_rwstat_exit(&stats->wait_time); | ||
1536 | blkg_rwstat_exit(&stats->queued); | ||
1537 | blkg_stat_exit(&stats->time); | ||
1538 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1539 | blkg_stat_exit(&stats->unaccounted_time); | ||
1540 | blkg_stat_exit(&stats->avg_queue_size_sum); | ||
1541 | blkg_stat_exit(&stats->avg_queue_size_samples); | ||
1542 | blkg_stat_exit(&stats->dequeue); | ||
1543 | blkg_stat_exit(&stats->group_wait_time); | ||
1544 | blkg_stat_exit(&stats->idle_time); | ||
1545 | blkg_stat_exit(&stats->empty_time); | ||
1546 | #endif | ||
1547 | } | ||
1548 | |||
1549 | static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) | ||
1550 | { | ||
1551 | if (blkg_rwstat_init(&stats->merged, gfp) || | ||
1552 | blkg_rwstat_init(&stats->service_time, gfp) || | ||
1553 | blkg_rwstat_init(&stats->wait_time, gfp) || | ||
1554 | blkg_rwstat_init(&stats->queued, gfp) || | ||
1555 | blkg_stat_init(&stats->time, gfp)) | ||
1556 | goto err; | ||
1557 | |||
1558 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1559 | if (blkg_stat_init(&stats->unaccounted_time, gfp) || | ||
1560 | blkg_stat_init(&stats->avg_queue_size_sum, gfp) || | ||
1561 | blkg_stat_init(&stats->avg_queue_size_samples, gfp) || | ||
1562 | blkg_stat_init(&stats->dequeue, gfp) || | ||
1563 | blkg_stat_init(&stats->group_wait_time, gfp) || | ||
1564 | blkg_stat_init(&stats->idle_time, gfp) || | ||
1565 | blkg_stat_init(&stats->empty_time, gfp)) | ||
1566 | goto err; | ||
1567 | #endif | ||
1568 | return 0; | ||
1569 | err: | ||
1570 | cfqg_stats_exit(stats); | ||
1571 | return -ENOMEM; | ||
1572 | } | ||
1573 | |||
1574 | static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) | ||
1575 | { | ||
1576 | struct cfq_group_data *cgd; | ||
1577 | |||
1578 | cgd = kzalloc(sizeof(*cgd), gfp); | ||
1579 | if (!cgd) | ||
1580 | return NULL; | ||
1581 | return &cgd->cpd; | ||
1582 | } | ||
1583 | |||
1584 | static void cfq_cpd_init(struct blkcg_policy_data *cpd) | ||
1585 | { | ||
1586 | struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); | ||
1587 | unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? | ||
1588 | CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; | ||
1589 | |||
1590 | if (cpd_to_blkcg(cpd) == &blkcg_root) | ||
1591 | weight *= 2; | ||
1592 | |||
1593 | cgd->weight = weight; | ||
1594 | cgd->leaf_weight = weight; | ||
1595 | } | ||
1596 | |||
1597 | static void cfq_cpd_free(struct blkcg_policy_data *cpd) | ||
1598 | { | ||
1599 | kfree(cpd_to_cfqgd(cpd)); | ||
1600 | } | ||
1601 | |||
1602 | static void cfq_cpd_bind(struct blkcg_policy_data *cpd) | ||
1603 | { | ||
1604 | struct blkcg *blkcg = cpd_to_blkcg(cpd); | ||
1605 | bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); | ||
1606 | unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; | ||
1607 | |||
1608 | if (blkcg == &blkcg_root) | ||
1609 | weight *= 2; | ||
1610 | |||
1611 | WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); | ||
1612 | WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); | ||
1613 | } | ||
1614 | |||
1615 | static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) | ||
1616 | { | ||
1617 | struct cfq_group *cfqg; | ||
1618 | |||
1619 | cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); | ||
1620 | if (!cfqg) | ||
1621 | return NULL; | ||
1622 | |||
1623 | cfq_init_cfqg_base(cfqg); | ||
1624 | if (cfqg_stats_init(&cfqg->stats, gfp)) { | ||
1625 | kfree(cfqg); | ||
1626 | return NULL; | ||
1627 | } | ||
1628 | |||
1629 | return &cfqg->pd; | ||
1630 | } | ||
1631 | |||
1632 | static void cfq_pd_init(struct blkg_policy_data *pd) | ||
1633 | { | ||
1634 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1635 | struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); | ||
1636 | |||
1637 | cfqg->weight = cgd->weight; | ||
1638 | cfqg->leaf_weight = cgd->leaf_weight; | ||
1639 | } | ||
1640 | |||
1641 | static void cfq_pd_offline(struct blkg_policy_data *pd) | ||
1642 | { | ||
1643 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1644 | int i; | ||
1645 | |||
1646 | for (i = 0; i < IOPRIO_BE_NR; i++) { | ||
1647 | if (cfqg->async_cfqq[0][i]) { | ||
1648 | cfq_put_queue(cfqg->async_cfqq[0][i]); | ||
1649 | cfqg->async_cfqq[0][i] = NULL; | ||
1650 | } | ||
1651 | if (cfqg->async_cfqq[1][i]) { | ||
1652 | cfq_put_queue(cfqg->async_cfqq[1][i]); | ||
1653 | cfqg->async_cfqq[1][i] = NULL; | ||
1654 | } | ||
1655 | } | ||
1656 | |||
1657 | if (cfqg->async_idle_cfqq) { | ||
1658 | cfq_put_queue(cfqg->async_idle_cfqq); | ||
1659 | cfqg->async_idle_cfqq = NULL; | ||
1660 | } | ||
1661 | |||
1662 | /* | ||
1663 | * @blkg is going offline and will be ignored by | ||
1664 | * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so | ||
1665 | * that they don't get lost. If IOs complete after this point, the | ||
1666 | * stats for them will be lost. Oh well... | ||
1667 | */ | ||
1668 | cfqg_stats_xfer_dead(cfqg); | ||
1669 | } | ||
1670 | |||
1671 | static void cfq_pd_free(struct blkg_policy_data *pd) | ||
1672 | { | ||
1673 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1674 | |||
1675 | cfqg_stats_exit(&cfqg->stats); | ||
1676 | return kfree(cfqg); | ||
1677 | } | ||
1678 | |||
1679 | static void cfq_pd_reset_stats(struct blkg_policy_data *pd) | ||
1680 | { | ||
1681 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1682 | |||
1683 | cfqg_stats_reset(&cfqg->stats); | ||
1684 | } | ||
1685 | |||
1686 | static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, | ||
1687 | struct blkcg *blkcg) | ||
1688 | { | ||
1689 | struct blkcg_gq *blkg; | ||
1690 | |||
1691 | blkg = blkg_lookup(blkcg, cfqd->queue); | ||
1692 | if (likely(blkg)) | ||
1693 | return blkg_to_cfqg(blkg); | ||
1694 | return NULL; | ||
1695 | } | ||
1696 | |||
1697 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | ||
1698 | { | ||
1699 | cfqq->cfqg = cfqg; | ||
1700 | /* cfqq reference on cfqg */ | ||
1701 | cfqg_get(cfqg); | ||
1702 | } | ||
1703 | |||
1704 | static u64 cfqg_prfill_weight_device(struct seq_file *sf, | ||
1705 | struct blkg_policy_data *pd, int off) | ||
1706 | { | ||
1707 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1708 | |||
1709 | if (!cfqg->dev_weight) | ||
1710 | return 0; | ||
1711 | return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); | ||
1712 | } | ||
1713 | |||
1714 | static int cfqg_print_weight_device(struct seq_file *sf, void *v) | ||
1715 | { | ||
1716 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1717 | cfqg_prfill_weight_device, &blkcg_policy_cfq, | ||
1718 | 0, false); | ||
1719 | return 0; | ||
1720 | } | ||
1721 | |||
1722 | static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, | ||
1723 | struct blkg_policy_data *pd, int off) | ||
1724 | { | ||
1725 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1726 | |||
1727 | if (!cfqg->dev_leaf_weight) | ||
1728 | return 0; | ||
1729 | return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); | ||
1730 | } | ||
1731 | |||
1732 | static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) | ||
1733 | { | ||
1734 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1735 | cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, | ||
1736 | 0, false); | ||
1737 | return 0; | ||
1738 | } | ||
1739 | |||
1740 | static int cfq_print_weight(struct seq_file *sf, void *v) | ||
1741 | { | ||
1742 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
1743 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); | ||
1744 | unsigned int val = 0; | ||
1745 | |||
1746 | if (cgd) | ||
1747 | val = cgd->weight; | ||
1748 | |||
1749 | seq_printf(sf, "%u\n", val); | ||
1750 | return 0; | ||
1751 | } | ||
1752 | |||
1753 | static int cfq_print_leaf_weight(struct seq_file *sf, void *v) | ||
1754 | { | ||
1755 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
1756 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); | ||
1757 | unsigned int val = 0; | ||
1758 | |||
1759 | if (cgd) | ||
1760 | val = cgd->leaf_weight; | ||
1761 | |||
1762 | seq_printf(sf, "%u\n", val); | ||
1763 | return 0; | ||
1764 | } | ||
1765 | |||
1766 | static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, | ||
1767 | char *buf, size_t nbytes, loff_t off, | ||
1768 | bool on_dfl, bool is_leaf_weight) | ||
1769 | { | ||
1770 | unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; | ||
1771 | unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; | ||
1772 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | ||
1773 | struct blkg_conf_ctx ctx; | ||
1774 | struct cfq_group *cfqg; | ||
1775 | struct cfq_group_data *cfqgd; | ||
1776 | int ret; | ||
1777 | u64 v; | ||
1778 | |||
1779 | ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); | ||
1780 | if (ret) | ||
1781 | return ret; | ||
1782 | |||
1783 | if (sscanf(ctx.body, "%llu", &v) == 1) { | ||
1784 | /* require "default" on dfl */ | ||
1785 | ret = -ERANGE; | ||
1786 | if (!v && on_dfl) | ||
1787 | goto out_finish; | ||
1788 | } else if (!strcmp(strim(ctx.body), "default")) { | ||
1789 | v = 0; | ||
1790 | } else { | ||
1791 | ret = -EINVAL; | ||
1792 | goto out_finish; | ||
1793 | } | ||
1794 | |||
1795 | cfqg = blkg_to_cfqg(ctx.blkg); | ||
1796 | cfqgd = blkcg_to_cfqgd(blkcg); | ||
1797 | |||
1798 | ret = -ERANGE; | ||
1799 | if (!v || (v >= min && v <= max)) { | ||
1800 | if (!is_leaf_weight) { | ||
1801 | cfqg->dev_weight = v; | ||
1802 | cfqg->new_weight = v ?: cfqgd->weight; | ||
1803 | } else { | ||
1804 | cfqg->dev_leaf_weight = v; | ||
1805 | cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; | ||
1806 | } | ||
1807 | ret = 0; | ||
1808 | } | ||
1809 | out_finish: | ||
1810 | blkg_conf_finish(&ctx); | ||
1811 | return ret ?: nbytes; | ||
1812 | } | ||
1813 | |||
1814 | static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, | ||
1815 | char *buf, size_t nbytes, loff_t off) | ||
1816 | { | ||
1817 | return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); | ||
1818 | } | ||
1819 | |||
1820 | static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, | ||
1821 | char *buf, size_t nbytes, loff_t off) | ||
1822 | { | ||
1823 | return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); | ||
1824 | } | ||
1825 | |||
1826 | static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, | ||
1827 | bool on_dfl, bool reset_dev, bool is_leaf_weight) | ||
1828 | { | ||
1829 | unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; | ||
1830 | unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; | ||
1831 | struct blkcg *blkcg = css_to_blkcg(css); | ||
1832 | struct blkcg_gq *blkg; | ||
1833 | struct cfq_group_data *cfqgd; | ||
1834 | int ret = 0; | ||
1835 | |||
1836 | if (val < min || val > max) | ||
1837 | return -ERANGE; | ||
1838 | |||
1839 | spin_lock_irq(&blkcg->lock); | ||
1840 | cfqgd = blkcg_to_cfqgd(blkcg); | ||
1841 | if (!cfqgd) { | ||
1842 | ret = -EINVAL; | ||
1843 | goto out; | ||
1844 | } | ||
1845 | |||
1846 | if (!is_leaf_weight) | ||
1847 | cfqgd->weight = val; | ||
1848 | else | ||
1849 | cfqgd->leaf_weight = val; | ||
1850 | |||
1851 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { | ||
1852 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | ||
1853 | |||
1854 | if (!cfqg) | ||
1855 | continue; | ||
1856 | |||
1857 | if (!is_leaf_weight) { | ||
1858 | if (reset_dev) | ||
1859 | cfqg->dev_weight = 0; | ||
1860 | if (!cfqg->dev_weight) | ||
1861 | cfqg->new_weight = cfqgd->weight; | ||
1862 | } else { | ||
1863 | if (reset_dev) | ||
1864 | cfqg->dev_leaf_weight = 0; | ||
1865 | if (!cfqg->dev_leaf_weight) | ||
1866 | cfqg->new_leaf_weight = cfqgd->leaf_weight; | ||
1867 | } | ||
1868 | } | ||
1869 | |||
1870 | out: | ||
1871 | spin_unlock_irq(&blkcg->lock); | ||
1872 | return ret; | ||
1873 | } | ||
1874 | |||
1875 | static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, | ||
1876 | u64 val) | ||
1877 | { | ||
1878 | return __cfq_set_weight(css, val, false, false, false); | ||
1879 | } | ||
1880 | |||
1881 | static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, | ||
1882 | struct cftype *cft, u64 val) | ||
1883 | { | ||
1884 | return __cfq_set_weight(css, val, false, false, true); | ||
1885 | } | ||
1886 | |||
1887 | static int cfqg_print_stat(struct seq_file *sf, void *v) | ||
1888 | { | ||
1889 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, | ||
1890 | &blkcg_policy_cfq, seq_cft(sf)->private, false); | ||
1891 | return 0; | ||
1892 | } | ||
1893 | |||
1894 | static int cfqg_print_rwstat(struct seq_file *sf, void *v) | ||
1895 | { | ||
1896 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, | ||
1897 | &blkcg_policy_cfq, seq_cft(sf)->private, true); | ||
1898 | return 0; | ||
1899 | } | ||
1900 | |||
1901 | static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, | ||
1902 | struct blkg_policy_data *pd, int off) | ||
1903 | { | ||
1904 | u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), | ||
1905 | &blkcg_policy_cfq, off); | ||
1906 | return __blkg_prfill_u64(sf, pd, sum); | ||
1907 | } | ||
1908 | |||
1909 | static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, | ||
1910 | struct blkg_policy_data *pd, int off) | ||
1911 | { | ||
1912 | struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), | ||
1913 | &blkcg_policy_cfq, off); | ||
1914 | return __blkg_prfill_rwstat(sf, pd, &sum); | ||
1915 | } | ||
1916 | |||
1917 | static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) | ||
1918 | { | ||
1919 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1920 | cfqg_prfill_stat_recursive, &blkcg_policy_cfq, | ||
1921 | seq_cft(sf)->private, false); | ||
1922 | return 0; | ||
1923 | } | ||
1924 | |||
1925 | static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) | ||
1926 | { | ||
1927 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1928 | cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, | ||
1929 | seq_cft(sf)->private, true); | ||
1930 | return 0; | ||
1931 | } | ||
1932 | |||
1933 | static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, | ||
1934 | int off) | ||
1935 | { | ||
1936 | u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); | ||
1937 | |||
1938 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
1939 | } | ||
1940 | |||
1941 | static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) | ||
1942 | { | ||
1943 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1944 | cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); | ||
1945 | return 0; | ||
1946 | } | ||
1947 | |||
1948 | static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, | ||
1949 | struct blkg_policy_data *pd, int off) | ||
1950 | { | ||
1951 | struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, | ||
1952 | offsetof(struct blkcg_gq, stat_bytes)); | ||
1953 | u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + | ||
1954 | atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
1955 | |||
1956 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
1957 | } | ||
1958 | |||
1959 | static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) | ||
1960 | { | ||
1961 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1962 | cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, | ||
1963 | false); | ||
1964 | return 0; | ||
1965 | } | ||
1966 | |||
1967 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1968 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, | ||
1969 | struct blkg_policy_data *pd, int off) | ||
1970 | { | ||
1971 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1972 | u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); | ||
1973 | u64 v = 0; | ||
1974 | |||
1975 | if (samples) { | ||
1976 | v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); | ||
1977 | v = div64_u64(v, samples); | ||
1978 | } | ||
1979 | __blkg_prfill_u64(sf, pd, v); | ||
1980 | return 0; | ||
1981 | } | ||
1982 | |||
1983 | /* print avg_queue_size */ | ||
1984 | static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) | ||
1985 | { | ||
1986 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1987 | cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, | ||
1988 | 0, false); | ||
1989 | return 0; | ||
1990 | } | ||
1991 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
1992 | |||
1993 | static struct cftype cfq_blkcg_legacy_files[] = { | ||
1994 | /* on root, weight is mapped to leaf_weight */ | ||
1995 | { | ||
1996 | .name = "weight_device", | ||
1997 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
1998 | .seq_show = cfqg_print_leaf_weight_device, | ||
1999 | .write = cfqg_set_leaf_weight_device, | ||
2000 | }, | ||
2001 | { | ||
2002 | .name = "weight", | ||
2003 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
2004 | .seq_show = cfq_print_leaf_weight, | ||
2005 | .write_u64 = cfq_set_leaf_weight, | ||
2006 | }, | ||
2007 | |||
2008 | /* no such mapping necessary for !roots */ | ||
2009 | { | ||
2010 | .name = "weight_device", | ||
2011 | .flags = CFTYPE_NOT_ON_ROOT, | ||
2012 | .seq_show = cfqg_print_weight_device, | ||
2013 | .write = cfqg_set_weight_device, | ||
2014 | }, | ||
2015 | { | ||
2016 | .name = "weight", | ||
2017 | .flags = CFTYPE_NOT_ON_ROOT, | ||
2018 | .seq_show = cfq_print_weight, | ||
2019 | .write_u64 = cfq_set_weight, | ||
2020 | }, | ||
2021 | |||
2022 | { | ||
2023 | .name = "leaf_weight_device", | ||
2024 | .seq_show = cfqg_print_leaf_weight_device, | ||
2025 | .write = cfqg_set_leaf_weight_device, | ||
2026 | }, | ||
2027 | { | ||
2028 | .name = "leaf_weight", | ||
2029 | .seq_show = cfq_print_leaf_weight, | ||
2030 | .write_u64 = cfq_set_leaf_weight, | ||
2031 | }, | ||
2032 | |||
2033 | /* statistics, covers only the tasks in the cfqg */ | ||
2034 | { | ||
2035 | .name = "time", | ||
2036 | .private = offsetof(struct cfq_group, stats.time), | ||
2037 | .seq_show = cfqg_print_stat, | ||
2038 | }, | ||
2039 | { | ||
2040 | .name = "sectors", | ||
2041 | .seq_show = cfqg_print_stat_sectors, | ||
2042 | }, | ||
2043 | { | ||
2044 | .name = "io_service_bytes", | ||
2045 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2046 | .seq_show = blkg_print_stat_bytes, | ||
2047 | }, | ||
2048 | { | ||
2049 | .name = "io_serviced", | ||
2050 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2051 | .seq_show = blkg_print_stat_ios, | ||
2052 | }, | ||
2053 | { | ||
2054 | .name = "io_service_time", | ||
2055 | .private = offsetof(struct cfq_group, stats.service_time), | ||
2056 | .seq_show = cfqg_print_rwstat, | ||
2057 | }, | ||
2058 | { | ||
2059 | .name = "io_wait_time", | ||
2060 | .private = offsetof(struct cfq_group, stats.wait_time), | ||
2061 | .seq_show = cfqg_print_rwstat, | ||
2062 | }, | ||
2063 | { | ||
2064 | .name = "io_merged", | ||
2065 | .private = offsetof(struct cfq_group, stats.merged), | ||
2066 | .seq_show = cfqg_print_rwstat, | ||
2067 | }, | ||
2068 | { | ||
2069 | .name = "io_queued", | ||
2070 | .private = offsetof(struct cfq_group, stats.queued), | ||
2071 | .seq_show = cfqg_print_rwstat, | ||
2072 | }, | ||
2073 | |||
2074 | /* the same statistics which cover the cfqg and its descendants */ | ||
2075 | { | ||
2076 | .name = "time_recursive", | ||
2077 | .private = offsetof(struct cfq_group, stats.time), | ||
2078 | .seq_show = cfqg_print_stat_recursive, | ||
2079 | }, | ||
2080 | { | ||
2081 | .name = "sectors_recursive", | ||
2082 | .seq_show = cfqg_print_stat_sectors_recursive, | ||
2083 | }, | ||
2084 | { | ||
2085 | .name = "io_service_bytes_recursive", | ||
2086 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2087 | .seq_show = blkg_print_stat_bytes_recursive, | ||
2088 | }, | ||
2089 | { | ||
2090 | .name = "io_serviced_recursive", | ||
2091 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2092 | .seq_show = blkg_print_stat_ios_recursive, | ||
2093 | }, | ||
2094 | { | ||
2095 | .name = "io_service_time_recursive", | ||
2096 | .private = offsetof(struct cfq_group, stats.service_time), | ||
2097 | .seq_show = cfqg_print_rwstat_recursive, | ||
2098 | }, | ||
2099 | { | ||
2100 | .name = "io_wait_time_recursive", | ||
2101 | .private = offsetof(struct cfq_group, stats.wait_time), | ||
2102 | .seq_show = cfqg_print_rwstat_recursive, | ||
2103 | }, | ||
2104 | { | ||
2105 | .name = "io_merged_recursive", | ||
2106 | .private = offsetof(struct cfq_group, stats.merged), | ||
2107 | .seq_show = cfqg_print_rwstat_recursive, | ||
2108 | }, | ||
2109 | { | ||
2110 | .name = "io_queued_recursive", | ||
2111 | .private = offsetof(struct cfq_group, stats.queued), | ||
2112 | .seq_show = cfqg_print_rwstat_recursive, | ||
2113 | }, | ||
2114 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
2115 | { | ||
2116 | .name = "avg_queue_size", | ||
2117 | .seq_show = cfqg_print_avg_queue_size, | ||
2118 | }, | ||
2119 | { | ||
2120 | .name = "group_wait_time", | ||
2121 | .private = offsetof(struct cfq_group, stats.group_wait_time), | ||
2122 | .seq_show = cfqg_print_stat, | ||
2123 | }, | ||
2124 | { | ||
2125 | .name = "idle_time", | ||
2126 | .private = offsetof(struct cfq_group, stats.idle_time), | ||
2127 | .seq_show = cfqg_print_stat, | ||
2128 | }, | ||
2129 | { | ||
2130 | .name = "empty_time", | ||
2131 | .private = offsetof(struct cfq_group, stats.empty_time), | ||
2132 | .seq_show = cfqg_print_stat, | ||
2133 | }, | ||
2134 | { | ||
2135 | .name = "dequeue", | ||
2136 | .private = offsetof(struct cfq_group, stats.dequeue), | ||
2137 | .seq_show = cfqg_print_stat, | ||
2138 | }, | ||
2139 | { | ||
2140 | .name = "unaccounted_time", | ||
2141 | .private = offsetof(struct cfq_group, stats.unaccounted_time), | ||
2142 | .seq_show = cfqg_print_stat, | ||
2143 | }, | ||
2144 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
2145 | { } /* terminate */ | ||
2146 | }; | ||
2147 | |||
2148 | static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) | ||
2149 | { | ||
2150 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
2151 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); | ||
2152 | |||
2153 | seq_printf(sf, "default %u\n", cgd->weight); | ||
2154 | blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, | ||
2155 | &blkcg_policy_cfq, 0, false); | ||
2156 | return 0; | ||
2157 | } | ||
2158 | |||
2159 | static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, | ||
2160 | char *buf, size_t nbytes, loff_t off) | ||
2161 | { | ||
2162 | char *endp; | ||
2163 | int ret; | ||
2164 | u64 v; | ||
2165 | |||
2166 | buf = strim(buf); | ||
2167 | |||
2168 | /* "WEIGHT" or "default WEIGHT" sets the default weight */ | ||
2169 | v = simple_strtoull(buf, &endp, 0); | ||
2170 | if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { | ||
2171 | ret = __cfq_set_weight(of_css(of), v, true, false, false); | ||
2172 | return ret ?: nbytes; | ||
2173 | } | ||
2174 | |||
2175 | /* "MAJ:MIN WEIGHT" */ | ||
2176 | return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); | ||
2177 | } | ||
2178 | |||
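The write handler above accepts three input forms: "WEIGHT", "default WEIGHT" and "MAJ:MIN WEIGHT". A small user-space parser of the same shape, as an illustration only (range checking and the kernfs/blkg plumbing are omitted, and all names below are made up):

#include <stdio.h>
#include <stdlib.h>

enum form { FORM_DEFAULT, FORM_DEVICE, FORM_BAD };

/* Classify one write to the (hypothetical) weight file. */
static enum form classify(const char *buf, unsigned long long *weight,
			  unsigned int *maj, unsigned int *min)
{
	char *endp;

	/* "WEIGHT" or "default WEIGHT" sets the default weight */
	*weight = strtoull(buf, &endp, 0);
	if (*endp == '\0' || sscanf(buf, "default %llu", weight) == 1)
		return FORM_DEFAULT;

	/* otherwise expect "MAJ:MIN WEIGHT" for a per-device override */
	if (sscanf(buf, "%u:%u %llu", maj, min, weight) == 3)
		return FORM_DEVICE;

	return FORM_BAD;
}

int main(void)
{
	const char *tests[] = { "500", "default 100", "8:16 200", "bogus" };
	unsigned long long w;
	unsigned int maj = 0, min = 0;

	for (int i = 0; i < 4; i++) {
		switch (classify(tests[i], &w, &maj, &min)) {
		case FORM_DEFAULT:
			printf("%-12s -> default weight %llu\n", tests[i], w);
			break;
		case FORM_DEVICE:
			printf("%-12s -> dev %u:%u weight %llu\n", tests[i], maj, min, w);
			break;
		default:
			printf("%-12s -> rejected\n", tests[i]);
			break;
		}
	}
	return 0;
}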
2179 | static struct cftype cfq_blkcg_files[] = { | ||
2180 | { | ||
2181 | .name = "weight", | ||
2182 | .flags = CFTYPE_NOT_ON_ROOT, | ||
2183 | .seq_show = cfq_print_weight_on_dfl, | ||
2184 | .write = cfq_set_weight_on_dfl, | ||
2185 | }, | ||
2186 | { } /* terminate */ | ||
2187 | }; | ||
2188 | |||
2189 | #else /* GROUP_IOSCHED */ | ||
2190 | static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, | ||
2191 | struct blkcg *blkcg) | ||
2192 | { | ||
2193 | return cfqd->root_group; | ||
2194 | } | ||
2195 | |||
2196 | static inline void | ||
2197 | cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { | ||
2198 | cfqq->cfqg = cfqg; | ||
2199 | } | ||
2200 | |||
2201 | #endif /* GROUP_IOSCHED */ | ||
2202 | |||
2203 | /* | ||
2204 | * The cfqd->service_trees hold all pending cfq_queues that have | ||
2205 | * requests waiting to be processed. They are sorted in the order in | ||
2206 | * which we will service the queues. | ||
2207 | */ | ||
2208 | static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
2209 | bool add_front) | ||
2210 | { | ||
2211 | struct rb_node **p, *parent; | ||
2212 | struct cfq_queue *__cfqq; | ||
2213 | u64 rb_key; | ||
2214 | struct cfq_rb_root *st; | ||
2215 | bool leftmost = true; | ||
2216 | int new_cfqq = 1; | ||
2217 | u64 now = ktime_get_ns(); | ||
2218 | |||
2219 | st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); | ||
2220 | if (cfq_class_idle(cfqq)) { | ||
2221 | rb_key = CFQ_IDLE_DELAY; | ||
2222 | parent = st->rb_rightmost; | ||
2223 | if (parent && parent != &cfqq->rb_node) { | ||
2224 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); | ||
2225 | rb_key += __cfqq->rb_key; | ||
2226 | } else | ||
2227 | rb_key += now; | ||
2228 | } else if (!add_front) { | ||
2229 | /* | ||
2230 | * Get our rb key offset. Subtract any residual slice | ||
2231 | * value carried from last service. A negative resid | ||
2232 | * count indicates slice overrun, and this should position | ||
2233 | * the next service time further away in the tree. | ||
2234 | */ | ||
2235 | rb_key = cfq_slice_offset(cfqd, cfqq) + now; | ||
2236 | rb_key -= cfqq->slice_resid; | ||
2237 | cfqq->slice_resid = 0; | ||
2238 | } else { | ||
2239 | rb_key = -NSEC_PER_SEC; | ||
2240 | __cfqq = cfq_rb_first(st); | ||
2241 | rb_key += __cfqq ? __cfqq->rb_key : now; | ||
2242 | } | ||
2243 | |||
2244 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) { | ||
2245 | new_cfqq = 0; | ||
2246 | /* | ||
2247 | * same position, nothing more to do | ||
2248 | */ | ||
2249 | if (rb_key == cfqq->rb_key && cfqq->service_tree == st) | ||
2250 | return; | ||
2251 | |||
2252 | cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); | ||
2253 | cfqq->service_tree = NULL; | ||
2254 | } | ||
2255 | |||
2256 | parent = NULL; | ||
2257 | cfqq->service_tree = st; | ||
2258 | p = &st->rb.rb_root.rb_node; | ||
2259 | while (*p) { | ||
2260 | parent = *p; | ||
2261 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); | ||
2262 | |||
2263 | /* | ||
2264 | * sort by key, which represents service time. | ||
2265 | */ | ||
2266 | if (rb_key < __cfqq->rb_key) | ||
2267 | p = &parent->rb_left; | ||
2268 | else { | ||
2269 | p = &parent->rb_right; | ||
2270 | leftmost = false; | ||
2271 | } | ||
2272 | } | ||
2273 | |||
2274 | cfqq->rb_key = rb_key; | ||
2275 | rb_link_node(&cfqq->rb_node, parent, p); | ||
2276 | rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost); | ||
2277 | st->count++; | ||
2278 | if (add_front || !new_cfqq) | ||
2279 | return; | ||
2280 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); | ||
2281 | } | ||
2282 | |||
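To make the three rb_key branches above concrete (illustration only; IDLE_DELAY below is an assumed stand-in for CFQ_IDLE_DELAY and slice_offset for cfq_slice_offset()): idle-class queues sort behind the current rightmost entry, front-added queues jump ahead of the leftmost one with a negative offset, and everything else gets now plus its weighted offset minus any unused residual (a negative residual, i.e. an overrun, pushes the key further out).

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define NSEC_PER_SEC    1000000000LL
#define IDLE_DELAY      (NSEC_PER_SEC / 5)     /* assumed stand-in for CFQ_IDLE_DELAY */

/* slice_offset stands in for cfq_slice_offset(); *_key for the tree extremes. */
static int64_t pick_rb_key(int64_t now, bool idle_class, bool add_front,
			   int64_t rightmost_key, int64_t leftmost_key,
			   int64_t slice_offset, int64_t slice_resid)
{
	if (idle_class)                 /* always queued behind everyone else */
		return rightmost_key + IDLE_DELAY;
	if (add_front)                  /* jump ahead of the current leftmost */
		return leftmost_key - NSEC_PER_SEC;
	/* normal case: offset from now, pulled forward by any unused residual */
	return now + slice_offset - slice_resid;
}

int main(void)
{
	int64_t now = 100 * NSEC_PER_SEC;

	printf("idle   key: %lld\n", (long long)pick_rb_key(now, true,  false, now + 1, now, 0, 0));
	printf("front  key: %lld\n", (long long)pick_rb_key(now, false, true,  now + 1, now, 0, 0));
	printf("normal key: %lld\n", (long long)pick_rb_key(now, false, false, now + 1, now,
							    2000000, 500000));
	return 0;
}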
2283 | static struct cfq_queue * | ||
2284 | cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, | ||
2285 | sector_t sector, struct rb_node **ret_parent, | ||
2286 | struct rb_node ***rb_link) | ||
2287 | { | ||
2288 | struct rb_node **p, *parent; | ||
2289 | struct cfq_queue *cfqq = NULL; | ||
2290 | |||
2291 | parent = NULL; | ||
2292 | p = &root->rb_node; | ||
2293 | while (*p) { | ||
2294 | struct rb_node **n; | ||
2295 | |||
2296 | parent = *p; | ||
2297 | cfqq = rb_entry(parent, struct cfq_queue, p_node); | ||
2298 | |||
2299 | /* | ||
2300 | * Sort strictly based on sector. Smallest to the left, | ||
2301 | * largest to the right. | ||
2302 | */ | ||
2303 | if (sector > blk_rq_pos(cfqq->next_rq)) | ||
2304 | n = &(*p)->rb_right; | ||
2305 | else if (sector < blk_rq_pos(cfqq->next_rq)) | ||
2306 | n = &(*p)->rb_left; | ||
2307 | else | ||
2308 | break; | ||
2309 | p = n; | ||
2310 | cfqq = NULL; | ||
2311 | } | ||
2312 | |||
2313 | *ret_parent = parent; | ||
2314 | if (rb_link) | ||
2315 | *rb_link = p; | ||
2316 | return cfqq; | ||
2317 | } | ||
2318 | |||
2319 | static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2320 | { | ||
2321 | struct rb_node **p, *parent; | ||
2322 | struct cfq_queue *__cfqq; | ||
2323 | |||
2324 | if (cfqq->p_root) { | ||
2325 | rb_erase(&cfqq->p_node, cfqq->p_root); | ||
2326 | cfqq->p_root = NULL; | ||
2327 | } | ||
2328 | |||
2329 | if (cfq_class_idle(cfqq)) | ||
2330 | return; | ||
2331 | if (!cfqq->next_rq) | ||
2332 | return; | ||
2333 | |||
2334 | cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; | ||
2335 | __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, | ||
2336 | blk_rq_pos(cfqq->next_rq), &parent, &p); | ||
2337 | if (!__cfqq) { | ||
2338 | rb_link_node(&cfqq->p_node, parent, p); | ||
2339 | rb_insert_color(&cfqq->p_node, cfqq->p_root); | ||
2340 | } else | ||
2341 | cfqq->p_root = NULL; | ||
2342 | } | ||
2343 | |||
2344 | /* | ||
2345 | * Update cfqq's position in the service tree. | ||
2346 | */ | ||
2347 | static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2348 | { | ||
2349 | /* | ||
2350 | * Resorting requires the cfqq to be on the RR list already. | ||
2351 | */ | ||
2352 | if (cfq_cfqq_on_rr(cfqq)) { | ||
2353 | cfq_service_tree_add(cfqd, cfqq, 0); | ||
2354 | cfq_prio_tree_add(cfqd, cfqq); | ||
2355 | } | ||
2356 | } | ||
2357 | |||
2358 | /* | ||
2359 | * add to busy list of queues for service, trying to be fair in ordering | ||
2360 | * the pending list according to last request service | ||
2361 | */ | ||
2362 | static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2363 | { | ||
2364 | cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); | ||
2365 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | ||
2366 | cfq_mark_cfqq_on_rr(cfqq); | ||
2367 | cfqd->busy_queues++; | ||
2368 | if (cfq_cfqq_sync(cfqq)) | ||
2369 | cfqd->busy_sync_queues++; | ||
2370 | |||
2371 | cfq_resort_rr_list(cfqd, cfqq); | ||
2372 | } | ||
2373 | |||
2374 | /* | ||
2375 | * Called when the cfqq no longer has requests pending, remove it from | ||
2376 | * the service tree. | ||
2377 | */ | ||
2378 | static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2379 | { | ||
2380 | cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); | ||
2381 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); | ||
2382 | cfq_clear_cfqq_on_rr(cfqq); | ||
2383 | |||
2384 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) { | ||
2385 | cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); | ||
2386 | cfqq->service_tree = NULL; | ||
2387 | } | ||
2388 | if (cfqq->p_root) { | ||
2389 | rb_erase(&cfqq->p_node, cfqq->p_root); | ||
2390 | cfqq->p_root = NULL; | ||
2391 | } | ||
2392 | |||
2393 | cfq_group_notify_queue_del(cfqd, cfqq->cfqg); | ||
2394 | BUG_ON(!cfqd->busy_queues); | ||
2395 | cfqd->busy_queues--; | ||
2396 | if (cfq_cfqq_sync(cfqq)) | ||
2397 | cfqd->busy_sync_queues--; | ||
2398 | } | ||
2399 | |||
2400 | /* | ||
2401 | * rb tree support functions | ||
2402 | */ | ||
2403 | static void cfq_del_rq_rb(struct request *rq) | ||
2404 | { | ||
2405 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2406 | const int sync = rq_is_sync(rq); | ||
2407 | |||
2408 | BUG_ON(!cfqq->queued[sync]); | ||
2409 | cfqq->queued[sync]--; | ||
2410 | |||
2411 | elv_rb_del(&cfqq->sort_list, rq); | ||
2412 | |||
2413 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { | ||
2414 | /* | ||
2415 | * Queue will be deleted from service tree when we actually | ||
2416 | * expire it later. Right now just remove it from prio tree | ||
2417 | * as it is empty. | ||
2418 | */ | ||
2419 | if (cfqq->p_root) { | ||
2420 | rb_erase(&cfqq->p_node, cfqq->p_root); | ||
2421 | cfqq->p_root = NULL; | ||
2422 | } | ||
2423 | } | ||
2424 | } | ||
2425 | |||
2426 | static void cfq_add_rq_rb(struct request *rq) | ||
2427 | { | ||
2428 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2429 | struct cfq_data *cfqd = cfqq->cfqd; | ||
2430 | struct request *prev; | ||
2431 | |||
2432 | cfqq->queued[rq_is_sync(rq)]++; | ||
2433 | |||
2434 | elv_rb_add(&cfqq->sort_list, rq); | ||
2435 | |||
2436 | if (!cfq_cfqq_on_rr(cfqq)) | ||
2437 | cfq_add_cfqq_rr(cfqd, cfqq); | ||
2438 | |||
2439 | /* | ||
2440 | * check if this request is a better next-serve candidate | ||
2441 | */ | ||
2442 | prev = cfqq->next_rq; | ||
2443 | cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); | ||
2444 | |||
2445 | /* | ||
2446 | * adjust priority tree position, if ->next_rq changes | ||
2447 | */ | ||
2448 | if (prev != cfqq->next_rq) | ||
2449 | cfq_prio_tree_add(cfqd, cfqq); | ||
2450 | |||
2451 | BUG_ON(!cfqq->next_rq); | ||
2452 | } | ||
2453 | |||
2454 | static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) | ||
2455 | { | ||
2456 | elv_rb_del(&cfqq->sort_list, rq); | ||
2457 | cfqq->queued[rq_is_sync(rq)]--; | ||
2458 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); | ||
2459 | cfq_add_rq_rb(rq); | ||
2460 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, | ||
2461 | rq->cmd_flags); | ||
2462 | } | ||
2463 | |||
2464 | static struct request * | ||
2465 | cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) | ||
2466 | { | ||
2467 | struct task_struct *tsk = current; | ||
2468 | struct cfq_io_cq *cic; | ||
2469 | struct cfq_queue *cfqq; | ||
2470 | |||
2471 | cic = cfq_cic_lookup(cfqd, tsk->io_context); | ||
2472 | if (!cic) | ||
2473 | return NULL; | ||
2474 | |||
2475 | cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf)); | ||
2476 | if (cfqq) | ||
2477 | return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); | ||
2478 | |||
2479 | return NULL; | ||
2480 | } | ||
2481 | |||
2482 | static void cfq_activate_request(struct request_queue *q, struct request *rq) | ||
2483 | { | ||
2484 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2485 | |||
2486 | cfqd->rq_in_driver++; | ||
2487 | cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", | ||
2488 | cfqd->rq_in_driver); | ||
2489 | |||
2490 | cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); | ||
2491 | } | ||
2492 | |||
2493 | static void cfq_deactivate_request(struct request_queue *q, struct request *rq) | ||
2494 | { | ||
2495 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2496 | |||
2497 | WARN_ON(!cfqd->rq_in_driver); | ||
2498 | cfqd->rq_in_driver--; | ||
2499 | cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", | ||
2500 | cfqd->rq_in_driver); | ||
2501 | } | ||
2502 | |||
2503 | static void cfq_remove_request(struct request *rq) | ||
2504 | { | ||
2505 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2506 | |||
2507 | if (cfqq->next_rq == rq) | ||
2508 | cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); | ||
2509 | |||
2510 | list_del_init(&rq->queuelist); | ||
2511 | cfq_del_rq_rb(rq); | ||
2512 | |||
2513 | cfqq->cfqd->rq_queued--; | ||
2514 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); | ||
2515 | if (rq->cmd_flags & REQ_PRIO) { | ||
2516 | WARN_ON(!cfqq->prio_pending); | ||
2517 | cfqq->prio_pending--; | ||
2518 | } | ||
2519 | } | ||
2520 | |||
2521 | static enum elv_merge cfq_merge(struct request_queue *q, struct request **req, | ||
2522 | struct bio *bio) | ||
2523 | { | ||
2524 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2525 | struct request *__rq; | ||
2526 | |||
2527 | __rq = cfq_find_rq_fmerge(cfqd, bio); | ||
2528 | if (__rq && elv_bio_merge_ok(__rq, bio)) { | ||
2529 | *req = __rq; | ||
2530 | return ELEVATOR_FRONT_MERGE; | ||
2531 | } | ||
2532 | |||
2533 | return ELEVATOR_NO_MERGE; | ||
2534 | } | ||
2535 | |||
2536 | static void cfq_merged_request(struct request_queue *q, struct request *req, | ||
2537 | enum elv_merge type) | ||
2538 | { | ||
2539 | if (type == ELEVATOR_FRONT_MERGE) { | ||
2540 | struct cfq_queue *cfqq = RQ_CFQQ(req); | ||
2541 | |||
2542 | cfq_reposition_rq_rb(cfqq, req); | ||
2543 | } | ||
2544 | } | ||
2545 | |||
2546 | static void cfq_bio_merged(struct request_queue *q, struct request *req, | ||
2547 | struct bio *bio) | ||
2548 | { | ||
2549 | cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); | ||
2550 | } | ||
2551 | |||
2552 | static void | ||
2553 | cfq_merged_requests(struct request_queue *q, struct request *rq, | ||
2554 | struct request *next) | ||
2555 | { | ||
2556 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2557 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2558 | |||
2559 | /* | ||
2560 | * reposition in fifo if next is older than rq | ||
2561 | */ | ||
2562 | if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && | ||
2563 | next->fifo_time < rq->fifo_time && | ||
2564 | cfqq == RQ_CFQQ(next)) { | ||
2565 | list_move(&rq->queuelist, &next->queuelist); | ||
2566 | rq->fifo_time = next->fifo_time; | ||
2567 | } | ||
2568 | |||
2569 | if (cfqq->next_rq == next) | ||
2570 | cfqq->next_rq = rq; | ||
2571 | cfq_remove_request(next); | ||
2572 | cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); | ||
2573 | |||
2574 | cfqq = RQ_CFQQ(next); | ||
2575 | /* | ||
2576 | * all requests of this queue are merged to other queues, delete it | ||
2577 | * from the service tree. If it's the active_queue, | ||
2578 | * cfq_dispatch_requests() will choose to expire it or do idle | ||
2579 | */ | ||
2580 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && | ||
2581 | cfqq != cfqd->active_queue) | ||
2582 | cfq_del_cfqq_rr(cfqd, cfqq); | ||
2583 | } | ||
2584 | |||
2585 | static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, | ||
2586 | struct bio *bio) | ||
2587 | { | ||
2588 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2589 | bool is_sync = op_is_sync(bio->bi_opf); | ||
2590 | struct cfq_io_cq *cic; | ||
2591 | struct cfq_queue *cfqq; | ||
2592 | |||
2593 | /* | ||
2594 | * Disallow merge of a sync bio into an async request. | ||
2595 | */ | ||
2596 | if (is_sync && !rq_is_sync(rq)) | ||
2597 | return false; | ||
2598 | |||
2599 | /* | ||
2600 | * Lookup the cfqq that this bio will be queued with and allow | ||
2601 | * merge only if rq is queued there. | ||
2602 | */ | ||
2603 | cic = cfq_cic_lookup(cfqd, current->io_context); | ||
2604 | if (!cic) | ||
2605 | return false; | ||
2606 | |||
2607 | cfqq = cic_to_cfqq(cic, is_sync); | ||
2608 | return cfqq == RQ_CFQQ(rq); | ||
2609 | } | ||
2610 | |||
2611 | static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq, | ||
2612 | struct request *next) | ||
2613 | { | ||
2614 | return RQ_CFQQ(rq) == RQ_CFQQ(next); | ||
2615 | } | ||
2616 | |||
2617 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2618 | { | ||
2619 | hrtimer_try_to_cancel(&cfqd->idle_slice_timer); | ||
2620 | cfqg_stats_update_idle_time(cfqq->cfqg); | ||
2621 | } | ||
2622 | |||
2623 | static void __cfq_set_active_queue(struct cfq_data *cfqd, | ||
2624 | struct cfq_queue *cfqq) | ||
2625 | { | ||
2626 | if (cfqq) { | ||
2627 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", | ||
2628 | cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
2629 | cfqg_stats_update_avg_queue_size(cfqq->cfqg); | ||
2630 | cfqq->slice_start = 0; | ||
2631 | cfqq->dispatch_start = ktime_get_ns(); | ||
2632 | cfqq->allocated_slice = 0; | ||
2633 | cfqq->slice_end = 0; | ||
2634 | cfqq->slice_dispatch = 0; | ||
2635 | cfqq->nr_sectors = 0; | ||
2636 | |||
2637 | cfq_clear_cfqq_wait_request(cfqq); | ||
2638 | cfq_clear_cfqq_must_dispatch(cfqq); | ||
2639 | cfq_clear_cfqq_must_alloc_slice(cfqq); | ||
2640 | cfq_clear_cfqq_fifo_expire(cfqq); | ||
2641 | cfq_mark_cfqq_slice_new(cfqq); | ||
2642 | |||
2643 | cfq_del_timer(cfqd, cfqq); | ||
2644 | } | ||
2645 | |||
2646 | cfqd->active_queue = cfqq; | ||
2647 | } | ||
2648 | |||
2649 | /* | ||
2650 | * current cfqq expired its slice (or was too idle), select new one | ||
2651 | */ | ||
2652 | static void | ||
2653 | __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
2654 | bool timed_out) | ||
2655 | { | ||
2656 | cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); | ||
2657 | |||
2658 | if (cfq_cfqq_wait_request(cfqq)) | ||
2659 | cfq_del_timer(cfqd, cfqq); | ||
2660 | |||
2661 | cfq_clear_cfqq_wait_request(cfqq); | ||
2662 | cfq_clear_cfqq_wait_busy(cfqq); | ||
2663 | |||
2664 | /* | ||
2665 | * If this cfqq is shared between multiple processes, check to | ||
2666 | * make sure that those processes are still issuing I/Os within | ||
2667 | * the mean seek distance. If not, it may be time to break the | ||
2668 | * queues apart again. | ||
2669 | */ | ||
2670 | if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) | ||
2671 | cfq_mark_cfqq_split_coop(cfqq); | ||
2672 | |||
2673 | /* | ||
2674 | * store what was left of this slice, if the queue idled/timed out | ||
2675 | */ | ||
2676 | if (timed_out) { | ||
2677 | if (cfq_cfqq_slice_new(cfqq)) | ||
2678 | cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
2679 | else | ||
2680 | cfqq->slice_resid = cfqq->slice_end - ktime_get_ns(); | ||
2681 | cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid); | ||
2682 | } | ||
2683 | |||
2684 | cfq_group_served(cfqd, cfqq->cfqg, cfqq); | ||
2685 | |||
2686 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
2687 | cfq_del_cfqq_rr(cfqd, cfqq); | ||
2688 | |||
2689 | cfq_resort_rr_list(cfqd, cfqq); | ||
2690 | |||
2691 | if (cfqq == cfqd->active_queue) | ||
2692 | cfqd->active_queue = NULL; | ||
2693 | |||
2694 | if (cfqd->active_cic) { | ||
2695 | put_io_context(cfqd->active_cic->icq.ioc); | ||
2696 | cfqd->active_cic = NULL; | ||
2697 | } | ||
2698 | } | ||
2699 | |||
2700 | static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) | ||
2701 | { | ||
2702 | struct cfq_queue *cfqq = cfqd->active_queue; | ||
2703 | |||
2704 | if (cfqq) | ||
2705 | __cfq_slice_expired(cfqd, cfqq, timed_out); | ||
2706 | } | ||
2707 | |||
2708 | /* | ||
2709 | * Get next queue for service. Unless we have a queue preemption, | ||
2710 | * we'll simply select the first cfqq in the service tree. | ||
2711 | */ | ||
2712 | static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) | ||
2713 | { | ||
2714 | struct cfq_rb_root *st = st_for(cfqd->serving_group, | ||
2715 | cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
2716 | |||
2717 | if (!cfqd->rq_queued) | ||
2718 | return NULL; | ||
2719 | |||
2720 | /* There is nothing to dispatch */ | ||
2721 | if (!st) | ||
2722 | return NULL; | ||
2723 | if (RB_EMPTY_ROOT(&st->rb.rb_root)) | ||
2724 | return NULL; | ||
2725 | return cfq_rb_first(st); | ||
2726 | } | ||
2727 | |||
2728 | static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) | ||
2729 | { | ||
2730 | struct cfq_group *cfqg; | ||
2731 | struct cfq_queue *cfqq; | ||
2732 | int i, j; | ||
2733 | struct cfq_rb_root *st; | ||
2734 | |||
2735 | if (!cfqd->rq_queued) | ||
2736 | return NULL; | ||
2737 | |||
2738 | cfqg = cfq_get_next_cfqg(cfqd); | ||
2739 | if (!cfqg) | ||
2740 | return NULL; | ||
2741 | |||
2742 | for_each_cfqg_st(cfqg, i, j, st) { | ||
2743 | cfqq = cfq_rb_first(st); | ||
2744 | if (cfqq) | ||
2745 | return cfqq; | ||
2746 | } | ||
2747 | return NULL; | ||
2748 | } | ||
2749 | |||
2750 | /* | ||
2751 | * Get and set a new active queue for service. | ||
2752 | */ | ||
2753 | static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, | ||
2754 | struct cfq_queue *cfqq) | ||
2755 | { | ||
2756 | if (!cfqq) | ||
2757 | cfqq = cfq_get_next_queue(cfqd); | ||
2758 | |||
2759 | __cfq_set_active_queue(cfqd, cfqq); | ||
2760 | return cfqq; | ||
2761 | } | ||
2762 | |||
2763 | static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, | ||
2764 | struct request *rq) | ||
2765 | { | ||
2766 | if (blk_rq_pos(rq) >= cfqd->last_position) | ||
2767 | return blk_rq_pos(rq) - cfqd->last_position; | ||
2768 | else | ||
2769 | return cfqd->last_position - blk_rq_pos(rq); | ||
2770 | } | ||
2771 | |||
2772 | static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
2773 | struct request *rq) | ||
2774 | { | ||
2775 | return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; | ||
2776 | } | ||
2777 | |||
2778 | static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, | ||
2779 | struct cfq_queue *cur_cfqq) | ||
2780 | { | ||
2781 | struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; | ||
2782 | struct rb_node *parent, *node; | ||
2783 | struct cfq_queue *__cfqq; | ||
2784 | sector_t sector = cfqd->last_position; | ||
2785 | |||
2786 | if (RB_EMPTY_ROOT(root)) | ||
2787 | return NULL; | ||
2788 | |||
2789 | /* | ||
2790 | * First, if we find a request starting at the end of the last | ||
2791 | * request, choose it. | ||
2792 | */ | ||
2793 | __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); | ||
2794 | if (__cfqq) | ||
2795 | return __cfqq; | ||
2796 | |||
2797 | /* | ||
2798 | * If the exact sector wasn't found, the parent of the NULL leaf | ||
2799 | * will contain the closest sector. | ||
2800 | */ | ||
2801 | __cfqq = rb_entry(parent, struct cfq_queue, p_node); | ||
2802 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) | ||
2803 | return __cfqq; | ||
2804 | |||
2805 | if (blk_rq_pos(__cfqq->next_rq) < sector) | ||
2806 | node = rb_next(&__cfqq->p_node); | ||
2807 | else | ||
2808 | node = rb_prev(&__cfqq->p_node); | ||
2809 | if (!node) | ||
2810 | return NULL; | ||
2811 | |||
2812 | __cfqq = rb_entry(node, struct cfq_queue, p_node); | ||
2813 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) | ||
2814 | return __cfqq; | ||
2815 | |||
2816 | return NULL; | ||
2817 | } | ||
2818 | |||
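The lookup above is essentially "find the queue whose next request is nearest to the last dispatched sector, but accept it only within CFQQ_CLOSE_THR". The same idea over a plain sorted array, as an illustration (the threshold value and names below are made up):

#include <stdio.h>

#define CLOSE_THR 8192ULL     /* assumed "close enough" distance in sectors */

/* Nearest entry to @target in a sorted array, accepted only within CLOSE_THR. */
static long long nearest_within(const unsigned long long *sectors, int n,
				unsigned long long target)
{
	int lo = 0, hi = n;

	while (lo < hi) {                     /* binary search for insertion point */
		int mid = lo + (hi - lo) / 2;
		if (sectors[mid] < target)
			lo = mid + 1;
		else
			hi = mid;
	}

	/* check the entry at/after the target and the one just before it */
	for (int i = lo; i >= lo - 1; i--) {
		if (i < 0 || i >= n)
			continue;
		unsigned long long d = sectors[i] > target ? sectors[i] - target
							   : target - sectors[i];
		if (d <= CLOSE_THR)
			return i;             /* a "close" candidate exists */
	}
	return -1;                            /* nothing close enough */
}

int main(void)
{
	unsigned long long sectors[] = { 1000, 90000, 250000 };

	printf("near 92000:  index %lld\n", nearest_within(sectors, 3, 92000));
	printf("near 500000: index %lld\n", nearest_within(sectors, 3, 500000));
	return 0;
}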
2819 | /* | ||
2820 | * cfqd - obvious | ||
2821 | * cur_cfqq - passed in so that we don't decide that the current queue is | ||
2822 | * closely cooperating with itself. | ||
2823 | * | ||
2824 | * So, basically we're assuming that cur_cfqq has dispatched at least | ||
2825 | * one request, and that cfqd->last_position reflects a position on the disk | ||
2826 | * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid | ||
2827 | * assumption. | ||
2828 | */ | ||
2829 | static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, | ||
2830 | struct cfq_queue *cur_cfqq) | ||
2831 | { | ||
2832 | struct cfq_queue *cfqq; | ||
2833 | |||
2834 | if (cfq_class_idle(cur_cfqq)) | ||
2835 | return NULL; | ||
2836 | if (!cfq_cfqq_sync(cur_cfqq)) | ||
2837 | return NULL; | ||
2838 | if (CFQQ_SEEKY(cur_cfqq)) | ||
2839 | return NULL; | ||
2840 | |||
2841 | /* | ||
2842 | * Don't search priority tree if it's the only queue in the group. | ||
2843 | */ | ||
2844 | if (cur_cfqq->cfqg->nr_cfqq == 1) | ||
2845 | return NULL; | ||
2846 | |||
2847 | /* | ||
2848 | * We should notice if some of the queues are cooperating, e.g. | ||
2849 | * working closely on the same area of the disk. In that case, | ||
2850 | * we can group them together and don't waste time idling. | ||
2851 | */ | ||
2852 | cfqq = cfqq_close(cfqd, cur_cfqq); | ||
2853 | if (!cfqq) | ||
2854 | return NULL; | ||
2855 | |||
2856 | /* If new queue belongs to different cfq_group, don't choose it */ | ||
2857 | if (cur_cfqq->cfqg != cfqq->cfqg) | ||
2858 | return NULL; | ||
2859 | |||
2860 | /* | ||
2861 | * It only makes sense to merge sync queues. | ||
2862 | */ | ||
2863 | if (!cfq_cfqq_sync(cfqq)) | ||
2864 | return NULL; | ||
2865 | if (CFQQ_SEEKY(cfqq)) | ||
2866 | return NULL; | ||
2867 | |||
2868 | /* | ||
2869 | * Do not merge queues of different priority classes | ||
2870 | */ | ||
2871 | if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) | ||
2872 | return NULL; | ||
2873 | |||
2874 | return cfqq; | ||
2875 | } | ||
2876 | |||
2877 | /* | ||
2878 | * Determine whether we should enforce idle window for this queue. | ||
2879 | */ | ||
2880 | |||
2881 | static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2882 | { | ||
2883 | enum wl_class_t wl_class = cfqq_class(cfqq); | ||
2884 | struct cfq_rb_root *st = cfqq->service_tree; | ||
2885 | |||
2886 | BUG_ON(!st); | ||
2887 | BUG_ON(!st->count); | ||
2888 | |||
2889 | if (!cfqd->cfq_slice_idle) | ||
2890 | return false; | ||
2891 | |||
2892 | /* We never do for idle class queues. */ | ||
2893 | if (wl_class == IDLE_WORKLOAD) | ||
2894 | return false; | ||
2895 | |||
2896 | /* We do for queues that were marked with idle window flag. */ | ||
2897 | if (cfq_cfqq_idle_window(cfqq) && | ||
2898 | !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) | ||
2899 | return true; | ||
2900 | |||
2901 | /* | ||
2902 | * Otherwise, we do only if they are the last ones | ||
2903 | * in their service tree. | ||
2904 | */ | ||
2905 | if (st->count == 1 && cfq_cfqq_sync(cfqq) && | ||
2906 | !cfq_io_thinktime_big(cfqd, &st->ttime, false)) | ||
2907 | return true; | ||
2908 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count); | ||
2909 | return false; | ||
2910 | } | ||
2911 | |||
2912 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) | ||
2913 | { | ||
2914 | struct cfq_queue *cfqq = cfqd->active_queue; | ||
2915 | struct cfq_rb_root *st = cfqq->service_tree; | ||
2916 | struct cfq_io_cq *cic; | ||
2917 | u64 sl, group_idle = 0; | ||
2918 | u64 now = ktime_get_ns(); | ||
2919 | |||
2920 | /* | ||
2921 | * SSD device without seek penalty, disable idling. But only do so | ||
2922 | * for devices that support queuing, otherwise we still have a problem | ||
2923 | * with sync vs async workloads. | ||
2924 | */ | ||
2925 | if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag && | ||
2926 | !cfqd->cfq_group_idle) | ||
2927 | return; | ||
2928 | |||
2929 | WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); | ||
2930 | WARN_ON(cfq_cfqq_slice_new(cfqq)); | ||
2931 | |||
2932 | /* | ||
2933 | * idle is disabled, either manually or by past process history | ||
2934 | */ | ||
2935 | if (!cfq_should_idle(cfqd, cfqq)) { | ||
2936 | /* no queue idling. Check for group idling */ | ||
2937 | if (cfqd->cfq_group_idle) | ||
2938 | group_idle = cfqd->cfq_group_idle; | ||
2939 | else | ||
2940 | return; | ||
2941 | } | ||
2942 | |||
2943 | /* | ||
2944 | * still active requests from this queue, don't idle | ||
2945 | */ | ||
2946 | if (cfqq->dispatched) | ||
2947 | return; | ||
2948 | |||
2949 | /* | ||
2950 | * task has exited, don't wait | ||
2951 | */ | ||
2952 | cic = cfqd->active_cic; | ||
2953 | if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) | ||
2954 | return; | ||
2955 | |||
2956 | /* | ||
2957 | * If our average think time is larger than the remaining time | ||
2958 | * slice, then don't idle. This avoids overrunning the allotted | ||
2959 | * time slice. | ||
2960 | */ | ||
2961 | if (sample_valid(cic->ttime.ttime_samples) && | ||
2962 | (cfqq->slice_end - now < cic->ttime.ttime_mean)) { | ||
2963 | cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu", | ||
2964 | cic->ttime.ttime_mean); | ||
2965 | return; | ||
2966 | } | ||
2967 | |||
2968 | /* | ||
2969 | * There are other queues in the group, or this is the only group and | ||
2970 | * its think time is too big; don't do group idle. | ||
2971 | */ | ||
2972 | if (group_idle && | ||
2973 | (cfqq->cfqg->nr_cfqq > 1 || | ||
2974 | cfq_io_thinktime_big(cfqd, &st->ttime, true))) | ||
2975 | return; | ||
2976 | |||
2977 | cfq_mark_cfqq_wait_request(cfqq); | ||
2978 | |||
2979 | if (group_idle) | ||
2980 | sl = cfqd->cfq_group_idle; | ||
2981 | else | ||
2982 | sl = cfqd->cfq_slice_idle; | ||
2983 | |||
2984 | hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl), | ||
2985 | HRTIMER_MODE_REL); | ||
2986 | cfqg_stats_set_start_idle_time(cfqq->cfqg); | ||
2987 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl, | ||
2988 | group_idle ? 1 : 0); | ||
2989 | } | ||
2990 | |||
2991 | /* | ||
2992 | * Move request from internal lists to the request queue dispatch list. | ||
2993 | */ | ||
2994 | static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) | ||
2995 | { | ||
2996 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2997 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2998 | |||
2999 | cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); | ||
3000 | |||
3001 | cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); | ||
3002 | cfq_remove_request(rq); | ||
3003 | cfqq->dispatched++; | ||
3004 | (RQ_CFQG(rq))->dispatched++; | ||
3005 | elv_dispatch_sort(q, rq); | ||
3006 | |||
3007 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; | ||
3008 | cfqq->nr_sectors += blk_rq_sectors(rq); | ||
3009 | } | ||
3010 | |||
3011 | /* | ||
3012 | * return expired entry, or NULL to just start from scratch in rbtree | ||
3013 | */ | ||
3014 | static struct request *cfq_check_fifo(struct cfq_queue *cfqq) | ||
3015 | { | ||
3016 | struct request *rq = NULL; | ||
3017 | |||
3018 | if (cfq_cfqq_fifo_expire(cfqq)) | ||
3019 | return NULL; | ||
3020 | |||
3021 | cfq_mark_cfqq_fifo_expire(cfqq); | ||
3022 | |||
3023 | if (list_empty(&cfqq->fifo)) | ||
3024 | return NULL; | ||
3025 | |||
3026 | rq = rq_entry_fifo(cfqq->fifo.next); | ||
3027 | if (ktime_get_ns() < rq->fifo_time) | ||
3028 | rq = NULL; | ||
3029 | |||
3030 | return rq; | ||
3031 | } | ||
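
A minimal standalone sketch of the FIFO expiry rule above (not kernel code): a request is handed back only once its fifo deadline has passed, otherwise the caller falls back to the sector-sorted tree. The 125ms sync expiry used here is an assumed default.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t fifo_expire_sync = 125ULL * 1000 * 1000;	/* assumed 125ms default */
	uint64_t enqueue_time = 1000ULL * 1000 * 1000;		/* request queued at t = 1.000s */
	uint64_t fifo_time = enqueue_time + fifo_expire_sync;	/* deadline, set at insert time */
	uint64_t now = 1100ULL * 1000 * 1000;			/* t = 1.100s */

	if (now < fifo_time)
		printf("FIFO head not yet expired, dispatch from the sorted tree\n");
	else
		printf("FIFO head expired, dispatch it first\n");
	return 0;
}
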
3032 | |||
3033 | static inline int | ||
3034 | cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3035 | { | ||
3036 | const int base_rq = cfqd->cfq_slice_async_rq; | ||
3037 | |||
3038 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); | ||
3039 | |||
3040 | return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); | ||
3041 | } | ||
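
A standalone sketch of how this cap scales with priority (not kernel code); IOPRIO_BE_NR == 8 and the default cfq_slice_async_rq == 2 are assumptions here.

#include <stdio.h>

#define IOPRIO_BE_NR 8

static int prio_to_maxrq(int base_rq, int ioprio)
{
	/* same formula as cfq_prio_to_maxrq() above */
	return 2 * base_rq * (IOPRIO_BE_NR - ioprio);
}

int main(void)
{
	int prio;

	/* with base_rq == 2: prio 0 (highest) -> 32, prio 7 (lowest) -> 4 */
	for (prio = 0; prio < IOPRIO_BE_NR; prio++)
		printf("ioprio %d -> at most %d async requests per slice\n",
		       prio, prio_to_maxrq(2, prio));
	return 0;
}
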
3042 | |||
3043 | /* | ||
3044 | * Must be called with the queue_lock held. | ||
3045 | */ | ||
3046 | static int cfqq_process_refs(struct cfq_queue *cfqq) | ||
3047 | { | ||
3048 | int process_refs, io_refs; | ||
3049 | |||
3050 | io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; | ||
3051 | process_refs = cfqq->ref - io_refs; | ||
3052 | BUG_ON(process_refs < 0); | ||
3053 | return process_refs; | ||
3054 | } | ||
3055 | |||
3056 | static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) | ||
3057 | { | ||
3058 | int process_refs, new_process_refs; | ||
3059 | struct cfq_queue *__cfqq; | ||
3060 | |||
3061 | /* | ||
3062 | * If there are no process references on the new_cfqq, then it is | ||
3063 | * unsafe to follow the ->new_cfqq chain as other cfqq's in the | ||
3064 | * chain may have dropped their last reference (not just their | ||
3065 | * last process reference). | ||
3066 | */ | ||
3067 | if (!cfqq_process_refs(new_cfqq)) | ||
3068 | return; | ||
3069 | |||
3070 | /* Avoid a circular list and skip interim queue merges */ | ||
3071 | while ((__cfqq = new_cfqq->new_cfqq)) { | ||
3072 | if (__cfqq == cfqq) | ||
3073 | return; | ||
3074 | new_cfqq = __cfqq; | ||
3075 | } | ||
3076 | |||
3077 | process_refs = cfqq_process_refs(cfqq); | ||
3078 | new_process_refs = cfqq_process_refs(new_cfqq); | ||
3079 | /* | ||
3080 | * If the process for the cfqq has gone away, there is no | ||
3081 | * sense in merging the queues. | ||
3082 | */ | ||
3083 | if (process_refs == 0 || new_process_refs == 0) | ||
3084 | return; | ||
3085 | |||
3086 | /* | ||
3087 | * Merge in the direction of the lesser amount of work. | ||
3088 | */ | ||
3089 | if (new_process_refs >= process_refs) { | ||
3090 | cfqq->new_cfqq = new_cfqq; | ||
3091 | new_cfqq->ref += process_refs; | ||
3092 | } else { | ||
3093 | new_cfqq->new_cfqq = cfqq; | ||
3094 | cfqq->ref += new_process_refs; | ||
3095 | } | ||
3096 | } | ||
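
A simplified sketch of the merge-direction rule above (not kernel code): the queue with fewer process references is pointed at the one with more, and the survivor inherits those references.

#include <stdio.h>

struct queue {
	const char *name;
	int ref;			/* process references */
	struct queue *new_queue;	/* merge target, like ->new_cfqq */
};

static void setup_merge(struct queue *q, struct queue *other)
{
	/* merge in the direction of the lesser amount of work */
	if (other->ref >= q->ref) {
		q->new_queue = other;
		other->ref += q->ref;
	} else {
		other->new_queue = q;
		q->ref += other->ref;
	}
}

int main(void)
{
	struct queue a = { "a", 2, NULL };
	struct queue b = { "b", 5, NULL };

	setup_merge(&a, &b);	/* a (2 refs) is folded into b (5 refs) */
	printf("a merges into %s, which now holds %d refs\n",
	       a.new_queue->name, b.ref);
	return 0;
}
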
3097 | |||
3098 | static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, | ||
3099 | struct cfq_group *cfqg, enum wl_class_t wl_class) | ||
3100 | { | ||
3101 | struct cfq_queue *queue; | ||
3102 | int i; | ||
3103 | bool key_valid = false; | ||
3104 | u64 lowest_key = 0; | ||
3105 | enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; | ||
3106 | |||
3107 | for (i = 0; i <= SYNC_WORKLOAD; ++i) { | ||
3108 | /* select the one with lowest rb_key */ | ||
3109 | queue = cfq_rb_first(st_for(cfqg, wl_class, i)); | ||
3110 | if (queue && | ||
3111 | (!key_valid || queue->rb_key < lowest_key)) { | ||
3112 | lowest_key = queue->rb_key; | ||
3113 | cur_best = i; | ||
3114 | key_valid = true; | ||
3115 | } | ||
3116 | } | ||
3117 | |||
3118 | return cur_best; | ||
3119 | } | ||
3120 | |||
3121 | static void | ||
3122 | choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
3123 | { | ||
3124 | u64 slice; | ||
3125 | unsigned count; | ||
3126 | struct cfq_rb_root *st; | ||
3127 | u64 group_slice; | ||
3128 | enum wl_class_t original_class = cfqd->serving_wl_class; | ||
3129 | u64 now = ktime_get_ns(); | ||
3130 | |||
3131 | /* Choose next priority. RT > BE > IDLE */ | ||
3132 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) | ||
3133 | cfqd->serving_wl_class = RT_WORKLOAD; | ||
3134 | else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) | ||
3135 | cfqd->serving_wl_class = BE_WORKLOAD; | ||
3136 | else { | ||
3137 | cfqd->serving_wl_class = IDLE_WORKLOAD; | ||
3138 | cfqd->workload_expires = now + jiffies_to_nsecs(1); | ||
3139 | return; | ||
3140 | } | ||
3141 | |||
3142 | if (original_class != cfqd->serving_wl_class) | ||
3143 | goto new_workload; | ||
3144 | |||
3145 | /* | ||
3146 | * For RT and BE, we have to choose also the type | ||
3147 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload | ||
3148 | * expiration time | ||
3149 | */ | ||
3150 | st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
3151 | count = st->count; | ||
3152 | |||
3153 | /* | ||
3154 | * check workload expiration, and that we still have other queues ready | ||
3155 | */ | ||
3156 | if (count && !(now > cfqd->workload_expires)) | ||
3157 | return; | ||
3158 | |||
3159 | new_workload: | ||
3160 | /* otherwise select new workload type */ | ||
3161 | cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, | ||
3162 | cfqd->serving_wl_class); | ||
3163 | st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
3164 | count = st->count; | ||
3165 | |||
3166 | /* | ||
3167 | * the workload slice is computed as a fraction of target latency | ||
3168 | * proportional to the number of queues in that workload, over | ||
3169 | * all the queues in the same priority class | ||
3170 | */ | ||
3171 | group_slice = cfq_group_slice(cfqd, cfqg); | ||
3172 | |||
3173 | slice = div_u64(group_slice * count, | ||
3174 | max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], | ||
3175 | cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, | ||
3176 | cfqg))); | ||
3177 | |||
3178 | if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { | ||
3179 | u64 tmp; | ||
3180 | |||
3181 | /* | ||
3182 | * Async queues are currently system wide. Just taking | ||
3183 | * the proportion of queues within the same group will lead to a higher | ||
3184 | * async ratio system wide, as the root group generally has a | ||
3185 | * higher weight. A more accurate approach would be to | ||
3186 | * calculate the system wide async/sync ratio. | ||
3187 | */ | ||
3188 | tmp = cfqd->cfq_target_latency * | ||
3189 | cfqg_busy_async_queues(cfqd, cfqg); | ||
3190 | tmp = div_u64(tmp, cfqd->busy_queues); | ||
3191 | slice = min_t(u64, slice, tmp); | ||
3192 | |||
3193 | /* async workload slice is scaled down according to | ||
3194 | * the sync/async slice ratio. */ | ||
3195 | slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]); | ||
3196 | } else | ||
3197 | /* sync workload slice is at least 2 * cfq_slice_idle */ | ||
3198 | slice = max(slice, 2 * cfqd->cfq_slice_idle); | ||
3199 | |||
3200 | slice = max_t(u64, slice, CFQ_MIN_TT); | ||
3201 | cfq_log(cfqd, "workload slice:%llu", slice); | ||
3202 | cfqd->workload_expires = now + slice; | ||
3203 | } | ||
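
A rough userspace sketch of the slice arithmetic above for the sync case (not kernel code). The 300ms target latency, 8ms slice_idle and ~2ms minimum are assumed defaults, and the single denominator stands in for max(busy_queues_avg, busy queues of the class).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t group_slice = 300ULL * 1000 * 1000;	/* assumed 300ms target latency */
	unsigned count = 2;				/* queues of the chosen type */
	unsigned busy_in_class = 5;			/* busy queues in the class */
	uint64_t slice_idle = 8ULL * 1000 * 1000;	/* assumed 8ms default */
	uint64_t min_tt = 2ULL * 1000 * 1000;		/* stand-in for CFQ_MIN_TT */

	/* fraction of the group slice proportional to the queue count */
	uint64_t slice = group_slice * count / busy_in_class;

	/* sync workload: at least 2 * slice_idle and never below the minimum */
	if (slice < 2 * slice_idle)
		slice = 2 * slice_idle;
	if (slice < min_tt)
		slice = min_tt;

	printf("sync workload slice: %llu ms\n",
	       (unsigned long long)(slice / 1000000));	/* 120 ms here */
	return 0;
}
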
3204 | |||
3205 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) | ||
3206 | { | ||
3207 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
3208 | struct cfq_group *cfqg; | ||
3209 | |||
3210 | if (RB_EMPTY_ROOT(&st->rb.rb_root)) | ||
3211 | return NULL; | ||
3212 | cfqg = cfq_rb_first_group(st); | ||
3213 | update_min_vdisktime(st); | ||
3214 | return cfqg; | ||
3215 | } | ||
3216 | |||
3217 | static void cfq_choose_cfqg(struct cfq_data *cfqd) | ||
3218 | { | ||
3219 | struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); | ||
3220 | u64 now = ktime_get_ns(); | ||
3221 | |||
3222 | cfqd->serving_group = cfqg; | ||
3223 | |||
3224 | /* Restore the workload type data */ | ||
3225 | if (cfqg->saved_wl_slice) { | ||
3226 | cfqd->workload_expires = now + cfqg->saved_wl_slice; | ||
3227 | cfqd->serving_wl_type = cfqg->saved_wl_type; | ||
3228 | cfqd->serving_wl_class = cfqg->saved_wl_class; | ||
3229 | } else | ||
3230 | cfqd->workload_expires = now - 1; | ||
3231 | |||
3232 | choose_wl_class_and_type(cfqd, cfqg); | ||
3233 | } | ||
3234 | |||
3235 | /* | ||
3236 | * Select a queue for service. If we have a current active queue, | ||
3237 | * check whether to continue servicing it, or retrieve and set a new one. | ||
3238 | */ | ||
3239 | static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) | ||
3240 | { | ||
3241 | struct cfq_queue *cfqq, *new_cfqq = NULL; | ||
3242 | u64 now = ktime_get_ns(); | ||
3243 | |||
3244 | cfqq = cfqd->active_queue; | ||
3245 | if (!cfqq) | ||
3246 | goto new_queue; | ||
3247 | |||
3248 | if (!cfqd->rq_queued) | ||
3249 | return NULL; | ||
3250 | |||
3251 | /* | ||
3252 | * We were waiting for the group to get backlogged. Expire the queue | ||
3253 | */ | ||
3254 | if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
3255 | goto expire; | ||
3256 | |||
3257 | /* | ||
3258 | * The active queue has run out of time, expire it and select new. | ||
3259 | */ | ||
3260 | if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { | ||
3261 | /* | ||
3262 | * If slice had not expired at the completion of last request | ||
3263 | * we might not have turned on wait_busy flag. Don't expire | ||
3264 | * the queue yet. Allow the group to get backlogged. | ||
3265 | * | ||
3266 | * The very fact that we have used the slice means that we | ||
3267 | * have been idling all along on this queue and it should be | ||
3268 | * ok to wait for this request to complete. | ||
3269 | */ | ||
3270 | if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) | ||
3271 | && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { | ||
3272 | cfqq = NULL; | ||
3273 | goto keep_queue; | ||
3274 | } else | ||
3275 | goto check_group_idle; | ||
3276 | } | ||
3277 | |||
3278 | /* | ||
3279 | * The active queue has requests and isn't expired, allow it to | ||
3280 | * dispatch. | ||
3281 | */ | ||
3282 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
3283 | goto keep_queue; | ||
3284 | |||
3285 | /* | ||
3286 | * If another queue has a request waiting within our mean seek | ||
3287 | * distance, let it run. The expire code will check for close | ||
3288 | * cooperators and put the close queue at the front of the service | ||
3289 | * tree. If possible, merge the expiring queue with the new cfqq. | ||
3290 | */ | ||
3291 | new_cfqq = cfq_close_cooperator(cfqd, cfqq); | ||
3292 | if (new_cfqq) { | ||
3293 | if (!cfqq->new_cfqq) | ||
3294 | cfq_setup_merge(cfqq, new_cfqq); | ||
3295 | goto expire; | ||
3296 | } | ||
3297 | |||
3298 | /* | ||
3299 | * No requests pending. If the active queue still has requests in | ||
3300 | * flight or is idling for a new request, allow either of these | ||
3301 | * conditions to happen (or time out) before selecting a new queue. | ||
3302 | */ | ||
3303 | if (hrtimer_active(&cfqd->idle_slice_timer)) { | ||
3304 | cfqq = NULL; | ||
3305 | goto keep_queue; | ||
3306 | } | ||
3307 | |||
3308 | /* | ||
3309 | * This is a deep seek queue, but the device is much faster than | ||
3310 | * the queue can deliver, don't idle | ||
3311 | */ | ||
3312 | if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && | ||
3313 | (cfq_cfqq_slice_new(cfqq) || | ||
3314 | (cfqq->slice_end - now > now - cfqq->slice_start))) { | ||
3315 | cfq_clear_cfqq_deep(cfqq); | ||
3316 | cfq_clear_cfqq_idle_window(cfqq); | ||
3317 | } | ||
3318 | |||
3319 | if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { | ||
3320 | cfqq = NULL; | ||
3321 | goto keep_queue; | ||
3322 | } | ||
3323 | |||
3324 | /* | ||
3325 | * If group idle is enabled and there are requests dispatched from | ||
3326 | * this group, wait for requests to complete. | ||
3327 | */ | ||
3328 | check_group_idle: | ||
3329 | if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && | ||
3330 | cfqq->cfqg->dispatched && | ||
3331 | !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { | ||
3332 | cfqq = NULL; | ||
3333 | goto keep_queue; | ||
3334 | } | ||
3335 | |||
3336 | expire: | ||
3337 | cfq_slice_expired(cfqd, 0); | ||
3338 | new_queue: | ||
3339 | /* | ||
3340 | * Current queue expired. Check if we have to switch to a new | ||
3341 | * service tree | ||
3342 | */ | ||
3343 | if (!new_cfqq) | ||
3344 | cfq_choose_cfqg(cfqd); | ||
3345 | |||
3346 | cfqq = cfq_set_active_queue(cfqd, new_cfqq); | ||
3347 | keep_queue: | ||
3348 | return cfqq; | ||
3349 | } | ||
3350 | |||
3351 | static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) | ||
3352 | { | ||
3353 | int dispatched = 0; | ||
3354 | |||
3355 | while (cfqq->next_rq) { | ||
3356 | cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); | ||
3357 | dispatched++; | ||
3358 | } | ||
3359 | |||
3360 | BUG_ON(!list_empty(&cfqq->fifo)); | ||
3361 | |||
3362 | /* By default cfqq is not expired if it is empty. Do it explicitly */ | ||
3363 | __cfq_slice_expired(cfqq->cfqd, cfqq, 0); | ||
3364 | return dispatched; | ||
3365 | } | ||
3366 | |||
3367 | /* | ||
3368 | * Drain our current requests. Used for barriers and when switching | ||
3369 | * io schedulers on-the-fly. | ||
3370 | */ | ||
3371 | static int cfq_forced_dispatch(struct cfq_data *cfqd) | ||
3372 | { | ||
3373 | struct cfq_queue *cfqq; | ||
3374 | int dispatched = 0; | ||
3375 | |||
3376 | /* Expire the timeslice of the current active queue first */ | ||
3377 | cfq_slice_expired(cfqd, 0); | ||
3378 | while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { | ||
3379 | __cfq_set_active_queue(cfqd, cfqq); | ||
3380 | dispatched += __cfq_forced_dispatch_cfqq(cfqq); | ||
3381 | } | ||
3382 | |||
3383 | BUG_ON(cfqd->busy_queues); | ||
3384 | |||
3385 | cfq_log(cfqd, "forced_dispatch=%d", dispatched); | ||
3386 | return dispatched; | ||
3387 | } | ||
3388 | |||
3389 | static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, | ||
3390 | struct cfq_queue *cfqq) | ||
3391 | { | ||
3392 | u64 now = ktime_get_ns(); | ||
3393 | |||
3394 | /* the queue hasn't finished any request, can't estimate */ | ||
3395 | if (cfq_cfqq_slice_new(cfqq)) | ||
3396 | return true; | ||
3397 | if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end) | ||
3398 | return true; | ||
3399 | |||
3400 | return false; | ||
3401 | } | ||
3402 | |||
3403 | static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3404 | { | ||
3405 | unsigned int max_dispatch; | ||
3406 | |||
3407 | if (cfq_cfqq_must_dispatch(cfqq)) | ||
3408 | return true; | ||
3409 | |||
3410 | /* | ||
3411 | * Drain async requests before we start sync IO | ||
3412 | */ | ||
3413 | if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) | ||
3414 | return false; | ||
3415 | |||
3416 | /* | ||
3417 | * If this is an async queue and we have sync IO in flight, let it wait | ||
3418 | */ | ||
3419 | if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) | ||
3420 | return false; | ||
3421 | |||
3422 | max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); | ||
3423 | if (cfq_class_idle(cfqq)) | ||
3424 | max_dispatch = 1; | ||
3425 | |||
3426 | /* | ||
3427 | * Does this cfqq already have too much IO in flight? | ||
3428 | */ | ||
3429 | if (cfqq->dispatched >= max_dispatch) { | ||
3430 | bool promote_sync = false; | ||
3431 | /* | ||
3432 | * idle queue must always only have a single IO in flight | ||
3433 | */ | ||
3434 | if (cfq_class_idle(cfqq)) | ||
3435 | return false; | ||
3436 | |||
3437 | /* | ||
3438 | * If there is only one sync queue | ||
3439 | * we can ignore the async queue here and give the sync | ||
3440 | * queue no dispatch limit. The reason is that a sync queue can | ||
3441 | * preempt an async queue, so limiting the sync queue doesn't make | ||
3442 | * sense. This is useful for the aiostress test. | ||
3443 | */ | ||
3444 | if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) | ||
3445 | promote_sync = true; | ||
3446 | |||
3447 | /* | ||
3448 | * We have other queues, don't allow more IO from this one | ||
3449 | */ | ||
3450 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && | ||
3451 | !promote_sync) | ||
3452 | return false; | ||
3453 | |||
3454 | /* | ||
3455 | * Sole queue user, no limit | ||
3456 | */ | ||
3457 | if (cfqd->busy_queues == 1 || promote_sync) | ||
3458 | max_dispatch = -1; | ||
3459 | else | ||
3460 | /* | ||
3461 | * Normally we start throttling cfqq when cfq_quantum/2 | ||
3462 | * requests have been dispatched. But we can drive | ||
3463 | * deeper queue depths at the beginning of a slice, | ||
3464 | * subject to an upper limit of cfq_quantum. | ||
3465 | */ | ||
3466 | max_dispatch = cfqd->cfq_quantum; | ||
3467 | } | ||
3468 | |||
3469 | /* | ||
3470 | * Async queues must wait a bit before being allowed dispatch. | ||
3471 | * We also ramp up the dispatch depth gradually for async IO, | ||
3472 | * based on the last sync IO we serviced | ||
3473 | */ | ||
3474 | if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { | ||
3475 | u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync; | ||
3476 | unsigned int depth; | ||
3477 | |||
3478 | depth = div64_u64(last_sync, cfqd->cfq_slice[1]); | ||
3479 | if (!depth && !cfqq->dispatched) | ||
3480 | depth = 1; | ||
3481 | if (depth < max_dispatch) | ||
3482 | max_dispatch = depth; | ||
3483 | } | ||
3484 | |||
3485 | /* | ||
3486 | * If we're below the current max, allow a dispatch | ||
3487 | */ | ||
3488 | return cfqq->dispatched < max_dispatch; | ||
3489 | } | ||
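
A small sketch of the async ramp-up above (not kernel code): the longer it has been since the last delayed sync completion, the deeper the async dispatch depth may grow, capped by max_dispatch. The 100ms sync slice is an assumed default.

#include <stdio.h>
#include <stdint.h>

static unsigned async_depth(uint64_t ns_since_last_sync, unsigned max_dispatch,
			    unsigned already_dispatched)
{
	const uint64_t slice_sync = 100ULL * 1000 * 1000;	/* assumed 100ms default */
	unsigned depth = ns_since_last_sync / slice_sync;

	if (!depth && !already_dispatched)
		depth = 1;			/* always let one request through */
	if (depth < max_dispatch)
		max_dispatch = depth;
	return max_dispatch;
}

int main(void)
{
	printf("%u\n", async_depth(50ULL * 1000 * 1000, 8, 0));		/* 1 */
	printf("%u\n", async_depth(350ULL * 1000 * 1000, 8, 4));	/* 3 */
	printf("%u\n", async_depth(2000ULL * 1000 * 1000, 8, 4));	/* 8 */
	return 0;
}
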
3490 | |||
3491 | /* | ||
3492 | * Dispatch a request from cfqq, moving it to the request queue | ||
3493 | * dispatch list. | ||
3494 | */ | ||
3495 | static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3496 | { | ||
3497 | struct request *rq; | ||
3498 | |||
3499 | BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); | ||
3500 | |||
3501 | rq = cfq_check_fifo(cfqq); | ||
3502 | if (rq) | ||
3503 | cfq_mark_cfqq_must_dispatch(cfqq); | ||
3504 | |||
3505 | if (!cfq_may_dispatch(cfqd, cfqq)) | ||
3506 | return false; | ||
3507 | |||
3508 | /* | ||
3509 | * follow expired path, else get first next available | ||
3510 | */ | ||
3511 | if (!rq) | ||
3512 | rq = cfqq->next_rq; | ||
3513 | else | ||
3514 | cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); | ||
3515 | |||
3516 | /* | ||
3517 | * insert request into driver dispatch list | ||
3518 | */ | ||
3519 | cfq_dispatch_insert(cfqd->queue, rq); | ||
3520 | |||
3521 | if (!cfqd->active_cic) { | ||
3522 | struct cfq_io_cq *cic = RQ_CIC(rq); | ||
3523 | |||
3524 | atomic_long_inc(&cic->icq.ioc->refcount); | ||
3525 | cfqd->active_cic = cic; | ||
3526 | } | ||
3527 | |||
3528 | return true; | ||
3529 | } | ||
3530 | |||
3531 | /* | ||
3532 | * Find the cfqq that we need to service and move a request from that to the | ||
3533 | * dispatch list | ||
3534 | */ | ||
3535 | static int cfq_dispatch_requests(struct request_queue *q, int force) | ||
3536 | { | ||
3537 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
3538 | struct cfq_queue *cfqq; | ||
3539 | |||
3540 | if (!cfqd->busy_queues) | ||
3541 | return 0; | ||
3542 | |||
3543 | if (unlikely(force)) | ||
3544 | return cfq_forced_dispatch(cfqd); | ||
3545 | |||
3546 | cfqq = cfq_select_queue(cfqd); | ||
3547 | if (!cfqq) | ||
3548 | return 0; | ||
3549 | |||
3550 | /* | ||
3551 | * Dispatch a request from this cfqq, if it is allowed | ||
3552 | */ | ||
3553 | if (!cfq_dispatch_request(cfqd, cfqq)) | ||
3554 | return 0; | ||
3555 | |||
3556 | cfqq->slice_dispatch++; | ||
3557 | cfq_clear_cfqq_must_dispatch(cfqq); | ||
3558 | |||
3559 | /* | ||
3560 | * expire an async queue immediately if it has used up its slice. idle | ||
3561 | * queues always expire after 1 dispatch round. | ||
3562 | */ | ||
3563 | if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && | ||
3564 | cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || | ||
3565 | cfq_class_idle(cfqq))) { | ||
3566 | cfqq->slice_end = ktime_get_ns() + 1; | ||
3567 | cfq_slice_expired(cfqd, 0); | ||
3568 | } | ||
3569 | |||
3570 | cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); | ||
3571 | return 1; | ||
3572 | } | ||
3573 | |||
3574 | /* | ||
3575 | * task holds one reference to the queue, dropped when task exits. each rq | ||
3576 | * in-flight on this queue also holds a reference, dropped when rq is freed. | ||
3577 | * | ||
3578 | * Each cfq queue took a reference on the parent group. Drop it now. | ||
3579 | * queue lock must be held here. | ||
3580 | */ | ||
3581 | static void cfq_put_queue(struct cfq_queue *cfqq) | ||
3582 | { | ||
3583 | struct cfq_data *cfqd = cfqq->cfqd; | ||
3584 | struct cfq_group *cfqg; | ||
3585 | |||
3586 | BUG_ON(cfqq->ref <= 0); | ||
3587 | |||
3588 | cfqq->ref--; | ||
3589 | if (cfqq->ref) | ||
3590 | return; | ||
3591 | |||
3592 | cfq_log_cfqq(cfqd, cfqq, "put_queue"); | ||
3593 | BUG_ON(rb_first(&cfqq->sort_list)); | ||
3594 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); | ||
3595 | cfqg = cfqq->cfqg; | ||
3596 | |||
3597 | if (unlikely(cfqd->active_queue == cfqq)) { | ||
3598 | __cfq_slice_expired(cfqd, cfqq, 0); | ||
3599 | cfq_schedule_dispatch(cfqd); | ||
3600 | } | ||
3601 | |||
3602 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | ||
3603 | kmem_cache_free(cfq_pool, cfqq); | ||
3604 | cfqg_put(cfqg); | ||
3605 | } | ||
3606 | |||
3607 | static void cfq_put_cooperator(struct cfq_queue *cfqq) | ||
3608 | { | ||
3609 | struct cfq_queue *__cfqq, *next; | ||
3610 | |||
3611 | /* | ||
3612 | * If this queue was scheduled to merge with another queue, be | ||
3613 | * sure to drop the reference taken on that queue (and others in | ||
3614 | * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. | ||
3615 | */ | ||
3616 | __cfqq = cfqq->new_cfqq; | ||
3617 | while (__cfqq) { | ||
3618 | if (__cfqq == cfqq) { | ||
3619 | WARN(1, "cfqq->new_cfqq loop detected\n"); | ||
3620 | break; | ||
3621 | } | ||
3622 | next = __cfqq->new_cfqq; | ||
3623 | cfq_put_queue(__cfqq); | ||
3624 | __cfqq = next; | ||
3625 | } | ||
3626 | } | ||
3627 | |||
3628 | static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3629 | { | ||
3630 | if (unlikely(cfqq == cfqd->active_queue)) { | ||
3631 | __cfq_slice_expired(cfqd, cfqq, 0); | ||
3632 | cfq_schedule_dispatch(cfqd); | ||
3633 | } | ||
3634 | |||
3635 | cfq_put_cooperator(cfqq); | ||
3636 | |||
3637 | cfq_put_queue(cfqq); | ||
3638 | } | ||
3639 | |||
3640 | static void cfq_init_icq(struct io_cq *icq) | ||
3641 | { | ||
3642 | struct cfq_io_cq *cic = icq_to_cic(icq); | ||
3643 | |||
3644 | cic->ttime.last_end_request = ktime_get_ns(); | ||
3645 | } | ||
3646 | |||
3647 | static void cfq_exit_icq(struct io_cq *icq) | ||
3648 | { | ||
3649 | struct cfq_io_cq *cic = icq_to_cic(icq); | ||
3650 | struct cfq_data *cfqd = cic_to_cfqd(cic); | ||
3651 | |||
3652 | if (cic_to_cfqq(cic, false)) { | ||
3653 | cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); | ||
3654 | cic_set_cfqq(cic, NULL, false); | ||
3655 | } | ||
3656 | |||
3657 | if (cic_to_cfqq(cic, true)) { | ||
3658 | cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); | ||
3659 | cic_set_cfqq(cic, NULL, true); | ||
3660 | } | ||
3661 | } | ||
3662 | |||
3663 | static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) | ||
3664 | { | ||
3665 | struct task_struct *tsk = current; | ||
3666 | int ioprio_class; | ||
3667 | |||
3668 | if (!cfq_cfqq_prio_changed(cfqq)) | ||
3669 | return; | ||
3670 | |||
3671 | ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); | ||
3672 | switch (ioprio_class) { | ||
3673 | default: | ||
3674 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); | ||
3675 | /* fall through */ | ||
3676 | case IOPRIO_CLASS_NONE: | ||
3677 | /* | ||
3678 | * no prio set, inherit CPU scheduling settings | ||
3679 | */ | ||
3680 | cfqq->ioprio = task_nice_ioprio(tsk); | ||
3681 | cfqq->ioprio_class = task_nice_ioclass(tsk); | ||
3682 | break; | ||
3683 | case IOPRIO_CLASS_RT: | ||
3684 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | ||
3685 | cfqq->ioprio_class = IOPRIO_CLASS_RT; | ||
3686 | break; | ||
3687 | case IOPRIO_CLASS_BE: | ||
3688 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | ||
3689 | cfqq->ioprio_class = IOPRIO_CLASS_BE; | ||
3690 | break; | ||
3691 | case IOPRIO_CLASS_IDLE: | ||
3692 | cfqq->ioprio_class = IOPRIO_CLASS_IDLE; | ||
3693 | cfqq->ioprio = 7; | ||
3694 | cfq_clear_cfqq_idle_window(cfqq); | ||
3695 | break; | ||
3696 | } | ||
3697 | |||
3698 | /* | ||
3699 | * keep track of original prio settings in case we have to temporarily | ||
3700 | * elevate the priority of this queue | ||
3701 | */ | ||
3702 | cfqq->org_ioprio = cfqq->ioprio; | ||
3703 | cfqq->org_ioprio_class = cfqq->ioprio_class; | ||
3704 | cfq_clear_cfqq_prio_changed(cfqq); | ||
3705 | } | ||
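
For reference, a sketch of how an ioprio value packs the class/data pair that IOPRIO_PRIO_CLASS() and IOPRIO_PRIO_DATA() pull apart above (not kernel code; the 13-bit shift mirrors IOPRIO_CLASS_SHIFT and is an assumption here).

#include <stdio.h>

#define CLASS_SHIFT		13			/* mirrors IOPRIO_CLASS_SHIFT */
#define PRIO_CLASS(p)		((p) >> CLASS_SHIFT)
#define PRIO_DATA(p)		((p) & ((1 << CLASS_SHIFT) - 1))
#define PRIO_VALUE(c, d)	(((c) << CLASS_SHIFT) | (d))

enum { CLASS_NONE, CLASS_RT, CLASS_BE, CLASS_IDLE };

int main(void)
{
	int ioprio = PRIO_VALUE(CLASS_BE, 4);	/* e.g. "ionice -c2 -n4" */

	printf("class=%d data=%d\n", PRIO_CLASS(ioprio), PRIO_DATA(ioprio));
	return 0;
}
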
3706 | |||
3707 | static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) | ||
3708 | { | ||
3709 | int ioprio = cic->icq.ioc->ioprio; | ||
3710 | struct cfq_data *cfqd = cic_to_cfqd(cic); | ||
3711 | struct cfq_queue *cfqq; | ||
3712 | |||
3713 | /* | ||
3714 | * Check whether ioprio has changed. The condition may trigger | ||
3715 | * spuriously on a newly created cic but there's no harm. | ||
3716 | */ | ||
3717 | if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) | ||
3718 | return; | ||
3719 | |||
3720 | cfqq = cic_to_cfqq(cic, false); | ||
3721 | if (cfqq) { | ||
3722 | cfq_put_queue(cfqq); | ||
3723 | cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); | ||
3724 | cic_set_cfqq(cic, cfqq, false); | ||
3725 | } | ||
3726 | |||
3727 | cfqq = cic_to_cfqq(cic, true); | ||
3728 | if (cfqq) | ||
3729 | cfq_mark_cfqq_prio_changed(cfqq); | ||
3730 | |||
3731 | cic->ioprio = ioprio; | ||
3732 | } | ||
3733 | |||
3734 | static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3735 | pid_t pid, bool is_sync) | ||
3736 | { | ||
3737 | RB_CLEAR_NODE(&cfqq->rb_node); | ||
3738 | RB_CLEAR_NODE(&cfqq->p_node); | ||
3739 | INIT_LIST_HEAD(&cfqq->fifo); | ||
3740 | |||
3741 | cfqq->ref = 0; | ||
3742 | cfqq->cfqd = cfqd; | ||
3743 | |||
3744 | cfq_mark_cfqq_prio_changed(cfqq); | ||
3745 | |||
3746 | if (is_sync) { | ||
3747 | if (!cfq_class_idle(cfqq)) | ||
3748 | cfq_mark_cfqq_idle_window(cfqq); | ||
3749 | cfq_mark_cfqq_sync(cfqq); | ||
3750 | } | ||
3751 | cfqq->pid = pid; | ||
3752 | } | ||
3753 | |||
3754 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3755 | static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | ||
3756 | { | ||
3757 | struct cfq_data *cfqd = cic_to_cfqd(cic); | ||
3758 | struct cfq_queue *cfqq; | ||
3759 | uint64_t serial_nr; | ||
3760 | |||
3761 | rcu_read_lock(); | ||
3762 | serial_nr = bio_blkcg(bio)->css.serial_nr; | ||
3763 | rcu_read_unlock(); | ||
3764 | |||
3765 | /* | ||
3766 | * Check whether blkcg has changed. The condition may trigger | ||
3767 | * spuriously on a newly created cic but there's no harm. | ||
3768 | */ | ||
3769 | if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) | ||
3770 | return; | ||
3771 | |||
3772 | /* | ||
3773 | * Drop reference to queues. New queues will be assigned in new | ||
3774 | * group upon arrival of fresh requests. | ||
3775 | */ | ||
3776 | cfqq = cic_to_cfqq(cic, false); | ||
3777 | if (cfqq) { | ||
3778 | cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); | ||
3779 | cic_set_cfqq(cic, NULL, false); | ||
3780 | cfq_put_queue(cfqq); | ||
3781 | } | ||
3782 | |||
3783 | cfqq = cic_to_cfqq(cic, true); | ||
3784 | if (cfqq) { | ||
3785 | cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); | ||
3786 | cic_set_cfqq(cic, NULL, true); | ||
3787 | cfq_put_queue(cfqq); | ||
3788 | } | ||
3789 | |||
3790 | cic->blkcg_serial_nr = serial_nr; | ||
3791 | } | ||
3792 | #else | ||
3793 | static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | ||
3794 | { | ||
3795 | } | ||
3796 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
3797 | |||
3798 | static struct cfq_queue ** | ||
3799 | cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) | ||
3800 | { | ||
3801 | switch (ioprio_class) { | ||
3802 | case IOPRIO_CLASS_RT: | ||
3803 | return &cfqg->async_cfqq[0][ioprio]; | ||
3804 | case IOPRIO_CLASS_NONE: | ||
3805 | ioprio = IOPRIO_NORM; | ||
3806 | /* fall through */ | ||
3807 | case IOPRIO_CLASS_BE: | ||
3808 | return &cfqg->async_cfqq[1][ioprio]; | ||
3809 | case IOPRIO_CLASS_IDLE: | ||
3810 | return &cfqg->async_idle_cfqq; | ||
3811 | default: | ||
3812 | BUG(); | ||
3813 | } | ||
3814 | } | ||
3815 | |||
3816 | static struct cfq_queue * | ||
3817 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | ||
3818 | struct bio *bio) | ||
3819 | { | ||
3820 | int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); | ||
3821 | int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | ||
3822 | struct cfq_queue **async_cfqq = NULL; | ||
3823 | struct cfq_queue *cfqq; | ||
3824 | struct cfq_group *cfqg; | ||
3825 | |||
3826 | rcu_read_lock(); | ||
3827 | cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); | ||
3828 | if (!cfqg) { | ||
3829 | cfqq = &cfqd->oom_cfqq; | ||
3830 | goto out; | ||
3831 | } | ||
3832 | |||
3833 | if (!is_sync) { | ||
3834 | if (!ioprio_valid(cic->ioprio)) { | ||
3835 | struct task_struct *tsk = current; | ||
3836 | ioprio = task_nice_ioprio(tsk); | ||
3837 | ioprio_class = task_nice_ioclass(tsk); | ||
3838 | } | ||
3839 | async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); | ||
3840 | cfqq = *async_cfqq; | ||
3841 | if (cfqq) | ||
3842 | goto out; | ||
3843 | } | ||
3844 | |||
3845 | cfqq = kmem_cache_alloc_node(cfq_pool, | ||
3846 | GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, | ||
3847 | cfqd->queue->node); | ||
3848 | if (!cfqq) { | ||
3849 | cfqq = &cfqd->oom_cfqq; | ||
3850 | goto out; | ||
3851 | } | ||
3852 | |||
3853 | /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */ | ||
3854 | cfqq->ioprio_class = IOPRIO_CLASS_NONE; | ||
3855 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); | ||
3856 | cfq_init_prio_data(cfqq, cic); | ||
3857 | cfq_link_cfqq_cfqg(cfqq, cfqg); | ||
3858 | cfq_log_cfqq(cfqd, cfqq, "alloced"); | ||
3859 | |||
3860 | if (async_cfqq) { | ||
3861 | /* a new async queue is created, pin and remember */ | ||
3862 | cfqq->ref++; | ||
3863 | *async_cfqq = cfqq; | ||
3864 | } | ||
3865 | out: | ||
3866 | cfqq->ref++; | ||
3867 | rcu_read_unlock(); | ||
3868 | return cfqq; | ||
3869 | } | ||
3870 | |||
3871 | static void | ||
3872 | __cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle) | ||
3873 | { | ||
3874 | u64 elapsed = ktime_get_ns() - ttime->last_end_request; | ||
3875 | elapsed = min(elapsed, 2UL * slice_idle); | ||
3876 | |||
3877 | ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; | ||
3878 | ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); | ||
3879 | ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, | ||
3880 | ttime->ttime_samples); | ||
3881 | } | ||
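
A standalone sketch of the decaying average above (not kernel code): a 7/8 exponential decay with each new sample weighted by 256, so ttime_mean tracks the recent inter-request gap. The 8ms slice_idle is an assumed default.

#include <stdio.h>
#include <stdint.h>

struct ttime {
	uint64_t samples;	/* decayed sample weight */
	uint64_t total;		/* decayed, 256-scaled sum of gaps */
	uint64_t mean;		/* ns */
};

static void update_thinktime(struct ttime *t, uint64_t elapsed_ns,
			     uint64_t slice_idle_ns)
{
	if (elapsed_ns > 2 * slice_idle_ns)
		elapsed_ns = 2 * slice_idle_ns;		/* clamp outliers */

	t->samples = (7 * t->samples + 256) / 8;
	t->total = (7 * t->total + 256 * elapsed_ns) / 8;
	t->mean = (t->total + 128) / t->samples;
}

int main(void)
{
	struct ttime t = { 0, 0, 0 };
	uint64_t slice_idle = 8ULL * 1000 * 1000;	/* assumed 8ms default */
	int i;

	/* feed a steady 1ms gap; the mean settles near 1ms */
	for (i = 0; i < 20; i++)
		update_thinktime(&t, 1ULL * 1000 * 1000, slice_idle);
	printf("mean after 20 samples: %llu ns\n", (unsigned long long)t.mean);
	return 0;
}
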
3882 | |||
3883 | static void | ||
3884 | cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3885 | struct cfq_io_cq *cic) | ||
3886 | { | ||
3887 | if (cfq_cfqq_sync(cfqq)) { | ||
3888 | __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); | ||
3889 | __cfq_update_io_thinktime(&cfqq->service_tree->ttime, | ||
3890 | cfqd->cfq_slice_idle); | ||
3891 | } | ||
3892 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3893 | __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); | ||
3894 | #endif | ||
3895 | } | ||
3896 | |||
3897 | static void | ||
3898 | cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3899 | struct request *rq) | ||
3900 | { | ||
3901 | sector_t sdist = 0; | ||
3902 | sector_t n_sec = blk_rq_sectors(rq); | ||
3903 | if (cfqq->last_request_pos) { | ||
3904 | if (cfqq->last_request_pos < blk_rq_pos(rq)) | ||
3905 | sdist = blk_rq_pos(rq) - cfqq->last_request_pos; | ||
3906 | else | ||
3907 | sdist = cfqq->last_request_pos - blk_rq_pos(rq); | ||
3908 | } | ||
3909 | |||
3910 | cfqq->seek_history <<= 1; | ||
3911 | if (blk_queue_nonrot(cfqd->queue)) | ||
3912 | cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); | ||
3913 | else | ||
3914 | cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); | ||
3915 | } | ||
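
A sketch of the seek-history shift register above (not kernel code): each request shifts in one bit, set when the request looked seeky, and the queue is treated as seeky once enough of the last 32 bits are set. The ~800-sector threshold and the 1-in-8 popcount test are assumed values mirroring CFQQ_SEEK_THR and CFQQ_SEEKY, which are defined elsewhere in this file.

#include <stdio.h>
#include <stdint.h>

static uint32_t seek_history;

static void record_request(uint64_t sdist_sectors)
{
	const uint64_t seek_thr = 8 * 100;	/* assumed, mirrors CFQQ_SEEK_THR */

	seek_history <<= 1;
	seek_history |= (sdist_sectors > seek_thr);
}

static int queue_is_seeky(void)
{
	/* assumed threshold, mirrors CFQQ_SEEKY: more than 1/8 of the window */
	return __builtin_popcount(seek_history) > 32 / 8;	/* GCC/Clang builtin */
}

int main(void)
{
	int i;

	for (i = 0; i < 6; i++)
		record_request(1000000);	/* large jumps between requests */
	printf("seeky: %d\n", queue_is_seeky());	/* 1: 6 of the last 32 bits set */
	return 0;
}
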
3916 | |||
3917 | static inline bool req_noidle(struct request *req) | ||
3918 | { | ||
3919 | return req_op(req) == REQ_OP_WRITE && | ||
3920 | (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC; | ||
3921 | } | ||
3922 | |||
3923 | /* | ||
3924 | * Disable idle window if the process thinks too long or seeks so much that | ||
3925 | * it doesn't matter | ||
3926 | */ | ||
3927 | static void | ||
3928 | cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3929 | struct cfq_io_cq *cic) | ||
3930 | { | ||
3931 | int old_idle, enable_idle; | ||
3932 | |||
3933 | /* | ||
3934 | * Don't idle for async or idle io prio class | ||
3935 | */ | ||
3936 | if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) | ||
3937 | return; | ||
3938 | |||
3939 | enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); | ||
3940 | |||
3941 | if (cfqq->queued[0] + cfqq->queued[1] >= 4) | ||
3942 | cfq_mark_cfqq_deep(cfqq); | ||
3943 | |||
3944 | if (cfqq->next_rq && req_noidle(cfqq->next_rq)) | ||
3945 | enable_idle = 0; | ||
3946 | else if (!atomic_read(&cic->icq.ioc->active_ref) || | ||
3947 | !cfqd->cfq_slice_idle || | ||
3948 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) | ||
3949 | enable_idle = 0; | ||
3950 | else if (sample_valid(cic->ttime.ttime_samples)) { | ||
3951 | if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) | ||
3952 | enable_idle = 0; | ||
3953 | else | ||
3954 | enable_idle = 1; | ||
3955 | } | ||
3956 | |||
3957 | if (old_idle != enable_idle) { | ||
3958 | cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); | ||
3959 | if (enable_idle) | ||
3960 | cfq_mark_cfqq_idle_window(cfqq); | ||
3961 | else | ||
3962 | cfq_clear_cfqq_idle_window(cfqq); | ||
3963 | } | ||
3964 | } | ||
3965 | |||
3966 | /* | ||
3967 | * Check if new_cfqq should preempt the currently active queue. Return false | ||
3968 | * for no, or if we aren't sure; true will cause a preempt. | ||
3969 | */ | ||
3970 | static bool | ||
3971 | cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | ||
3972 | struct request *rq) | ||
3973 | { | ||
3974 | struct cfq_queue *cfqq; | ||
3975 | |||
3976 | cfqq = cfqd->active_queue; | ||
3977 | if (!cfqq) | ||
3978 | return false; | ||
3979 | |||
3980 | if (cfq_class_idle(new_cfqq)) | ||
3981 | return false; | ||
3982 | |||
3983 | if (cfq_class_idle(cfqq)) | ||
3984 | return true; | ||
3985 | |||
3986 | /* | ||
3987 | * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. | ||
3988 | */ | ||
3989 | if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) | ||
3990 | return false; | ||
3991 | |||
3992 | /* | ||
3993 | * if the new request is sync, but the currently running queue is | ||
3994 | * not, let the sync request have priority. | ||
3995 | */ | ||
3996 | if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) | ||
3997 | return true; | ||
3998 | |||
3999 | /* | ||
4000 | * Treat ancestors of current cgroup the same way as current cgroup. | ||
4001 | * For anybody else we disallow preemption to guarantee service | ||
4002 | * fairness among cgroups. | ||
4003 | */ | ||
4004 | if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) | ||
4005 | return false; | ||
4006 | |||
4007 | if (cfq_slice_used(cfqq)) | ||
4008 | return true; | ||
4009 | |||
4010 | /* | ||
4011 | * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. | ||
4012 | */ | ||
4013 | if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) | ||
4014 | return true; | ||
4015 | |||
4016 | WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); | ||
4017 | /* Allow preemption only if we are idling on sync-noidle tree */ | ||
4018 | if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && | ||
4019 | cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && | ||
4020 | RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
4021 | return true; | ||
4022 | |||
4023 | /* | ||
4024 | * So both queues are sync. Let the new request get disk time if | ||
4025 | * it's a metadata request and the current queue is doing regular IO. | ||
4026 | */ | ||
4027 | if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) | ||
4028 | return true; | ||
4029 | |||
4030 | /* An idle queue should not be idle now for some reason */ | ||
4031 | if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) | ||
4032 | return true; | ||
4033 | |||
4034 | if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) | ||
4035 | return false; | ||
4036 | |||
4037 | /* | ||
4038 | * if this request is as good as one we would expect from the | ||
4039 | * current cfqq, let it preempt | ||
4040 | */ | ||
4041 | if (cfq_rq_close(cfqd, cfqq, rq)) | ||
4042 | return true; | ||
4043 | |||
4044 | return false; | ||
4045 | } | ||
4046 | |||
4047 | /* | ||
4048 | * cfqq preempts the active queue. if we allowed preempt with no slice left, | ||
4049 | * let it have half of its nominal slice. | ||
4050 | */ | ||
4051 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
4052 | { | ||
4053 | enum wl_type_t old_type = cfqq_type(cfqd->active_queue); | ||
4054 | |||
4055 | cfq_log_cfqq(cfqd, cfqq, "preempt"); | ||
4056 | cfq_slice_expired(cfqd, 1); | ||
4057 | |||
4058 | /* | ||
4059 | * workload type is changed, don't save slice, otherwise preempt | ||
4060 | * doesn't happen | ||
4061 | */ | ||
4062 | if (old_type != cfqq_type(cfqq)) | ||
4063 | cfqq->cfqg->saved_wl_slice = 0; | ||
4064 | |||
4065 | /* | ||
4066 | * Put the new queue at the front of the current list, | ||
4067 | * so we know that it will be selected next. | ||
4068 | */ | ||
4069 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); | ||
4070 | |||
4071 | cfq_service_tree_add(cfqd, cfqq, 1); | ||
4072 | |||
4073 | cfqq->slice_end = 0; | ||
4074 | cfq_mark_cfqq_slice_new(cfqq); | ||
4075 | } | ||
4076 | |||
4077 | /* | ||
4078 | * Called when a new fs request (rq) is added (to cfqq). Check if there's | ||
4079 | * something we should do about it | ||
4080 | */ | ||
4081 | static void | ||
4082 | cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
4083 | struct request *rq) | ||
4084 | { | ||
4085 | struct cfq_io_cq *cic = RQ_CIC(rq); | ||
4086 | |||
4087 | cfqd->rq_queued++; | ||
4088 | if (rq->cmd_flags & REQ_PRIO) | ||
4089 | cfqq->prio_pending++; | ||
4090 | |||
4091 | cfq_update_io_thinktime(cfqd, cfqq, cic); | ||
4092 | cfq_update_io_seektime(cfqd, cfqq, rq); | ||
4093 | cfq_update_idle_window(cfqd, cfqq, cic); | ||
4094 | |||
4095 | cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); | ||
4096 | |||
4097 | if (cfqq == cfqd->active_queue) { | ||
4098 | /* | ||
4099 | * Remember that we saw a request from this process, but | ||
4100 | * don't start queuing just yet. Otherwise we risk seeing lots | ||
4101 | * of tiny requests, because we disrupt the normal plugging | ||
4102 | * and merging. If the request is already larger than a single | ||
4103 | * page, let it rip immediately. For that case we assume that | ||
4104 | * merging is already done. Ditto for a busy system that | ||
4105 | * has other work pending, don't risk delaying until the | ||
4106 | * idle timer unplug to continue working. | ||
4107 | */ | ||
4108 | if (cfq_cfqq_wait_request(cfqq)) { | ||
4109 | if (blk_rq_bytes(rq) > PAGE_SIZE || | ||
4110 | cfqd->busy_queues > 1) { | ||
4111 | cfq_del_timer(cfqd, cfqq); | ||
4112 | cfq_clear_cfqq_wait_request(cfqq); | ||
4113 | __blk_run_queue(cfqd->queue); | ||
4114 | } else { | ||
4115 | cfqg_stats_update_idle_time(cfqq->cfqg); | ||
4116 | cfq_mark_cfqq_must_dispatch(cfqq); | ||
4117 | } | ||
4118 | } | ||
4119 | } else if (cfq_should_preempt(cfqd, cfqq, rq)) { | ||
4120 | /* | ||
4121 | * not the active queue - expire current slice if it is | ||
4122 | * idle and has expired its mean thinktime or this new queue | ||
4123 | * has some old slice time left and is of higher priority or | ||
4124 | * this new queue is RT and the current one is BE | ||
4125 | */ | ||
4126 | cfq_preempt_queue(cfqd, cfqq); | ||
4127 | __blk_run_queue(cfqd->queue); | ||
4128 | } | ||
4129 | } | ||
4130 | |||
4131 | static void cfq_insert_request(struct request_queue *q, struct request *rq) | ||
4132 | { | ||
4133 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
4134 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
4135 | |||
4136 | cfq_log_cfqq(cfqd, cfqq, "insert_request"); | ||
4137 | cfq_init_prio_data(cfqq, RQ_CIC(rq)); | ||
4138 | |||
4139 | rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; | ||
4140 | list_add_tail(&rq->queuelist, &cfqq->fifo); | ||
4141 | cfq_add_rq_rb(rq); | ||
4142 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, | ||
4143 | rq->cmd_flags); | ||
4144 | cfq_rq_enqueued(cfqd, cfqq, rq); | ||
4145 | } | ||
4146 | |||
4147 | /* | ||
4148 | * Update hw_tag based on peak queue depth over 50 samples under | ||
4149 | * sufficient load. | ||
4150 | */ | ||
4151 | static void cfq_update_hw_tag(struct cfq_data *cfqd) | ||
4152 | { | ||
4153 | struct cfq_queue *cfqq = cfqd->active_queue; | ||
4154 | |||
4155 | if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) | ||
4156 | cfqd->hw_tag_est_depth = cfqd->rq_in_driver; | ||
4157 | |||
4158 | if (cfqd->hw_tag == 1) | ||
4159 | return; | ||
4160 | |||
4161 | if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && | ||
4162 | cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) | ||
4163 | return; | ||
4164 | |||
4165 | /* | ||
4166 | * If the active queue doesn't have enough requests and can idle, cfq might not | ||
4167 | * dispatch sufficient requests to hardware. Don't zero hw_tag in this | ||
4168 | * case | ||
4169 | */ | ||
4170 | if (cfqq && cfq_cfqq_idle_window(cfqq) && | ||
4171 | cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < | ||
4172 | CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) | ||
4173 | return; | ||
4174 | |||
4175 | if (cfqd->hw_tag_samples++ < 50) | ||
4176 | return; | ||
4177 | |||
4178 | if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) | ||
4179 | cfqd->hw_tag = 1; | ||
4180 | else | ||
4181 | cfqd->hw_tag = 0; | ||
4182 | } | ||
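
A simplified sketch of the detection above (not kernel code): sample the peak driver queue depth under load and, after 50 qualifying samples, decide whether the device really queues multiple commands. The threshold of 5 stands in for CFQ_HW_QUEUE_MIN and is an assumption here.

#include <stdio.h>

#define HW_QUEUE_MIN 5		/* assumed, stands in for CFQ_HW_QUEUE_MIN */

static int hw_tag = -1;		/* -1: undecided */
static int hw_tag_samples;
static int hw_tag_est_depth;

static void sample_depth(int rq_in_driver)
{
	if (rq_in_driver > hw_tag_est_depth)
		hw_tag_est_depth = rq_in_driver;

	if (hw_tag == 1)
		return;
	if (rq_in_driver <= HW_QUEUE_MIN)
		return;				/* not enough load to judge */
	if (hw_tag_samples++ < 50)
		return;

	hw_tag = hw_tag_est_depth >= HW_QUEUE_MIN;
}

int main(void)
{
	int i;

	for (i = 0; i < 60; i++)
		sample_depth(8);		/* device keeps 8 requests in flight */
	printf("hw_tag = %d\n", hw_tag);	/* 1 */
	return 0;
}
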
4183 | |||
4184 | static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
4185 | { | ||
4186 | struct cfq_io_cq *cic = cfqd->active_cic; | ||
4187 | u64 now = ktime_get_ns(); | ||
4188 | |||
4189 | /* If the queue already has requests, don't wait */ | ||
4190 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
4191 | return false; | ||
4192 | |||
4193 | /* If there are other queues in the group, don't wait */ | ||
4194 | if (cfqq->cfqg->nr_cfqq > 1) | ||
4195 | return false; | ||
4196 | |||
4197 | /* the only queue in the group, but think time is big */ | ||
4198 | if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) | ||
4199 | return false; | ||
4200 | |||
4201 | if (cfq_slice_used(cfqq)) | ||
4202 | return true; | ||
4203 | |||
4204 | /* if slice left is less than think time, wait busy */ | ||
4205 | if (cic && sample_valid(cic->ttime.ttime_samples) | ||
4206 | && (cfqq->slice_end - now < cic->ttime.ttime_mean)) | ||
4207 | return true; | ||
4208 | |||
4209 | /* | ||
4210 | * If the think time is less than a jiffy then ttime_mean=0 and the check | ||
4211 | * above will not be true. It might happen that the slice has not expired yet | ||
4212 | * but will expire soon (4-5 ns) during select_queue(). To cover the | ||
4213 | * case where think time is less than a jiffy, mark the queue wait | ||
4214 | * busy if only 1 jiffy is left in the slice. | ||
4215 | */ | ||
4216 | if (cfqq->slice_end - now <= jiffies_to_nsecs(1)) | ||
4217 | return true; | ||
4218 | |||
4219 | return false; | ||
4220 | } | ||
4221 | |||
4222 | static void cfq_completed_request(struct request_queue *q, struct request *rq) | ||
4223 | { | ||
4224 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
4225 | struct cfq_data *cfqd = cfqq->cfqd; | ||
4226 | const int sync = rq_is_sync(rq); | ||
4227 | u64 now = ktime_get_ns(); | ||
4228 | |||
4229 | cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq)); | ||
4230 | |||
4231 | cfq_update_hw_tag(cfqd); | ||
4232 | |||
4233 | WARN_ON(!cfqd->rq_in_driver); | ||
4234 | WARN_ON(!cfqq->dispatched); | ||
4235 | cfqd->rq_in_driver--; | ||
4236 | cfqq->dispatched--; | ||
4237 | (RQ_CFQG(rq))->dispatched--; | ||
4238 | cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns, | ||
4239 | rq->io_start_time_ns, rq->cmd_flags); | ||
4240 | |||
4241 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; | ||
4242 | |||
4243 | if (sync) { | ||
4244 | struct cfq_rb_root *st; | ||
4245 | |||
4246 | RQ_CIC(rq)->ttime.last_end_request = now; | ||
4247 | |||
4248 | if (cfq_cfqq_on_rr(cfqq)) | ||
4249 | st = cfqq->service_tree; | ||
4250 | else | ||
4251 | st = st_for(cfqq->cfqg, cfqq_class(cfqq), | ||
4252 | cfqq_type(cfqq)); | ||
4253 | |||
4254 | st->ttime.last_end_request = now; | ||
4255 | if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now) | ||
4256 | cfqd->last_delayed_sync = now; | ||
4257 | } | ||
4258 | |||
4259 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4260 | cfqq->cfqg->ttime.last_end_request = now; | ||
4261 | #endif | ||
4262 | |||
4263 | /* | ||
4264 | * If this is the active queue, check if it needs to be expired, | ||
4265 | * or if we want to idle in case it has no pending requests. | ||
4266 | */ | ||
4267 | if (cfqd->active_queue == cfqq) { | ||
4268 | const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); | ||
4269 | |||
4270 | if (cfq_cfqq_slice_new(cfqq)) { | ||
4271 | cfq_set_prio_slice(cfqd, cfqq); | ||
4272 | cfq_clear_cfqq_slice_new(cfqq); | ||
4273 | } | ||
4274 | |||
4275 | /* | ||
4276 | * Should we wait for the next request to come in before we expire | ||
4277 | * the queue? | ||
4278 | */ | ||
4279 | if (cfq_should_wait_busy(cfqd, cfqq)) { | ||
4280 | u64 extend_sl = cfqd->cfq_slice_idle; | ||
4281 | if (!cfqd->cfq_slice_idle) | ||
4282 | extend_sl = cfqd->cfq_group_idle; | ||
4283 | cfqq->slice_end = now + extend_sl; | ||
4284 | cfq_mark_cfqq_wait_busy(cfqq); | ||
4285 | cfq_log_cfqq(cfqd, cfqq, "will busy wait"); | ||
4286 | } | ||
4287 | |||
4288 | /* | ||
4289 | * Idling is not enabled on: | ||
4290 | * - expired queues | ||
4291 | * - idle-priority queues | ||
4292 | * - async queues | ||
4293 | * - queues with still some requests queued | ||
4294 | * - when there is a close cooperator | ||
4295 | */ | ||
4296 | if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) | ||
4297 | cfq_slice_expired(cfqd, 1); | ||
4298 | else if (sync && cfqq_empty && | ||
4299 | !cfq_close_cooperator(cfqd, cfqq)) { | ||
4300 | cfq_arm_slice_timer(cfqd); | ||
4301 | } | ||
4302 | } | ||
4303 | |||
4304 | if (!cfqd->rq_in_driver) | ||
4305 | cfq_schedule_dispatch(cfqd); | ||
4306 | } | ||
4307 | |||
4308 | static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) | ||
4309 | { | ||
4310 | /* | ||
4311 | * If REQ_PRIO is set, boost class and prio level, if it's below | ||
4312 | * BE/NORM. If prio is not set, restore the potentially boosted | ||
4313 | * class/prio level. | ||
4314 | */ | ||
4315 | if (!(op & REQ_PRIO)) { | ||
4316 | cfqq->ioprio_class = cfqq->org_ioprio_class; | ||
4317 | cfqq->ioprio = cfqq->org_ioprio; | ||
4318 | } else { | ||
4319 | if (cfq_class_idle(cfqq)) | ||
4320 | cfqq->ioprio_class = IOPRIO_CLASS_BE; | ||
4321 | if (cfqq->ioprio > IOPRIO_NORM) | ||
4322 | cfqq->ioprio = IOPRIO_NORM; | ||
4323 | } | ||
4324 | } | ||
4325 | |||
4326 | static inline int __cfq_may_queue(struct cfq_queue *cfqq) | ||
4327 | { | ||
4328 | if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { | ||
4329 | cfq_mark_cfqq_must_alloc_slice(cfqq); | ||
4330 | return ELV_MQUEUE_MUST; | ||
4331 | } | ||
4332 | |||
4333 | return ELV_MQUEUE_MAY; | ||
4334 | } | ||
4335 | |||
4336 | static int cfq_may_queue(struct request_queue *q, unsigned int op) | ||
4337 | { | ||
4338 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
4339 | struct task_struct *tsk = current; | ||
4340 | struct cfq_io_cq *cic; | ||
4341 | struct cfq_queue *cfqq; | ||
4342 | |||
4343 | /* | ||
4344 | * don't force setup of a queue from here, as a call to may_queue | ||
4345 | * does not necessarily imply that a request actually will be queued. | ||
4347 | * So just look up a possibly existing queue, or return 'may queue' | ||
4347 | * if that fails | ||
4348 | */ | ||
4349 | cic = cfq_cic_lookup(cfqd, tsk->io_context); | ||
4350 | if (!cic) | ||
4351 | return ELV_MQUEUE_MAY; | ||
4352 | |||
4353 | cfqq = cic_to_cfqq(cic, op_is_sync(op)); | ||
4354 | if (cfqq) { | ||
4355 | cfq_init_prio_data(cfqq, cic); | ||
4356 | cfqq_boost_on_prio(cfqq, op); | ||
4357 | |||
4358 | return __cfq_may_queue(cfqq); | ||
4359 | } | ||
4360 | |||
4361 | return ELV_MQUEUE_MAY; | ||
4362 | } | ||
4363 | |||
4364 | /* | ||
4365 | * queue lock held here | ||
4366 | */ | ||
4367 | static void cfq_put_request(struct request *rq) | ||
4368 | { | ||
4369 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
4370 | |||
4371 | if (cfqq) { | ||
4372 | const int rw = rq_data_dir(rq); | ||
4373 | |||
4374 | BUG_ON(!cfqq->allocated[rw]); | ||
4375 | cfqq->allocated[rw]--; | ||
4376 | |||
4377 | /* Put down rq reference on cfqg */ | ||
4378 | cfqg_put(RQ_CFQG(rq)); | ||
4379 | rq->elv.priv[0] = NULL; | ||
4380 | rq->elv.priv[1] = NULL; | ||
4381 | |||
4382 | cfq_put_queue(cfqq); | ||
4383 | } | ||
4384 | } | ||
4385 | |||
4386 | static struct cfq_queue * | ||
4387 | cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, | ||
4388 | struct cfq_queue *cfqq) | ||
4389 | { | ||
4390 | cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); | ||
4391 | cic_set_cfqq(cic, cfqq->new_cfqq, 1); | ||
4392 | cfq_mark_cfqq_coop(cfqq->new_cfqq); | ||
4393 | cfq_put_queue(cfqq); | ||
4394 | return cic_to_cfqq(cic, 1); | ||
4395 | } | ||
4396 | |||
4397 | /* | ||
4398 | * Returns NULL if a new cfqq should be allocated, or the old cfqq if this | ||
4399 | * was the last process referring to said cfqq. | ||
4400 | */ | ||
4401 | static struct cfq_queue * | ||
4402 | split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) | ||
4403 | { | ||
4404 | if (cfqq_process_refs(cfqq) == 1) { | ||
4405 | cfqq->pid = current->pid; | ||
4406 | cfq_clear_cfqq_coop(cfqq); | ||
4407 | cfq_clear_cfqq_split_coop(cfqq); | ||
4408 | return cfqq; | ||
4409 | } | ||
4410 | |||
4411 | cic_set_cfqq(cic, NULL, 1); | ||
4412 | |||
4413 | cfq_put_cooperator(cfqq); | ||
4414 | |||
4415 | cfq_put_queue(cfqq); | ||
4416 | return NULL; | ||
4417 | } | ||
4418 | /* | ||
4419 | * Allocate cfq data structures associated with this request. | ||
4420 | */ | ||
4421 | static int | ||
4422 | cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, | ||
4423 | gfp_t gfp_mask) | ||
4424 | { | ||
4425 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
4426 | struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); | ||
4427 | const int rw = rq_data_dir(rq); | ||
4428 | const bool is_sync = rq_is_sync(rq); | ||
4429 | struct cfq_queue *cfqq; | ||
4430 | |||
4431 | spin_lock_irq(q->queue_lock); | ||
4432 | |||
4433 | check_ioprio_changed(cic, bio); | ||
4434 | check_blkcg_changed(cic, bio); | ||
4435 | new_queue: | ||
4436 | cfqq = cic_to_cfqq(cic, is_sync); | ||
4437 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { | ||
4438 | if (cfqq) | ||
4439 | cfq_put_queue(cfqq); | ||
4440 | cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); | ||
4441 | cic_set_cfqq(cic, cfqq, is_sync); | ||
4442 | } else { | ||
4443 | /* | ||
4444 | * If the queue was seeky for too long, break it apart. | ||
4445 | */ | ||
4446 | if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { | ||
4447 | cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); | ||
4448 | cfqq = split_cfqq(cic, cfqq); | ||
4449 | if (!cfqq) | ||
4450 | goto new_queue; | ||
4451 | } | ||
4452 | |||
4453 | /* | ||
4454 | * Check to see if this queue is scheduled to merge with | ||
4455 | * another, closely cooperating queue. The merging of | ||
4456 | * queues happens here as it must be done in process context. | ||
4457 | * The reference on new_cfqq was taken in merge_cfqqs. | ||
4458 | */ | ||
4459 | if (cfqq->new_cfqq) | ||
4460 | cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); | ||
4461 | } | ||
4462 | |||
4463 | cfqq->allocated[rw]++; | ||
4464 | |||
4465 | cfqq->ref++; | ||
4466 | cfqg_get(cfqq->cfqg); | ||
4467 | rq->elv.priv[0] = cfqq; | ||
4468 | rq->elv.priv[1] = cfqq->cfqg; | ||
4469 | spin_unlock_irq(q->queue_lock); | ||
4470 | |||
4471 | return 0; | ||
4472 | } | ||
4473 | |||
4474 | static void cfq_kick_queue(struct work_struct *work) | ||
4475 | { | ||
4476 | struct cfq_data *cfqd = | ||
4477 | container_of(work, struct cfq_data, unplug_work); | ||
4478 | struct request_queue *q = cfqd->queue; | ||
4479 | |||
4480 | spin_lock_irq(q->queue_lock); | ||
4481 | __blk_run_queue(cfqd->queue); | ||
4482 | spin_unlock_irq(q->queue_lock); | ||
4483 | } | ||
4484 | |||
4485 | /* | ||
4486 | * Timer running if the active_queue is currently idling inside its time slice | ||
4487 | */ | ||
4488 | static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer) | ||
4489 | { | ||
4490 | struct cfq_data *cfqd = container_of(timer, struct cfq_data, | ||
4491 | idle_slice_timer); | ||
4492 | struct cfq_queue *cfqq; | ||
4493 | unsigned long flags; | ||
4494 | int timed_out = 1; | ||
4495 | |||
4496 | cfq_log(cfqd, "idle timer fired"); | ||
4497 | |||
4498 | spin_lock_irqsave(cfqd->queue->queue_lock, flags); | ||
4499 | |||
4500 | cfqq = cfqd->active_queue; | ||
4501 | if (cfqq) { | ||
4502 | timed_out = 0; | ||
4503 | |||
4504 | /* | ||
4505 | * We saw a request before the queue expired, let it through | ||
4506 | */ | ||
4507 | if (cfq_cfqq_must_dispatch(cfqq)) | ||
4508 | goto out_kick; | ||
4509 | |||
4510 | /* | ||
4511 | * expired | ||
4512 | */ | ||
4513 | if (cfq_slice_used(cfqq)) | ||
4514 | goto expire; | ||
4515 | |||
4516 | /* | ||
4517 | * only expire and reinvoke request handler, if there are | ||
4518 | * other queues with pending requests | ||
4519 | */ | ||
4520 | if (!cfqd->busy_queues) | ||
4521 | goto out_cont; | ||
4522 | |||
4523 | /* | ||
4524 | * not expired and it has a request pending, let it dispatch | ||
4525 | */ | ||
4526 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
4527 | goto out_kick; | ||
4528 | |||
4529 | /* | ||
4530 | * Queue depth flag is reset only when the idle didn't succeed | ||
4531 | */ | ||
4532 | cfq_clear_cfqq_deep(cfqq); | ||
4533 | } | ||
4534 | expire: | ||
4535 | cfq_slice_expired(cfqd, timed_out); | ||
4536 | out_kick: | ||
4537 | cfq_schedule_dispatch(cfqd); | ||
4538 | out_cont: | ||
4539 | spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); | ||
4540 | return HRTIMER_NORESTART; | ||
4541 | } | ||
4542 | |||
4543 | static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) | ||
4544 | { | ||
4545 | hrtimer_cancel(&cfqd->idle_slice_timer); | ||
4546 | cancel_work_sync(&cfqd->unplug_work); | ||
4547 | } | ||
4548 | |||
4549 | static void cfq_exit_queue(struct elevator_queue *e) | ||
4550 | { | ||
4551 | struct cfq_data *cfqd = e->elevator_data; | ||
4552 | struct request_queue *q = cfqd->queue; | ||
4553 | |||
4554 | cfq_shutdown_timer_wq(cfqd); | ||
4555 | |||
4556 | spin_lock_irq(q->queue_lock); | ||
4557 | |||
4558 | if (cfqd->active_queue) | ||
4559 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); | ||
4560 | |||
4561 | spin_unlock_irq(q->queue_lock); | ||
4562 | |||
4563 | cfq_shutdown_timer_wq(cfqd); | ||
4564 | |||
4565 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4566 | blkcg_deactivate_policy(q, &blkcg_policy_cfq); | ||
4567 | #else | ||
4568 | kfree(cfqd->root_group); | ||
4569 | #endif | ||
4570 | kfree(cfqd); | ||
4571 | } | ||
4572 | |||
4573 | static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) | ||
4574 | { | ||
4575 | struct cfq_data *cfqd; | ||
4576 | struct blkcg_gq *blkg __maybe_unused; | ||
4577 | int i, ret; | ||
4578 | struct elevator_queue *eq; | ||
4579 | |||
4580 | eq = elevator_alloc(q, e); | ||
4581 | if (!eq) | ||
4582 | return -ENOMEM; | ||
4583 | |||
4584 | cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); | ||
4585 | if (!cfqd) { | ||
4586 | kobject_put(&eq->kobj); | ||
4587 | return -ENOMEM; | ||
4588 | } | ||
4589 | eq->elevator_data = cfqd; | ||
4590 | |||
4591 | cfqd->queue = q; | ||
4592 | spin_lock_irq(q->queue_lock); | ||
4593 | q->elevator = eq; | ||
4594 | spin_unlock_irq(q->queue_lock); | ||
4595 | |||
4596 | /* Init root service tree */ | ||
4597 | cfqd->grp_service_tree = CFQ_RB_ROOT; | ||
4598 | |||
4599 | /* Init root group and prefer root group over other groups by default */ | ||
4600 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4601 | ret = blkcg_activate_policy(q, &blkcg_policy_cfq); | ||
4602 | if (ret) | ||
4603 | goto out_free; | ||
4604 | |||
4605 | cfqd->root_group = blkg_to_cfqg(q->root_blkg); | ||
4606 | #else | ||
4607 | ret = -ENOMEM; | ||
4608 | cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), | ||
4609 | GFP_KERNEL, cfqd->queue->node); | ||
4610 | if (!cfqd->root_group) | ||
4611 | goto out_free; | ||
4612 | |||
4613 | cfq_init_cfqg_base(cfqd->root_group); | ||
4614 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; | ||
4615 | cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; | ||
4616 | #endif | ||
4617 | |||
4618 | /* | ||
4619 | * Not strictly needed (since RB_ROOT just clears the node and we | ||
4620 | * zeroed cfqd on alloc), but better be safe in case someone decides | ||
4621 | * to add magic to the rb code | ||
4622 | */ | ||
4623 | for (i = 0; i < CFQ_PRIO_LISTS; i++) | ||
4624 | cfqd->prio_trees[i] = RB_ROOT; | ||
4625 | |||
4626 | /* | ||
4627 | * Our fallback cfqq if cfq_get_queue() runs into OOM issues. | ||
4628 | * Grab a permanent reference to it, so that the normal code flow | ||
4629 | * will not attempt to free it. oom_cfqq is linked to root_group | ||
4630 | * but shouldn't hold a reference as it'll never be unlinked. Lose | ||
4631 | * the reference from linking right away. | ||
4632 | */ | ||
4633 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); | ||
4634 | cfqd->oom_cfqq.ref++; | ||
4635 | |||
4636 | spin_lock_irq(q->queue_lock); | ||
4637 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); | ||
4638 | cfqg_put(cfqd->root_group); | ||
4639 | spin_unlock_irq(q->queue_lock); | ||
4640 | |||
4641 | hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC, | ||
4642 | HRTIMER_MODE_REL); | ||
4643 | cfqd->idle_slice_timer.function = cfq_idle_slice_timer; | ||
4644 | |||
4645 | INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); | ||
4646 | |||
4647 | cfqd->cfq_quantum = cfq_quantum; | ||
4648 | cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; | ||
4649 | cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; | ||
4650 | cfqd->cfq_back_max = cfq_back_max; | ||
4651 | cfqd->cfq_back_penalty = cfq_back_penalty; | ||
4652 | cfqd->cfq_slice[0] = cfq_slice_async; | ||
4653 | cfqd->cfq_slice[1] = cfq_slice_sync; | ||
4654 | cfqd->cfq_target_latency = cfq_target_latency; | ||
4655 | cfqd->cfq_slice_async_rq = cfq_slice_async_rq; | ||
4656 | cfqd->cfq_slice_idle = cfq_slice_idle; | ||
4657 | cfqd->cfq_group_idle = cfq_group_idle; | ||
4658 | cfqd->cfq_latency = 1; | ||
4659 | cfqd->hw_tag = -1; | ||
4660 | /* | ||
4661 | * we optimistically start assuming sync ops weren't delayed in last | ||
4662 | * second, in order to have larger depth for async operations. | ||
4663 | */ | ||
4664 | cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC; | ||
4665 | return 0; | ||
4666 | |||
4667 | out_free: | ||
4668 | kfree(cfqd); | ||
4669 | kobject_put(&eq->kobj); | ||
4670 | return ret; | ||
4671 | } | ||
4672 | |||
4673 | static void cfq_registered_queue(struct request_queue *q) | ||
4674 | { | ||
4675 | struct elevator_queue *e = q->elevator; | ||
4676 | struct cfq_data *cfqd = e->elevator_data; | ||
4677 | |||
4678 | /* | ||
4679 | * Default to IOPS mode with no idling for SSDs | ||
4680 | */ | ||
4681 | if (blk_queue_nonrot(q)) | ||
4682 | cfqd->cfq_slice_idle = 0; | ||
4683 | wbt_disable_default(q); | ||
4684 | } | ||
4685 | |||
4686 | /* | ||
4687 | * sysfs parts below --> | ||
4688 | */ | ||
4689 | static ssize_t | ||
4690 | cfq_var_show(unsigned int var, char *page) | ||
4691 | { | ||
4692 | return sprintf(page, "%u\n", var); | ||
4693 | } | ||
4694 | |||
4695 | static void | ||
4696 | cfq_var_store(unsigned int *var, const char *page) | ||
4697 | { | ||
4698 | char *p = (char *) page; | ||
4699 | |||
4700 | *var = simple_strtoul(p, &p, 10); | ||
4701 | } | ||
4702 | |||
4703 | #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ | ||
4704 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
4705 | { \ | ||
4706 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4707 | u64 __data = __VAR; \ | ||
4708 | if (__CONV) \ | ||
4709 | __data = div_u64(__data, NSEC_PER_MSEC); \ | ||
4710 | return cfq_var_show(__data, (page)); \ | ||
4711 | } | ||
4712 | SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); | ||
4713 | SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); | ||
4714 | SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); | ||
4715 | SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); | ||
4716 | SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); | ||
4717 | SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); | ||
4718 | SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); | ||
4719 | SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); | ||
4720 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); | ||
4721 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); | ||
4722 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); | ||
4723 | SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); | ||
4724 | #undef SHOW_FUNCTION | ||
4725 | |||
4726 | #define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ | ||
4727 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
4728 | { \ | ||
4729 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4730 | u64 __data = __VAR; \ | ||
4731 | __data = div_u64(__data, NSEC_PER_USEC); \ | ||
4732 | return cfq_var_show(__data, (page)); \ | ||
4733 | } | ||
4734 | USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle); | ||
4735 | USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle); | ||
4736 | USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]); | ||
4737 | USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]); | ||
4738 | USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); | ||
4739 | #undef USEC_SHOW_FUNCTION | ||
4740 | |||
4741 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | ||
4742 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | ||
4743 | { \ | ||
4744 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4745 | unsigned int __data, __min = (MIN), __max = (MAX); \ | ||
4746 | \ | ||
4747 | cfq_var_store(&__data, (page)); \ | ||
4748 | if (__data < __min) \ | ||
4749 | __data = __min; \ | ||
4750 | else if (__data > __max) \ | ||
4751 | __data = __max; \ | ||
4752 | if (__CONV) \ | ||
4753 | *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ | ||
4754 | else \ | ||
4755 | *(__PTR) = __data; \ | ||
4756 | return count; \ | ||
4757 | } | ||
4758 | STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); | ||
4759 | STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, | ||
4760 | UINT_MAX, 1); | ||
4761 | STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, | ||
4762 | UINT_MAX, 1); | ||
4763 | STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); | ||
4764 | STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, | ||
4765 | UINT_MAX, 0); | ||
4766 | STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); | ||
4767 | STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); | ||
4768 | STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); | ||
4769 | STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); | ||
4770 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, | ||
4771 | UINT_MAX, 0); | ||
4772 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); | ||
4773 | STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); | ||
4774 | #undef STORE_FUNCTION | ||
4775 | |||
4776 | #define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ | ||
4777 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | ||
4778 | { \ | ||
4779 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4780 | unsigned int __data, __min = (MIN), __max = (MAX); \ | ||
4781 | \ | ||
4782 | cfq_var_store(&__data, (page)); \ | ||
4783 | if (__data < __min) \ | ||
4784 | __data = __min; \ | ||
4785 | else if (__data > __max) \ | ||
4786 | __data = __max; \ | ||
4787 | *(__PTR) = (u64)__data * NSEC_PER_USEC; \ | ||
4788 | return count; \ | ||
4789 | } | ||
4790 | USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); | ||
4791 | USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); | ||
4792 | USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX); | ||
4793 | USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX); | ||
4794 | USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX); | ||
4795 | #undef USEC_STORE_FUNCTION | ||
4796 | |||
4797 | #define CFQ_ATTR(name) \ | ||
4798 | __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store) | ||
4799 | |||
4800 | static struct elv_fs_entry cfq_attrs[] = { | ||
4801 | CFQ_ATTR(quantum), | ||
4802 | CFQ_ATTR(fifo_expire_sync), | ||
4803 | CFQ_ATTR(fifo_expire_async), | ||
4804 | CFQ_ATTR(back_seek_max), | ||
4805 | CFQ_ATTR(back_seek_penalty), | ||
4806 | CFQ_ATTR(slice_sync), | ||
4807 | CFQ_ATTR(slice_sync_us), | ||
4808 | CFQ_ATTR(slice_async), | ||
4809 | CFQ_ATTR(slice_async_us), | ||
4810 | CFQ_ATTR(slice_async_rq), | ||
4811 | CFQ_ATTR(slice_idle), | ||
4812 | CFQ_ATTR(slice_idle_us), | ||
4813 | CFQ_ATTR(group_idle), | ||
4814 | CFQ_ATTR(group_idle_us), | ||
4815 | CFQ_ATTR(low_latency), | ||
4816 | CFQ_ATTR(target_latency), | ||
4817 | CFQ_ATTR(target_latency_us), | ||
4818 | __ATTR_NULL | ||
4819 | }; | ||
4820 | |||
4821 | static struct elevator_type iosched_cfq = { | ||
4822 | .ops.sq = { | ||
4823 | .elevator_merge_fn = cfq_merge, | ||
4824 | .elevator_merged_fn = cfq_merged_request, | ||
4825 | .elevator_merge_req_fn = cfq_merged_requests, | ||
4826 | .elevator_allow_bio_merge_fn = cfq_allow_bio_merge, | ||
4827 | .elevator_allow_rq_merge_fn = cfq_allow_rq_merge, | ||
4828 | .elevator_bio_merged_fn = cfq_bio_merged, | ||
4829 | .elevator_dispatch_fn = cfq_dispatch_requests, | ||
4830 | .elevator_add_req_fn = cfq_insert_request, | ||
4831 | .elevator_activate_req_fn = cfq_activate_request, | ||
4832 | .elevator_deactivate_req_fn = cfq_deactivate_request, | ||
4833 | .elevator_completed_req_fn = cfq_completed_request, | ||
4834 | .elevator_former_req_fn = elv_rb_former_request, | ||
4835 | .elevator_latter_req_fn = elv_rb_latter_request, | ||
4836 | .elevator_init_icq_fn = cfq_init_icq, | ||
4837 | .elevator_exit_icq_fn = cfq_exit_icq, | ||
4838 | .elevator_set_req_fn = cfq_set_request, | ||
4839 | .elevator_put_req_fn = cfq_put_request, | ||
4840 | .elevator_may_queue_fn = cfq_may_queue, | ||
4841 | .elevator_init_fn = cfq_init_queue, | ||
4842 | .elevator_exit_fn = cfq_exit_queue, | ||
4843 | .elevator_registered_fn = cfq_registered_queue, | ||
4844 | }, | ||
4845 | .icq_size = sizeof(struct cfq_io_cq), | ||
4846 | .icq_align = __alignof__(struct cfq_io_cq), | ||
4847 | .elevator_attrs = cfq_attrs, | ||
4848 | .elevator_name = "cfq", | ||
4849 | .elevator_owner = THIS_MODULE, | ||
4850 | }; | ||
4851 | |||
4852 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4853 | static struct blkcg_policy blkcg_policy_cfq = { | ||
4854 | .dfl_cftypes = cfq_blkcg_files, | ||
4855 | .legacy_cftypes = cfq_blkcg_legacy_files, | ||
4856 | |||
4857 | .cpd_alloc_fn = cfq_cpd_alloc, | ||
4858 | .cpd_init_fn = cfq_cpd_init, | ||
4859 | .cpd_free_fn = cfq_cpd_free, | ||
4860 | .cpd_bind_fn = cfq_cpd_bind, | ||
4861 | |||
4862 | .pd_alloc_fn = cfq_pd_alloc, | ||
4863 | .pd_init_fn = cfq_pd_init, | ||
4864 | .pd_offline_fn = cfq_pd_offline, | ||
4865 | .pd_free_fn = cfq_pd_free, | ||
4866 | .pd_reset_stats_fn = cfq_pd_reset_stats, | ||
4867 | }; | ||
4868 | #endif | ||
4869 | |||
4870 | static int __init cfq_init(void) | ||
4871 | { | ||
4872 | int ret; | ||
4873 | |||
4874 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4875 | ret = blkcg_policy_register(&blkcg_policy_cfq); | ||
4876 | if (ret) | ||
4877 | return ret; | ||
4878 | #else | ||
4879 | cfq_group_idle = 0; | ||
4880 | #endif | ||
4881 | |||
4882 | ret = -ENOMEM; | ||
4883 | cfq_pool = KMEM_CACHE(cfq_queue, 0); | ||
4884 | if (!cfq_pool) | ||
4885 | goto err_pol_unreg; | ||
4886 | |||
4887 | ret = elv_register(&iosched_cfq); | ||
4888 | if (ret) | ||
4889 | goto err_free_pool; | ||
4890 | |||
4891 | return 0; | ||
4892 | |||
4893 | err_free_pool: | ||
4894 | kmem_cache_destroy(cfq_pool); | ||
4895 | err_pol_unreg: | ||
4896 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4897 | blkcg_policy_unregister(&blkcg_policy_cfq); | ||
4898 | #endif | ||
4899 | return ret; | ||
4900 | } | ||
4901 | |||
4902 | static void __exit cfq_exit(void) | ||
4903 | { | ||
4904 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4905 | blkcg_policy_unregister(&blkcg_policy_cfq); | ||
4906 | #endif | ||
4907 | elv_unregister(&iosched_cfq); | ||
4908 | kmem_cache_destroy(cfq_pool); | ||
4909 | } | ||
4910 | |||
4911 | module_init(cfq_init); | ||
4912 | module_exit(cfq_exit); | ||
4913 | |||
4914 | MODULE_AUTHOR("Jens Axboe"); | ||
4915 | MODULE_LICENSE("GPL"); | ||
4916 | MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); | ||
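Note on the tunables deleted above: cfqd keeps its time-based settings (slice_idle, slice_sync, target_latency, ...) in nanoseconds, while the sysfs files generated by SHOW_FUNCTION/STORE_FUNCTION expose them in milliseconds and the *_us variants generated by USEC_SHOW_FUNCTION/USEC_STORE_FUNCTION expose the very same fields in microseconds. The following stand-alone userspace sketch models that round-trip only; it is not kernel code, store_msec/show_msec/show_usec are hypothetical names, and only the NSEC_PER_MSEC/NSEC_PER_USEC constants and the clamp-then-convert shape are taken from the macros above.

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define NSEC_PER_MSEC 1000000ULL
#define NSEC_PER_USEC 1000ULL

/*
 * Store path: clamp a value written in milliseconds and keep it in
 * nanoseconds, as STORE_FUNCTION(..., __CONV = 1) does for slice_idle,
 * slice_sync and friends.
 */
static uint64_t store_msec(unsigned int val, unsigned int min, unsigned int max)
{
	if (val < min)
		val = min;
	else if (val > max)
		val = max;
	return (uint64_t)val * NSEC_PER_MSEC;
}

/*
 * Show paths: present the nanosecond-resolution field in milliseconds
 * (SHOW_FUNCTION) or microseconds (USEC_SHOW_FUNCTION).
 */
static uint64_t show_msec(uint64_t ns) { return ns / NSEC_PER_MSEC; }
static uint64_t show_usec(uint64_t ns) { return ns / NSEC_PER_USEC; }

int main(void)
{
	/* e.g. writing "8" to slice_idle stores 8 ms as 8,000,000 ns */
	uint64_t slice_idle = store_msec(8, 0, UINT_MAX);

	printf("slice_idle    = %llu (ms view)\n",
	       (unsigned long long)show_msec(slice_idle));
	printf("slice_idle_us = %llu (us view)\n",
	       (unsigned long long)show_usec(slice_idle));
	return 0;
}

This is why, in the cfq_attrs[] table above, slice_idle and slice_idle_us (and the other *_us pairs) are two views of one underlying cfqd field rather than independent settings.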
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c deleted file mode 100644 index ef2f1f09e9b3..000000000000 --- a/block/deadline-iosched.c +++ /dev/null | |||
@@ -1,560 +0,0 @@ | |||
1 | /* | ||
2 | * Deadline i/o scheduler. | ||
3 | * | ||
4 | * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk> | ||
5 | */ | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/fs.h> | ||
8 | #include <linux/blkdev.h> | ||
9 | #include <linux/elevator.h> | ||
10 | #include <linux/bio.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/compiler.h> | ||
15 | #include <linux/rbtree.h> | ||
16 | |||
17 | /* | ||
18 | * See Documentation/block/deadline-iosched.txt | ||
19 | */ | ||
20 | static const int read_expire = HZ / 2; /* max time before a read is submitted. */ | ||
21 | static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ | ||
22 | static const int writes_starved = 2; /* max times reads can starve a write */ | ||
23 | static const int fifo_batch = 16; /* # of sequential requests treated as one | ||
24 | by the above parameters. For throughput. */ | ||
25 | |||
26 | struct deadline_data { | ||
27 | /* | ||
28 | * run time data | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * requests (deadline_rq s) are present on both sort_list and fifo_list | ||
33 | */ | ||
34 | struct rb_root sort_list[2]; | ||
35 | struct list_head fifo_list[2]; | ||
36 | |||
37 | /* | ||
38 | * next in sort order. read, write or both are NULL | ||
39 | */ | ||
40 | struct request *next_rq[2]; | ||
41 | unsigned int batching; /* number of sequential requests made */ | ||
42 | unsigned int starved; /* times reads have starved writes */ | ||
43 | |||
44 | /* | ||
45 | * settings that change how the i/o scheduler behaves | ||
46 | */ | ||
47 | int fifo_expire[2]; | ||
48 | int fifo_batch; | ||
49 | int writes_starved; | ||
50 | int front_merges; | ||
51 | }; | ||
52 | |||
53 | static inline struct rb_root * | ||
54 | deadline_rb_root(struct deadline_data *dd, struct request *rq) | ||
55 | { | ||
56 | return &dd->sort_list[rq_data_dir(rq)]; | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * get the request after `rq' in sector-sorted order | ||
61 | */ | ||
62 | static inline struct request * | ||
63 | deadline_latter_request(struct request *rq) | ||
64 | { | ||
65 | struct rb_node *node = rb_next(&rq->rb_node); | ||
66 | |||
67 | if (node) | ||
68 | return rb_entry_rq(node); | ||
69 | |||
70 | return NULL; | ||
71 | } | ||
72 | |||
73 | static void | ||
74 | deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) | ||
75 | { | ||
76 | struct rb_root *root = deadline_rb_root(dd, rq); | ||
77 | |||
78 | elv_rb_add(root, rq); | ||
79 | } | ||
80 | |||
81 | static inline void | ||
82 | deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) | ||
83 | { | ||
84 | const int data_dir = rq_data_dir(rq); | ||
85 | |||
86 | if (dd->next_rq[data_dir] == rq) | ||
87 | dd->next_rq[data_dir] = deadline_latter_request(rq); | ||
88 | |||
89 | elv_rb_del(deadline_rb_root(dd, rq), rq); | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * add rq to rbtree and fifo | ||
94 | */ | ||
95 | static void | ||
96 | deadline_add_request(struct request_queue *q, struct request *rq) | ||
97 | { | ||
98 | struct deadline_data *dd = q->elevator->elevator_data; | ||
99 | const int data_dir = rq_data_dir(rq); | ||
100 | |||
101 | /* | ||
102 | * This may be a requeue of a write request that has locked its | ||
103 | * target zone. If it is the case, this releases the zone lock. | ||
104 | */ | ||
105 | blk_req_zone_write_unlock(rq); | ||
106 | |||
107 | deadline_add_rq_rb(dd, rq); | ||
108 | |||
109 | /* | ||
110 | * set expire time and add to fifo list | ||
111 | */ | ||
112 | rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; | ||
113 | list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * remove rq from rbtree and fifo. | ||
118 | */ | ||
119 | static void deadline_remove_request(struct request_queue *q, struct request *rq) | ||
120 | { | ||
121 | struct deadline_data *dd = q->elevator->elevator_data; | ||
122 | |||
123 | rq_fifo_clear(rq); | ||
124 | deadline_del_rq_rb(dd, rq); | ||
125 | } | ||
126 | |||
127 | static enum elv_merge | ||
128 | deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) | ||
129 | { | ||
130 | struct deadline_data *dd = q->elevator->elevator_data; | ||
131 | struct request *__rq; | ||
132 | |||
133 | /* | ||
134 | * check for front merge | ||
135 | */ | ||
136 | if (dd->front_merges) { | ||
137 | sector_t sector = bio_end_sector(bio); | ||
138 | |||
139 | __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); | ||
140 | if (__rq) { | ||
141 | BUG_ON(sector != blk_rq_pos(__rq)); | ||
142 | |||
143 | if (elv_bio_merge_ok(__rq, bio)) { | ||
144 | *req = __rq; | ||
145 | return ELEVATOR_FRONT_MERGE; | ||
146 | } | ||
147 | } | ||
148 | } | ||
149 | |||
150 | return ELEVATOR_NO_MERGE; | ||
151 | } | ||
152 | |||
153 | static void deadline_merged_request(struct request_queue *q, | ||
154 | struct request *req, enum elv_merge type) | ||
155 | { | ||
156 | struct deadline_data *dd = q->elevator->elevator_data; | ||
157 | |||
158 | /* | ||
159 | * if the merge was a front merge, we need to reposition request | ||
160 | */ | ||
161 | if (type == ELEVATOR_FRONT_MERGE) { | ||
162 | elv_rb_del(deadline_rb_root(dd, req), req); | ||
163 | deadline_add_rq_rb(dd, req); | ||
164 | } | ||
165 | } | ||
166 | |||
167 | static void | ||
168 | deadline_merged_requests(struct request_queue *q, struct request *req, | ||
169 | struct request *next) | ||
170 | { | ||
171 | /* | ||
172 | * if next expires before rq, assign its expire time to rq | ||
173 | * and move into next position (next will be deleted) in fifo | ||
174 | */ | ||
175 | if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { | ||
176 | if (time_before((unsigned long)next->fifo_time, | ||
177 | (unsigned long)req->fifo_time)) { | ||
178 | list_move(&req->queuelist, &next->queuelist); | ||
179 | req->fifo_time = next->fifo_time; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * kill knowledge of next, this one is a goner | ||
185 | */ | ||
186 | deadline_remove_request(q, next); | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * move request from sort list to dispatch queue. | ||
191 | */ | ||
192 | static inline void | ||
193 | deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) | ||
194 | { | ||
195 | struct request_queue *q = rq->q; | ||
196 | |||
197 | /* | ||
198 | * For a zoned block device, write requests must write lock their | ||
199 | * target zone. | ||
200 | */ | ||
201 | blk_req_zone_write_lock(rq); | ||
202 | |||
203 | deadline_remove_request(q, rq); | ||
204 | elv_dispatch_add_tail(q, rq); | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * move an entry to dispatch queue | ||
209 | */ | ||
210 | static void | ||
211 | deadline_move_request(struct deadline_data *dd, struct request *rq) | ||
212 | { | ||
213 | const int data_dir = rq_data_dir(rq); | ||
214 | |||
215 | dd->next_rq[READ] = NULL; | ||
216 | dd->next_rq[WRITE] = NULL; | ||
217 | dd->next_rq[data_dir] = deadline_latter_request(rq); | ||
218 | |||
219 | /* | ||
220 | * take it off the sort and fifo list, move | ||
221 | * to dispatch queue | ||
222 | */ | ||
223 | deadline_move_to_dispatch(dd, rq); | ||
224 | } | ||
225 | |||
226 | /* | ||
227 | * deadline_check_fifo returns 0 if there are no expired requests on the fifo, | ||
228 | * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) | ||
229 | */ | ||
230 | static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) | ||
231 | { | ||
232 | struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); | ||
233 | |||
234 | /* | ||
235 | * rq is expired! | ||
236 | */ | ||
237 | if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) | ||
238 | return 1; | ||
239 | |||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * For the specified data direction, return the next request to dispatch using | ||
245 | * arrival ordered lists. | ||
246 | */ | ||
247 | static struct request * | ||
248 | deadline_fifo_request(struct deadline_data *dd, int data_dir) | ||
249 | { | ||
250 | struct request *rq; | ||
251 | |||
252 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
253 | return NULL; | ||
254 | |||
255 | if (list_empty(&dd->fifo_list[data_dir])) | ||
256 | return NULL; | ||
257 | |||
258 | rq = rq_entry_fifo(dd->fifo_list[data_dir].next); | ||
259 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
260 | return rq; | ||
261 | |||
262 | /* | ||
263 | * Look for a write request that can be dispatched, that is one with | ||
264 | * an unlocked target zone. | ||
265 | */ | ||
266 | list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { | ||
267 | if (blk_req_can_dispatch_to_zone(rq)) | ||
268 | return rq; | ||
269 | } | ||
270 | |||
271 | return NULL; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * For the specified data direction, return the next request to dispatch using | ||
276 | * sector position sorted lists. | ||
277 | */ | ||
278 | static struct request * | ||
279 | deadline_next_request(struct deadline_data *dd, int data_dir) | ||
280 | { | ||
281 | struct request *rq; | ||
282 | |||
283 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
284 | return NULL; | ||
285 | |||
286 | rq = dd->next_rq[data_dir]; | ||
287 | if (!rq) | ||
288 | return NULL; | ||
289 | |||
290 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
291 | return rq; | ||
292 | |||
293 | /* | ||
294 | * Look for a write request that can be dispatched, that is one with | ||
295 | * an unlocked target zone. | ||
296 | */ | ||
297 | while (rq) { | ||
298 | if (blk_req_can_dispatch_to_zone(rq)) | ||
299 | return rq; | ||
300 | rq = deadline_latter_request(rq); | ||
301 | } | ||
302 | |||
303 | return NULL; | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * deadline_dispatch_requests selects the best request according to | ||
308 | * read/write expire, fifo_batch, etc | ||
309 | */ | ||
310 | static int deadline_dispatch_requests(struct request_queue *q, int force) | ||
311 | { | ||
312 | struct deadline_data *dd = q->elevator->elevator_data; | ||
313 | const int reads = !list_empty(&dd->fifo_list[READ]); | ||
314 | const int writes = !list_empty(&dd->fifo_list[WRITE]); | ||
315 | struct request *rq, *next_rq; | ||
316 | int data_dir; | ||
317 | |||
318 | /* | ||
319 | * batches are currently reads XOR writes | ||
320 | */ | ||
321 | rq = deadline_next_request(dd, WRITE); | ||
322 | if (!rq) | ||
323 | rq = deadline_next_request(dd, READ); | ||
324 | |||
325 | if (rq && dd->batching < dd->fifo_batch) | ||
326 | /* we have a next request and are still entitled to batch */ | ||
327 | goto dispatch_request; | ||
328 | |||
329 | /* | ||
330 | * at this point we are not running a batch. select the appropriate | ||
331 | * data direction (read / write) | ||
332 | */ | ||
333 | |||
334 | if (reads) { | ||
335 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); | ||
336 | |||
337 | if (deadline_fifo_request(dd, WRITE) && | ||
338 | (dd->starved++ >= dd->writes_starved)) | ||
339 | goto dispatch_writes; | ||
340 | |||
341 | data_dir = READ; | ||
342 | |||
343 | goto dispatch_find_request; | ||
344 | } | ||
345 | |||
346 | /* | ||
347 | * there are either no reads or writes have been starved | ||
348 | */ | ||
349 | |||
350 | if (writes) { | ||
351 | dispatch_writes: | ||
352 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); | ||
353 | |||
354 | dd->starved = 0; | ||
355 | |||
356 | data_dir = WRITE; | ||
357 | |||
358 | goto dispatch_find_request; | ||
359 | } | ||
360 | |||
361 | return 0; | ||
362 | |||
363 | dispatch_find_request: | ||
364 | /* | ||
365 | * we are not running a batch, find best request for selected data_dir | ||
366 | */ | ||
367 | next_rq = deadline_next_request(dd, data_dir); | ||
368 | if (deadline_check_fifo(dd, data_dir) || !next_rq) { | ||
369 | /* | ||
370 | * A deadline has expired, the last request was in the other | ||
371 | * direction, or we have run out of higher-sectored requests. | ||
372 | * Start again from the request with the earliest expiry time. | ||
373 | */ | ||
374 | rq = deadline_fifo_request(dd, data_dir); | ||
375 | } else { | ||
376 | /* | ||
377 | * The last req was the same dir and we have a next request in | ||
378 | * sort order. No expired requests so continue on from here. | ||
379 | */ | ||
380 | rq = next_rq; | ||
381 | } | ||
382 | |||
383 | /* | ||
384 | * For a zoned block device, if we only have writes queued and none of | ||
385 | * them can be dispatched, rq will be NULL. | ||
386 | */ | ||
387 | if (!rq) | ||
388 | return 0; | ||
389 | |||
390 | dd->batching = 0; | ||
391 | |||
392 | dispatch_request: | ||
393 | /* | ||
394 | * rq is the selected appropriate request. | ||
395 | */ | ||
396 | dd->batching++; | ||
397 | deadline_move_request(dd, rq); | ||
398 | |||
399 | return 1; | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * For zoned block devices, write unlock the target zone of completed | ||
404 | * write requests. | ||
405 | */ | ||
406 | static void | ||
407 | deadline_completed_request(struct request_queue *q, struct request *rq) | ||
408 | { | ||
409 | blk_req_zone_write_unlock(rq); | ||
410 | } | ||
411 | |||
412 | static void deadline_exit_queue(struct elevator_queue *e) | ||
413 | { | ||
414 | struct deadline_data *dd = e->elevator_data; | ||
415 | |||
416 | BUG_ON(!list_empty(&dd->fifo_list[READ])); | ||
417 | BUG_ON(!list_empty(&dd->fifo_list[WRITE])); | ||
418 | |||
419 | kfree(dd); | ||
420 | } | ||
421 | |||
422 | /* | ||
423 | * initialize elevator private data (deadline_data). | ||
424 | */ | ||
425 | static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) | ||
426 | { | ||
427 | struct deadline_data *dd; | ||
428 | struct elevator_queue *eq; | ||
429 | |||
430 | eq = elevator_alloc(q, e); | ||
431 | if (!eq) | ||
432 | return -ENOMEM; | ||
433 | |||
434 | dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); | ||
435 | if (!dd) { | ||
436 | kobject_put(&eq->kobj); | ||
437 | return -ENOMEM; | ||
438 | } | ||
439 | eq->elevator_data = dd; | ||
440 | |||
441 | INIT_LIST_HEAD(&dd->fifo_list[READ]); | ||
442 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); | ||
443 | dd->sort_list[READ] = RB_ROOT; | ||
444 | dd->sort_list[WRITE] = RB_ROOT; | ||
445 | dd->fifo_expire[READ] = read_expire; | ||
446 | dd->fifo_expire[WRITE] = write_expire; | ||
447 | dd->writes_starved = writes_starved; | ||
448 | dd->front_merges = 1; | ||
449 | dd->fifo_batch = fifo_batch; | ||
450 | |||
451 | spin_lock_irq(q->queue_lock); | ||
452 | q->elevator = eq; | ||
453 | spin_unlock_irq(q->queue_lock); | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * sysfs parts below | ||
459 | */ | ||
460 | |||
461 | static ssize_t | ||
462 | deadline_var_show(int var, char *page) | ||
463 | { | ||
464 | return sprintf(page, "%d\n", var); | ||
465 | } | ||
466 | |||
467 | static void | ||
468 | deadline_var_store(int *var, const char *page) | ||
469 | { | ||
470 | char *p = (char *) page; | ||
471 | |||
472 | *var = simple_strtol(p, &p, 10); | ||
473 | } | ||
474 | |||
475 | #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ | ||
476 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
477 | { \ | ||
478 | struct deadline_data *dd = e->elevator_data; \ | ||
479 | int __data = __VAR; \ | ||
480 | if (__CONV) \ | ||
481 | __data = jiffies_to_msecs(__data); \ | ||
482 | return deadline_var_show(__data, (page)); \ | ||
483 | } | ||
484 | SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); | ||
485 | SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); | ||
486 | SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); | ||
487 | SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); | ||
488 | SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); | ||
489 | #undef SHOW_FUNCTION | ||
490 | |||
491 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | ||
492 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | ||
493 | { \ | ||
494 | struct deadline_data *dd = e->elevator_data; \ | ||
495 | int __data; \ | ||
496 | deadline_var_store(&__data, (page)); \ | ||
497 | if (__data < (MIN)) \ | ||
498 | __data = (MIN); \ | ||
499 | else if (__data > (MAX)) \ | ||
500 | __data = (MAX); \ | ||
501 | if (__CONV) \ | ||
502 | *(__PTR) = msecs_to_jiffies(__data); \ | ||
503 | else \ | ||
504 | *(__PTR) = __data; \ | ||
505 | return count; \ | ||
506 | } | ||
507 | STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); | ||
508 | STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); | ||
509 | STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); | ||
510 | STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); | ||
511 | STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); | ||
512 | #undef STORE_FUNCTION | ||
513 | |||
514 | #define DD_ATTR(name) \ | ||
515 | __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) | ||
516 | |||
517 | static struct elv_fs_entry deadline_attrs[] = { | ||
518 | DD_ATTR(read_expire), | ||
519 | DD_ATTR(write_expire), | ||
520 | DD_ATTR(writes_starved), | ||
521 | DD_ATTR(front_merges), | ||
522 | DD_ATTR(fifo_batch), | ||
523 | __ATTR_NULL | ||
524 | }; | ||
525 | |||
526 | static struct elevator_type iosched_deadline = { | ||
527 | .ops.sq = { | ||
528 | .elevator_merge_fn = deadline_merge, | ||
529 | .elevator_merged_fn = deadline_merged_request, | ||
530 | .elevator_merge_req_fn = deadline_merged_requests, | ||
531 | .elevator_dispatch_fn = deadline_dispatch_requests, | ||
532 | .elevator_completed_req_fn = deadline_completed_request, | ||
533 | .elevator_add_req_fn = deadline_add_request, | ||
534 | .elevator_former_req_fn = elv_rb_former_request, | ||
535 | .elevator_latter_req_fn = elv_rb_latter_request, | ||
536 | .elevator_init_fn = deadline_init_queue, | ||
537 | .elevator_exit_fn = deadline_exit_queue, | ||
538 | }, | ||
539 | |||
540 | .elevator_attrs = deadline_attrs, | ||
541 | .elevator_name = "deadline", | ||
542 | .elevator_owner = THIS_MODULE, | ||
543 | }; | ||
544 | |||
545 | static int __init deadline_init(void) | ||
546 | { | ||
547 | return elv_register(&iosched_deadline); | ||
548 | } | ||
549 | |||
550 | static void __exit deadline_exit(void) | ||
551 | { | ||
552 | elv_unregister(&iosched_deadline); | ||
553 | } | ||
554 | |||
555 | module_init(deadline_init); | ||
556 | module_exit(deadline_exit); | ||
557 | |||
558 | MODULE_AUTHOR("Jens Axboe"); | ||
559 | MODULE_LICENSE("GPL"); | ||
560 | MODULE_DESCRIPTION("deadline IO scheduler"); | ||
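Note on the dispatch logic deleted above: deadline_dispatch_requests() keeps serving the current direction while batching stays below fifo_batch, otherwise prefers reads, and only switches to writes once starved reaches writes_starved (or no reads are pending); within the chosen direction an expired FIFO head restarts service from the FIFO, else dispatch continues in sector order. The stand-alone userspace sketch below models just the direction choice; it is a simplified model under stated assumptions, not the kernel code, all names are hypothetical, and the per-request FIFO expiry and sector sorting are deliberately left out.

#include <stdbool.h>
#include <stdio.h>

enum { D_READ = 0, D_WRITE = 1 };

struct dd_model {
	bool pending[2];	/* requests queued per direction (fifo_list non-empty) */
	int  cur_dir;		/* direction of the running batch, or -1 */
	int  batching, fifo_batch;
	int  starved, writes_starved;
};

/*
 * Pick the next direction to serve, mirroring the shape of
 * deadline_dispatch_requests(): keep the current batch going while it
 * is still entitled to run, otherwise prefer reads but hand the queue
 * to writes once they have been starved writes_starved times.
 */
static int pick_direction(struct dd_model *dd)
{
	if (dd->cur_dir >= 0 && dd->pending[dd->cur_dir] &&
	    dd->batching < dd->fifo_batch)
		return dd->cur_dir;

	if (dd->pending[D_READ]) {
		if (dd->pending[D_WRITE] &&
		    dd->starved++ >= dd->writes_starved)
			goto dispatch_writes;
		return D_READ;
	}

	if (dd->pending[D_WRITE]) {
dispatch_writes:
		dd->starved = 0;
		return D_WRITE;
	}

	return -1;	/* nothing to do */
}

int main(void)
{
	struct dd_model dd = {
		.pending = { true, true },	/* both FIFOs stay populated */
		.cur_dir = -1,
		.fifo_batch = 16,
		.writes_starved = 2,
	};

	for (int i = 0; i < 6; i++) {
		int dir = pick_direction(&dd);

		dd.cur_dir = dir;
		dd.batching = dd.fifo_batch;	/* pretend each batch runs to its limit */
		printf("batch %d -> %s\n", i, dir == D_WRITE ? "writes" : "reads");
	}
	return 0;
}

With the defaults kept in the file above (writes_starved = 2), the model prints two read batches followed by one write batch, which is exactly the "max times reads can starve a write" behaviour the deleted constants describe.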
diff --git a/block/elevator.c b/block/elevator.c index 8fdcd64ae12e..f05e90d4e695 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -61,10 +61,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) | |||
61 | struct request_queue *q = rq->q; | 61 | struct request_queue *q = rq->q; |
62 | struct elevator_queue *e = q->elevator; | 62 | struct elevator_queue *e = q->elevator; |
63 | 63 | ||
64 | if (e->uses_mq && e->type->ops.mq.allow_merge) | 64 | if (e->type->ops.allow_merge) |
65 | return e->type->ops.mq.allow_merge(q, rq, bio); | 65 | return e->type->ops.allow_merge(q, rq, bio); |
66 | else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn) | ||
67 | return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); | ||
68 | 66 | ||
69 | return 1; | 67 | return 1; |
70 | } | 68 | } |
@@ -95,14 +93,14 @@ static bool elevator_match(const struct elevator_type *e, const char *name) | |||
95 | } | 93 | } |
96 | 94 | ||
97 | /* | 95 | /* |
98 | * Return scheduler with name 'name' and with matching 'mq capability | 96 | * Return scheduler with name 'name' |
99 | */ | 97 | */ |
100 | static struct elevator_type *elevator_find(const char *name, bool mq) | 98 | static struct elevator_type *elevator_find(const char *name) |
101 | { | 99 | { |
102 | struct elevator_type *e; | 100 | struct elevator_type *e; |
103 | 101 | ||
104 | list_for_each_entry(e, &elv_list, list) { | 102 | list_for_each_entry(e, &elv_list, list) { |
105 | if (elevator_match(e, name) && (mq == e->uses_mq)) | 103 | if (elevator_match(e, name)) |
106 | return e; | 104 | return e; |
107 | } | 105 | } |
108 | 106 | ||
@@ -121,12 +119,12 @@ static struct elevator_type *elevator_get(struct request_queue *q, | |||
121 | 119 | ||
122 | spin_lock(&elv_list_lock); | 120 | spin_lock(&elv_list_lock); |
123 | 121 | ||
124 | e = elevator_find(name, q->mq_ops != NULL); | 122 | e = elevator_find(name); |
125 | if (!e && try_loading) { | 123 | if (!e && try_loading) { |
126 | spin_unlock(&elv_list_lock); | 124 | spin_unlock(&elv_list_lock); |
127 | request_module("%s-iosched", name); | 125 | request_module("%s-iosched", name); |
128 | spin_lock(&elv_list_lock); | 126 | spin_lock(&elv_list_lock); |
129 | e = elevator_find(name, q->mq_ops != NULL); | 127 | e = elevator_find(name); |
130 | } | 128 | } |
131 | 129 | ||
132 | if (e && !try_module_get(e->elevator_owner)) | 130 | if (e && !try_module_get(e->elevator_owner)) |
@@ -150,26 +148,6 @@ static int __init elevator_setup(char *str) | |||
150 | 148 | ||
151 | __setup("elevator=", elevator_setup); | 149 | __setup("elevator=", elevator_setup); |
152 | 150 | ||
153 | /* called during boot to load the elevator chosen by the elevator param */ | ||
154 | void __init load_default_elevator_module(void) | ||
155 | { | ||
156 | struct elevator_type *e; | ||
157 | |||
158 | if (!chosen_elevator[0]) | ||
159 | return; | ||
160 | |||
161 | /* | ||
162 | * Boot parameter is deprecated, we haven't supported that for MQ. | ||
163 | * Only look for non-mq schedulers from here. | ||
164 | */ | ||
165 | spin_lock(&elv_list_lock); | ||
166 | e = elevator_find(chosen_elevator, false); | ||
167 | spin_unlock(&elv_list_lock); | ||
168 | |||
169 | if (!e) | ||
170 | request_module("%s-iosched", chosen_elevator); | ||
171 | } | ||
172 | |||
173 | static struct kobj_type elv_ktype; | 151 | static struct kobj_type elv_ktype; |
174 | 152 | ||
175 | struct elevator_queue *elevator_alloc(struct request_queue *q, | 153 | struct elevator_queue *elevator_alloc(struct request_queue *q, |
@@ -185,7 +163,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, | |||
185 | kobject_init(&eq->kobj, &elv_ktype); | 163 | kobject_init(&eq->kobj, &elv_ktype); |
186 | mutex_init(&eq->sysfs_lock); | 164 | mutex_init(&eq->sysfs_lock); |
187 | hash_init(eq->hash); | 165 | hash_init(eq->hash); |
188 | eq->uses_mq = e->uses_mq; | ||
189 | 166 | ||
190 | return eq; | 167 | return eq; |
191 | } | 168 | } |
@@ -200,54 +177,11 @@ static void elevator_release(struct kobject *kobj) | |||
200 | kfree(e); | 177 | kfree(e); |
201 | } | 178 | } |
202 | 179 | ||
203 | /* | ||
204 | * Use the default elevator specified by config boot param for non-mq devices, | ||
205 | * or by config option. Don't try to load modules as we could be running off | ||
206 | * async and request_module() isn't allowed from async. | ||
207 | */ | ||
208 | int elevator_init(struct request_queue *q) | ||
209 | { | ||
210 | struct elevator_type *e = NULL; | ||
211 | int err = 0; | ||
212 | |||
213 | /* | ||
214 | * q->sysfs_lock must be held to provide mutual exclusion between | ||
215 | * elevator_switch() and here. | ||
216 | */ | ||
217 | mutex_lock(&q->sysfs_lock); | ||
218 | if (unlikely(q->elevator)) | ||
219 | goto out_unlock; | ||
220 | |||
221 | if (*chosen_elevator) { | ||
222 | e = elevator_get(q, chosen_elevator, false); | ||
223 | if (!e) | ||
224 | printk(KERN_ERR "I/O scheduler %s not found\n", | ||
225 | chosen_elevator); | ||
226 | } | ||
227 | |||
228 | if (!e) | ||
229 | e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); | ||
230 | if (!e) { | ||
231 | printk(KERN_ERR | ||
232 | "Default I/O scheduler not found. Using noop.\n"); | ||
233 | e = elevator_get(q, "noop", false); | ||
234 | } | ||
235 | |||
236 | err = e->ops.sq.elevator_init_fn(q, e); | ||
237 | if (err) | ||
238 | elevator_put(e); | ||
239 | out_unlock: | ||
240 | mutex_unlock(&q->sysfs_lock); | ||
241 | return err; | ||
242 | } | ||
243 | |||
244 | void elevator_exit(struct request_queue *q, struct elevator_queue *e) | 180 | void elevator_exit(struct request_queue *q, struct elevator_queue *e) |
245 | { | 181 | { |
246 | mutex_lock(&e->sysfs_lock); | 182 | mutex_lock(&e->sysfs_lock); |
247 | if (e->uses_mq && e->type->ops.mq.exit_sched) | 183 | if (e->type->ops.exit_sched) |
248 | blk_mq_exit_sched(q, e); | 184 | blk_mq_exit_sched(q, e); |
249 | else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) | ||
250 | e->type->ops.sq.elevator_exit_fn(e); | ||
251 | mutex_unlock(&e->sysfs_lock); | 185 | mutex_unlock(&e->sysfs_lock); |
252 | 186 | ||
253 | kobject_put(&e->kobj); | 187 | kobject_put(&e->kobj); |
@@ -356,68 +290,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector) | |||
356 | } | 290 | } |
357 | EXPORT_SYMBOL(elv_rb_find); | 291 | EXPORT_SYMBOL(elv_rb_find); |
358 | 292 | ||
359 | /* | ||
360 | * Insert rq into dispatch queue of q. Queue lock must be held on | ||
361 | * entry. rq is sorted into the dispatch queue. To be used by | ||
362 | * specific elevators. | ||
363 | */ | ||
364 | void elv_dispatch_sort(struct request_queue *q, struct request *rq) | ||
365 | { | ||
366 | sector_t boundary; | ||
367 | struct list_head *entry; | ||
368 | |||
369 | if (q->last_merge == rq) | ||
370 | q->last_merge = NULL; | ||
371 | |||
372 | elv_rqhash_del(q, rq); | ||
373 | |||
374 | q->nr_sorted--; | ||
375 | |||
376 | boundary = q->end_sector; | ||
377 | list_for_each_prev(entry, &q->queue_head) { | ||
378 | struct request *pos = list_entry_rq(entry); | ||
379 | |||
380 | if (req_op(rq) != req_op(pos)) | ||
381 | break; | ||
382 | if (rq_data_dir(rq) != rq_data_dir(pos)) | ||
383 | break; | ||
384 | if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) | ||
385 | break; | ||
386 | if (blk_rq_pos(rq) >= boundary) { | ||
387 | if (blk_rq_pos(pos) < boundary) | ||
388 | continue; | ||
389 | } else { | ||
390 | if (blk_rq_pos(pos) >= boundary) | ||
391 | break; | ||
392 | } | ||
393 | if (blk_rq_pos(rq) >= blk_rq_pos(pos)) | ||
394 | break; | ||
395 | } | ||
396 | |||
397 | list_add(&rq->queuelist, entry); | ||
398 | } | ||
399 | EXPORT_SYMBOL(elv_dispatch_sort); | ||
400 | |||
401 | /* | ||
402 | * Insert rq into dispatch queue of q. Queue lock must be held on | ||
403 | * entry. rq is added to the back of the dispatch queue. To be used by | ||
404 | * specific elevators. | ||
405 | */ | ||
406 | void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) | ||
407 | { | ||
408 | if (q->last_merge == rq) | ||
409 | q->last_merge = NULL; | ||
410 | |||
411 | elv_rqhash_del(q, rq); | ||
412 | |||
413 | q->nr_sorted--; | ||
414 | |||
415 | q->end_sector = rq_end_sector(rq); | ||
416 | q->boundary_rq = rq; | ||
417 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
418 | } | ||
419 | EXPORT_SYMBOL(elv_dispatch_add_tail); | ||
420 | |||
421 | enum elv_merge elv_merge(struct request_queue *q, struct request **req, | 293 | enum elv_merge elv_merge(struct request_queue *q, struct request **req, |
422 | struct bio *bio) | 294 | struct bio *bio) |
423 | { | 295 | { |
@@ -457,10 +329,8 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, | |||
457 | return ELEVATOR_BACK_MERGE; | 329 | return ELEVATOR_BACK_MERGE; |
458 | } | 330 | } |
459 | 331 | ||
460 | if (e->uses_mq && e->type->ops.mq.request_merge) | 332 | if (e->type->ops.request_merge) |
461 | return e->type->ops.mq.request_merge(q, req, bio); | 333 | return e->type->ops.request_merge(q, req, bio); |
462 | else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) | ||
463 | return e->type->ops.sq.elevator_merge_fn(q, req, bio); | ||
464 | 334 | ||
465 | return ELEVATOR_NO_MERGE; | 335 | return ELEVATOR_NO_MERGE; |
466 | } | 336 | } |
@@ -511,10 +381,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, | |||
511 | { | 381 | { |
512 | struct elevator_queue *e = q->elevator; | 382 | struct elevator_queue *e = q->elevator; |
513 | 383 | ||
514 | if (e->uses_mq && e->type->ops.mq.request_merged) | 384 | if (e->type->ops.request_merged) |
515 | e->type->ops.mq.request_merged(q, rq, type); | 385 | e->type->ops.request_merged(q, rq, type); |
516 | else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) | ||
517 | e->type->ops.sq.elevator_merged_fn(q, rq, type); | ||
518 | 386 | ||
519 | if (type == ELEVATOR_BACK_MERGE) | 387 | if (type == ELEVATOR_BACK_MERGE) |
520 | elv_rqhash_reposition(q, rq); | 388 | elv_rqhash_reposition(q, rq); |
@@ -526,176 +394,20 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, | |||
526 | struct request *next) | 394 | struct request *next) |
527 | { | 395 | { |
528 | struct elevator_queue *e = q->elevator; | 396 | struct elevator_queue *e = q->elevator; |
529 | bool next_sorted = false; | ||
530 | |||
531 | if (e->uses_mq && e->type->ops.mq.requests_merged) | ||
532 | e->type->ops.mq.requests_merged(q, rq, next); | ||
533 | else if (e->type->ops.sq.elevator_merge_req_fn) { | ||
534 | next_sorted = (__force bool)(next->rq_flags & RQF_SORTED); | ||
535 | if (next_sorted) | ||
536 | e->type->ops.sq.elevator_merge_req_fn(q, rq, next); | ||
537 | } | ||
538 | 397 | ||
539 | elv_rqhash_reposition(q, rq); | 398 | if (e->type->ops.requests_merged) |
540 | 399 | e->type->ops.requests_merged(q, rq, next); | |
541 | if (next_sorted) { | ||
542 | elv_rqhash_del(q, next); | ||
543 | q->nr_sorted--; | ||
544 | } | ||
545 | 400 | ||
401 | elv_rqhash_reposition(q, rq); | ||
546 | q->last_merge = rq; | 402 | q->last_merge = rq; |
547 | } | 403 | } |
548 | 404 | ||
549 | void elv_bio_merged(struct request_queue *q, struct request *rq, | ||
550 | struct bio *bio) | ||
551 | { | ||
552 | struct elevator_queue *e = q->elevator; | ||
553 | |||
554 | if (WARN_ON_ONCE(e->uses_mq)) | ||
555 | return; | ||
556 | |||
557 | if (e->type->ops.sq.elevator_bio_merged_fn) | ||
558 | e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); | ||
559 | } | ||
560 | |||
561 | void elv_requeue_request(struct request_queue *q, struct request *rq) | ||
562 | { | ||
563 | /* | ||
564 | * it already went through dequeue, we need to decrement the | ||
565 | * in_flight count again | ||
566 | */ | ||
567 | if (blk_account_rq(rq)) { | ||
568 | q->in_flight[rq_is_sync(rq)]--; | ||
569 | if (rq->rq_flags & RQF_SORTED) | ||
570 | elv_deactivate_rq(q, rq); | ||
571 | } | ||
572 | |||
573 | rq->rq_flags &= ~RQF_STARTED; | ||
574 | |||
575 | blk_pm_requeue_request(rq); | ||
576 | |||
577 | __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); | ||
578 | } | ||
579 | |||
580 | void elv_drain_elevator(struct request_queue *q) | ||
581 | { | ||
582 | struct elevator_queue *e = q->elevator; | ||
583 | static int printed; | ||
584 | |||
585 | if (WARN_ON_ONCE(e->uses_mq)) | ||
586 | return; | ||
587 | |||
588 | lockdep_assert_held(q->queue_lock); | ||
589 | |||
590 | while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) | ||
591 | ; | ||
592 | if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) { | ||
593 | printk(KERN_ERR "%s: forced dispatching is broken " | ||
594 | "(nr_sorted=%u), please report this\n", | ||
595 | q->elevator->type->elevator_name, q->nr_sorted); | ||
596 | } | ||
597 | } | ||
598 | |||
599 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) | ||
600 | { | ||
601 | trace_block_rq_insert(q, rq); | ||
602 | |||
603 | blk_pm_add_request(q, rq); | ||
604 | |||
605 | rq->q = q; | ||
606 | |||
607 | if (rq->rq_flags & RQF_SOFTBARRIER) { | ||
608 | /* barriers are scheduling boundary, update end_sector */ | ||
609 | if (!blk_rq_is_passthrough(rq)) { | ||
610 | q->end_sector = rq_end_sector(rq); | ||
611 | q->boundary_rq = rq; | ||
612 | } | ||
613 | } else if (!(rq->rq_flags & RQF_ELVPRIV) && | ||
614 | (where == ELEVATOR_INSERT_SORT || | ||
615 | where == ELEVATOR_INSERT_SORT_MERGE)) | ||
616 | where = ELEVATOR_INSERT_BACK; | ||
617 | |||
618 | switch (where) { | ||
619 | case ELEVATOR_INSERT_REQUEUE: | ||
620 | case ELEVATOR_INSERT_FRONT: | ||
621 | rq->rq_flags |= RQF_SOFTBARRIER; | ||
622 | list_add(&rq->queuelist, &q->queue_head); | ||
623 | break; | ||
624 | |||
625 | case ELEVATOR_INSERT_BACK: | ||
626 | rq->rq_flags |= RQF_SOFTBARRIER; | ||
627 | elv_drain_elevator(q); | ||
628 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
629 | /* | ||
630 | * We kick the queue here for the following reasons. | ||
631 | * - The elevator might have returned NULL previously | ||
632 | * to delay requests and returned them now. As the | ||
633 | * queue wasn't empty before this request, ll_rw_blk | ||
634 | * won't run the queue on return, resulting in hang. | ||
635 | * - Usually, back inserted requests won't be merged | ||
636 | * with anything. There's no point in delaying queue | ||
637 | * processing. | ||
638 | */ | ||
639 | __blk_run_queue(q); | ||
640 | break; | ||
641 | |||
642 | case ELEVATOR_INSERT_SORT_MERGE: | ||
643 | /* | ||
644 | * If we succeed in merging this request with one in the | ||
645 | * queue already, we are done - rq has now been freed, | ||
646 | * so no need to do anything further. | ||
647 | */ | ||
648 | if (elv_attempt_insert_merge(q, rq)) | ||
649 | break; | ||
650 | /* fall through */ | ||
651 | case ELEVATOR_INSERT_SORT: | ||
652 | BUG_ON(blk_rq_is_passthrough(rq)); | ||
653 | rq->rq_flags |= RQF_SORTED; | ||
654 | q->nr_sorted++; | ||
655 | if (rq_mergeable(rq)) { | ||
656 | elv_rqhash_add(q, rq); | ||
657 | if (!q->last_merge) | ||
658 | q->last_merge = rq; | ||
659 | } | ||
660 | |||
661 | /* | ||
662 | * Some ioscheds (cfq) run q->request_fn directly, so | ||
663 | * rq cannot be accessed after calling | ||
664 | * elevator_add_req_fn. | ||
665 | */ | ||
666 | q->elevator->type->ops.sq.elevator_add_req_fn(q, rq); | ||
667 | break; | ||
668 | |||
669 | case ELEVATOR_INSERT_FLUSH: | ||
670 | rq->rq_flags |= RQF_SOFTBARRIER; | ||
671 | blk_insert_flush(rq); | ||
672 | break; | ||
673 | default: | ||
674 | printk(KERN_ERR "%s: bad insertion point %d\n", | ||
675 | __func__, where); | ||
676 | BUG(); | ||
677 | } | ||
678 | } | ||
679 | EXPORT_SYMBOL(__elv_add_request); | ||
680 | |||
681 | void elv_add_request(struct request_queue *q, struct request *rq, int where) | ||
682 | { | ||
683 | unsigned long flags; | ||
684 | |||
685 | spin_lock_irqsave(q->queue_lock, flags); | ||
686 | __elv_add_request(q, rq, where); | ||
687 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
688 | } | ||
689 | EXPORT_SYMBOL(elv_add_request); | ||
690 | |||
691 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) | 405 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) |
692 | { | 406 | { |
693 | struct elevator_queue *e = q->elevator; | 407 | struct elevator_queue *e = q->elevator; |
694 | 408 | ||
695 | if (e->uses_mq && e->type->ops.mq.next_request) | 409 | if (e->type->ops.next_request) |
696 | return e->type->ops.mq.next_request(q, rq); | 410 | return e->type->ops.next_request(q, rq); |
697 | else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) | ||
698 | return e->type->ops.sq.elevator_latter_req_fn(q, rq); | ||
699 | 411 | ||
700 | return NULL; | 412 | return NULL; |
701 | } | 413 | } |
@@ -704,66 +416,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) | |||
704 | { | 416 | { |
705 | struct elevator_queue *e = q->elevator; | 417 | struct elevator_queue *e = q->elevator; |
706 | 418 | ||
707 | if (e->uses_mq && e->type->ops.mq.former_request) | 419 | if (e->type->ops.former_request) |
708 | return e->type->ops.mq.former_request(q, rq); | 420 | return e->type->ops.former_request(q, rq); |
709 | if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) | ||
710 | return e->type->ops.sq.elevator_former_req_fn(q, rq); | ||
711 | return NULL; | ||
712 | } | ||
713 | |||
714 | int elv_set_request(struct request_queue *q, struct request *rq, | ||
715 | struct bio *bio, gfp_t gfp_mask) | ||
716 | { | ||
717 | struct elevator_queue *e = q->elevator; | ||
718 | |||
719 | if (WARN_ON_ONCE(e->uses_mq)) | ||
720 | return 0; | ||
721 | |||
722 | if (e->type->ops.sq.elevator_set_req_fn) | ||
723 | return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); | ||
724 | return 0; | ||
725 | } | ||
726 | |||
727 | void elv_put_request(struct request_queue *q, struct request *rq) | ||
728 | { | ||
729 | struct elevator_queue *e = q->elevator; | ||
730 | |||
731 | if (WARN_ON_ONCE(e->uses_mq)) | ||
732 | return; | ||
733 | |||
734 | if (e->type->ops.sq.elevator_put_req_fn) | ||
735 | e->type->ops.sq.elevator_put_req_fn(rq); | ||
736 | } | ||
737 | |||
738 | int elv_may_queue(struct request_queue *q, unsigned int op) | ||
739 | { | ||
740 | struct elevator_queue *e = q->elevator; | ||
741 | |||
742 | if (WARN_ON_ONCE(e->uses_mq)) | ||
743 | return 0; | ||
744 | |||
745 | if (e->type->ops.sq.elevator_may_queue_fn) | ||
746 | return e->type->ops.sq.elevator_may_queue_fn(q, op); | ||
747 | |||
748 | return ELV_MQUEUE_MAY; | ||
749 | } | ||
750 | |||
751 | void elv_completed_request(struct request_queue *q, struct request *rq) | ||
752 | { | ||
753 | struct elevator_queue *e = q->elevator; | ||
754 | |||
755 | if (WARN_ON_ONCE(e->uses_mq)) | ||
756 | return; | ||
757 | 421 | ||
758 | /* | 422 | return NULL; |
759 | * request is released from the driver, io must be done | ||
760 | */ | ||
761 | if (blk_account_rq(rq)) { | ||
762 | q->in_flight[rq_is_sync(rq)]--; | ||
763 | if ((rq->rq_flags & RQF_SORTED) && | ||
764 | e->type->ops.sq.elevator_completed_req_fn) | ||
765 | e->type->ops.sq.elevator_completed_req_fn(q, rq); | ||
766 | } | ||
767 | } | 423 | } |
768 | 424 | ||
769 | #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) | 425 | #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) |
@@ -832,8 +488,6 @@ int elv_register_queue(struct request_queue *q) | |||
832 | } | 488 | } |
833 | kobject_uevent(&e->kobj, KOBJ_ADD); | 489 | kobject_uevent(&e->kobj, KOBJ_ADD); |
834 | e->registered = 1; | 490 | e->registered = 1; |
835 | if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) | ||
836 | e->type->ops.sq.elevator_registered_fn(q); | ||
837 | } | 491 | } |
838 | return error; | 492 | return error; |
839 | } | 493 | } |
@@ -873,7 +527,7 @@ int elv_register(struct elevator_type *e) | |||
873 | 527 | ||
874 | /* register, don't allow duplicate names */ | 528 | /* register, don't allow duplicate names */ |
875 | spin_lock(&elv_list_lock); | 529 | spin_lock(&elv_list_lock); |
876 | if (elevator_find(e->elevator_name, e->uses_mq)) { | 530 | if (elevator_find(e->elevator_name)) { |
877 | spin_unlock(&elv_list_lock); | 531 | spin_unlock(&elv_list_lock); |
878 | kmem_cache_destroy(e->icq_cache); | 532 | kmem_cache_destroy(e->icq_cache); |
879 | return -EBUSY; | 533 | return -EBUSY; |
@@ -881,12 +535,6 @@ int elv_register(struct elevator_type *e) | |||
881 | list_add_tail(&e->list, &elv_list); | 535 | list_add_tail(&e->list, &elv_list); |
882 | spin_unlock(&elv_list_lock); | 536 | spin_unlock(&elv_list_lock); |
883 | 537 | ||
884 | /* print pretty message */ | ||
885 | if (elevator_match(e, chosen_elevator) || | ||
886 | (!*chosen_elevator && | ||
887 | elevator_match(e, CONFIG_DEFAULT_IOSCHED))) | ||
888 | def = " (default)"; | ||
889 | |||
890 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, | 538 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, |
891 | def); | 539 | def); |
892 | return 0; | 540 | return 0; |
@@ -989,71 +637,17 @@ out_unlock: | |||
989 | */ | 637 | */ |
990 | static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | 638 | static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) |
991 | { | 639 | { |
992 | struct elevator_queue *old = q->elevator; | ||
993 | bool old_registered = false; | ||
994 | int err; | 640 | int err; |
995 | 641 | ||
996 | lockdep_assert_held(&q->sysfs_lock); | 642 | lockdep_assert_held(&q->sysfs_lock); |
997 | 643 | ||
998 | if (q->mq_ops) { | 644 | blk_mq_freeze_queue(q); |
999 | blk_mq_freeze_queue(q); | 645 | blk_mq_quiesce_queue(q); |
1000 | blk_mq_quiesce_queue(q); | ||
1001 | |||
1002 | err = elevator_switch_mq(q, new_e); | ||
1003 | |||
1004 | blk_mq_unquiesce_queue(q); | ||
1005 | blk_mq_unfreeze_queue(q); | ||
1006 | |||
1007 | return err; | ||
1008 | } | ||
1009 | |||
1010 | /* | ||
1011 | * Turn on BYPASS and drain all requests w/ elevator private data. | ||
1012 | * Block layer doesn't call into a quiesced elevator - all requests | ||
1013 | * are directly put on the dispatch list without elevator data | ||
1014 | * using INSERT_BACK. All requests have SOFTBARRIER set and no | ||
1015 | * merge happens either. | ||
1016 | */ | ||
1017 | if (old) { | ||
1018 | old_registered = old->registered; | ||
1019 | |||
1020 | blk_queue_bypass_start(q); | ||
1021 | |||
1022 | /* unregister and clear all auxiliary data of the old elevator */ | ||
1023 | if (old_registered) | ||
1024 | elv_unregister_queue(q); | ||
1025 | |||
1026 | ioc_clear_queue(q); | ||
1027 | } | ||
1028 | 646 | ||
1029 | /* allocate, init and register new elevator */ | 647 | err = elevator_switch_mq(q, new_e); |
1030 | err = new_e->ops.sq.elevator_init_fn(q, new_e); | ||
1031 | if (err) | ||
1032 | goto fail_init; | ||
1033 | |||
1034 | err = elv_register_queue(q); | ||
1035 | if (err) | ||
1036 | goto fail_register; | ||
1037 | |||
1038 | /* done, kill the old one and finish */ | ||
1039 | if (old) { | ||
1040 | elevator_exit(q, old); | ||
1041 | blk_queue_bypass_end(q); | ||
1042 | } | ||
1043 | 648 | ||
1044 | blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); | 649 | blk_mq_unquiesce_queue(q); |
1045 | 650 | blk_mq_unfreeze_queue(q); | |
1046 | return 0; | ||
1047 | |||
1048 | fail_register: | ||
1049 | elevator_exit(q, q->elevator); | ||
1050 | fail_init: | ||
1051 | /* switch failed, restore and re-register old elevator */ | ||
1052 | if (old) { | ||
1053 | q->elevator = old; | ||
1054 | elv_register_queue(q); | ||
1055 | blk_queue_bypass_end(q); | ||
1056 | } | ||
1057 | 651 | ||
1058 | return err; | 652 | return err; |
1059 | } | 653 | } |
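
With the legacy branch removed, elevator_switch() is reduced to the blk-mq freeze/quiesce bracket around elevator_switch_mq(). A minimal sketch of that bracket as a standalone pattern (the name demo_change_sched is illustrative; elevator_switch_mq() is the helper the patch already uses): freezing waits out requests that already hold a tag, quiescing stops further dispatches into the driver's ->queue_rq(), and both are undone in reverse order once the switch is done.

static int demo_change_sched(struct request_queue *q,
			     struct elevator_type *new_e)
{
	int ret;

	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);		/* drain requests already in flight */
	blk_mq_quiesce_queue(q);	/* stop new ->queue_rq() dispatches */

	ret = elevator_switch_mq(q, new_e);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	return ret;
}
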
@@ -1073,7 +667,7 @@ static int __elevator_change(struct request_queue *q, const char *name) | |||
1073 | /* | 667 | /* |
1074 | * Special case for mq, turn off scheduling | 668 | * Special case for mq, turn off scheduling |
1075 | */ | 669 | */ |
1076 | if (q->mq_ops && !strncmp(name, "none", 4)) | 670 | if (!strncmp(name, "none", 4)) |
1077 | return elevator_switch(q, NULL); | 671 | return elevator_switch(q, NULL); |
1078 | 672 | ||
1079 | strlcpy(elevator_name, name, sizeof(elevator_name)); | 673 | strlcpy(elevator_name, name, sizeof(elevator_name)); |
@@ -1091,8 +685,7 @@ static int __elevator_change(struct request_queue *q, const char *name) | |||
1091 | 685 | ||
1092 | static inline bool elv_support_iosched(struct request_queue *q) | 686 | static inline bool elv_support_iosched(struct request_queue *q) |
1093 | { | 687 | { |
1094 | if (q->mq_ops && q->tag_set && (q->tag_set->flags & | 688 | if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) |
1095 | BLK_MQ_F_NO_SCHED)) | ||
1096 | return false; | 689 | return false; |
1097 | return true; | 690 | return true; |
1098 | } | 691 | } |
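
elv_support_iosched() now only has to look at the tag_set flag, since every remaining queue is blk-mq. A hedged driver-side sketch of where BLK_MQ_F_NO_SCHED comes from (demo_mq_ops and the numeric values are placeholders, not from this patch): a driver that does its own queueing can set the flag in its tag_set, so no I/O scheduler is attached and the sysfs code above skips scheduler selection for that queue.

#include <linux/blk-mq.h>

static const struct blk_mq_ops demo_mq_ops;	/* driver callbacks elided */

static struct blk_mq_tag_set demo_tag_set = {
	.ops		= &demo_mq_ops,
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.flags		= BLK_MQ_F_NO_SCHED,	/* elv_support_iosched() -> false */
};
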
@@ -1102,7 +695,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, | |||
1102 | { | 695 | { |
1103 | int ret; | 696 | int ret; |
1104 | 697 | ||
1105 | if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q)) | 698 | if (!queue_is_mq(q) || !elv_support_iosched(q)) |
1106 | return count; | 699 | return count; |
1107 | 700 | ||
1108 | ret = __elevator_change(q, name); | 701 | ret = __elevator_change(q, name); |
@@ -1117,10 +710,9 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) | |||
1117 | struct elevator_queue *e = q->elevator; | 710 | struct elevator_queue *e = q->elevator; |
1118 | struct elevator_type *elv = NULL; | 711 | struct elevator_type *elv = NULL; |
1119 | struct elevator_type *__e; | 712 | struct elevator_type *__e; |
1120 | bool uses_mq = q->mq_ops != NULL; | ||
1121 | int len = 0; | 713 | int len = 0; |
1122 | 714 | ||
1123 | if (!queue_is_rq_based(q)) | 715 | if (!queue_is_mq(q)) |
1124 | return sprintf(name, "none\n"); | 716 | return sprintf(name, "none\n"); |
1125 | 717 | ||
1126 | if (!q->elevator) | 718 | if (!q->elevator) |
@@ -1130,19 +722,16 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) | |||
1130 | 722 | ||
1131 | spin_lock(&elv_list_lock); | 723 | spin_lock(&elv_list_lock); |
1132 | list_for_each_entry(__e, &elv_list, list) { | 724 | list_for_each_entry(__e, &elv_list, list) { |
1133 | if (elv && elevator_match(elv, __e->elevator_name) && | 725 | if (elv && elevator_match(elv, __e->elevator_name)) { |
1134 | (__e->uses_mq == uses_mq)) { | ||
1135 | len += sprintf(name+len, "[%s] ", elv->elevator_name); | 726 | len += sprintf(name+len, "[%s] ", elv->elevator_name); |
1136 | continue; | 727 | continue; |
1137 | } | 728 | } |
1138 | if (__e->uses_mq && q->mq_ops && elv_support_iosched(q)) | 729 | if (elv_support_iosched(q)) |
1139 | len += sprintf(name+len, "%s ", __e->elevator_name); | ||
1140 | else if (!__e->uses_mq && !q->mq_ops) | ||
1141 | len += sprintf(name+len, "%s ", __e->elevator_name); | 730 | len += sprintf(name+len, "%s ", __e->elevator_name); |
1142 | } | 731 | } |
1143 | spin_unlock(&elv_list_lock); | 732 | spin_unlock(&elv_list_lock); |
1144 | 733 | ||
1145 | if (q->mq_ops && q->elevator) | 734 | if (q->elevator) |
1146 | len += sprintf(name+len, "none"); | 735 | len += sprintf(name+len, "none"); |
1147 | 736 | ||
1148 | len += sprintf(len+name, "\n"); | 737 | len += sprintf(len+name, "\n"); |
diff --git a/block/genhd.c b/block/genhd.c index cff6bdf27226..1dd8fd6613b8 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -47,51 +47,64 @@ static void disk_release_events(struct gendisk *disk); | |||
47 | 47 | ||
48 | void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) | 48 | void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) |
49 | { | 49 | { |
50 | if (q->mq_ops) | 50 | if (queue_is_mq(q)) |
51 | return; | 51 | return; |
52 | 52 | ||
53 | atomic_inc(&part->in_flight[rw]); | 53 | part_stat_local_inc(part, in_flight[rw]); |
54 | if (part->partno) | 54 | if (part->partno) |
55 | atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); | 55 | part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]); |
56 | } | 56 | } |
57 | 57 | ||
58 | void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) | 58 | void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) |
59 | { | 59 | { |
60 | if (q->mq_ops) | 60 | if (queue_is_mq(q)) |
61 | return; | 61 | return; |
62 | 62 | ||
63 | atomic_dec(&part->in_flight[rw]); | 63 | part_stat_local_dec(part, in_flight[rw]); |
64 | if (part->partno) | 64 | if (part->partno) |
65 | atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); | 65 | part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); |
66 | } | 66 | } |
67 | 67 | ||
68 | void part_in_flight(struct request_queue *q, struct hd_struct *part, | 68 | unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) |
69 | unsigned int inflight[2]) | ||
70 | { | 69 | { |
71 | if (q->mq_ops) { | 70 | int cpu; |
72 | blk_mq_in_flight(q, part, inflight); | 71 | unsigned int inflight; |
73 | return; | 72 | |
73 | if (queue_is_mq(q)) { | ||
74 | return blk_mq_in_flight(q, part); | ||
74 | } | 75 | } |
75 | 76 | ||
76 | inflight[0] = atomic_read(&part->in_flight[0]) + | 77 | inflight = 0; |
77 | atomic_read(&part->in_flight[1]); | 78 | for_each_possible_cpu(cpu) { |
78 | if (part->partno) { | 79 | inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + |
79 | part = &part_to_disk(part)->part0; | 80 | part_stat_local_read_cpu(part, in_flight[1], cpu); |
80 | inflight[1] = atomic_read(&part->in_flight[0]) + | ||
81 | atomic_read(&part->in_flight[1]); | ||
82 | } | 81 | } |
82 | if ((int)inflight < 0) | ||
83 | inflight = 0; | ||
84 | |||
85 | return inflight; | ||
83 | } | 86 | } |
84 | 87 | ||
85 | void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, | 88 | void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, |
86 | unsigned int inflight[2]) | 89 | unsigned int inflight[2]) |
87 | { | 90 | { |
88 | if (q->mq_ops) { | 91 | int cpu; |
92 | |||
93 | if (queue_is_mq(q)) { | ||
89 | blk_mq_in_flight_rw(q, part, inflight); | 94 | blk_mq_in_flight_rw(q, part, inflight); |
90 | return; | 95 | return; |
91 | } | 96 | } |
92 | 97 | ||
93 | inflight[0] = atomic_read(&part->in_flight[0]); | 98 | inflight[0] = 0; |
94 | inflight[1] = atomic_read(&part->in_flight[1]); | 99 | inflight[1] = 0; |
100 | for_each_possible_cpu(cpu) { | ||
101 | inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); | ||
102 | inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); | ||
103 | } | ||
104 | if ((int)inflight[0] < 0) | ||
105 | inflight[0] = 0; | ||
106 | if ((int)inflight[1] < 0) | ||
107 | inflight[1] = 0; | ||
95 | } | 108 | } |
96 | 109 | ||
97 | struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) | 110 | struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) |
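
The hunk above moves the in-flight accounting from two atomics per partition to per-CPU counters that are only summed when somebody reads them. A small self-contained sketch of the same scheme with plain per-CPU variables (the demo_* names are illustrative): an I/O can be started on one CPU and completed on another, and the reader sums the CPUs without any locking, so a snapshot can momentarily see the decrement without the matching increment; that is why part_in_flight() and part_in_flight_rw() clamp negative totals to zero.

#include <linux/percpu.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(int, demo_in_flight);

static void demo_start_io(void)
{
	this_cpu_inc(demo_in_flight);		/* CPU that issues the I/O */
}

static void demo_end_io(void)
{
	this_cpu_dec(demo_in_flight);		/* possibly a different CPU */
}

static unsigned int demo_read_in_flight(void)
{
	int cpu, sum = 0;

	for_each_possible_cpu(cpu)
		sum += per_cpu(demo_in_flight, cpu);

	/* unsynchronized snapshot: clamp, as the hunk above does */
	return sum < 0 ? 0 : sum;
}
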
@@ -1325,8 +1338,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) | |||
1325 | struct disk_part_iter piter; | 1338 | struct disk_part_iter piter; |
1326 | struct hd_struct *hd; | 1339 | struct hd_struct *hd; |
1327 | char buf[BDEVNAME_SIZE]; | 1340 | char buf[BDEVNAME_SIZE]; |
1328 | unsigned int inflight[2]; | 1341 | unsigned int inflight; |
1329 | int cpu; | ||
1330 | 1342 | ||
1331 | /* | 1343 | /* |
1332 | if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) | 1344 | if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) |
@@ -1338,10 +1350,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) | |||
1338 | 1350 | ||
1339 | disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); | 1351 | disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); |
1340 | while ((hd = disk_part_iter_next(&piter))) { | 1352 | while ((hd = disk_part_iter_next(&piter))) { |
1341 | cpu = part_stat_lock(); | 1353 | inflight = part_in_flight(gp->queue, hd); |
1342 | part_round_stats(gp->queue, cpu, hd); | ||
1343 | part_stat_unlock(); | ||
1344 | part_in_flight(gp->queue, hd, inflight); | ||
1345 | seq_printf(seqf, "%4d %7d %s " | 1354 | seq_printf(seqf, "%4d %7d %s " |
1346 | "%lu %lu %lu %u " | 1355 | "%lu %lu %lu %u " |
1347 | "%lu %lu %lu %u " | 1356 | "%lu %lu %lu %u " |
@@ -1357,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) | |||
1357 | part_stat_read(hd, merges[STAT_WRITE]), | 1366 | part_stat_read(hd, merges[STAT_WRITE]), |
1358 | part_stat_read(hd, sectors[STAT_WRITE]), | 1367 | part_stat_read(hd, sectors[STAT_WRITE]), |
1359 | (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), | 1368 | (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), |
1360 | inflight[0], | 1369 | inflight, |
1361 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), | 1370 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), |
1362 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)), | 1371 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)), |
1363 | part_stat_read(hd, ios[STAT_DISCARD]), | 1372 | part_stat_read(hd, ios[STAT_DISCARD]), |
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index eccac01a10b6..ec6a04e01bc1 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c | |||
@@ -195,7 +195,7 @@ struct kyber_hctx_data { | |||
195 | unsigned int batching; | 195 | unsigned int batching; |
196 | struct kyber_ctx_queue *kcqs; | 196 | struct kyber_ctx_queue *kcqs; |
197 | struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; | 197 | struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; |
198 | wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; | 198 | struct sbq_wait domain_wait[KYBER_NUM_DOMAINS]; |
199 | struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; | 199 | struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; |
200 | atomic_t wait_index[KYBER_NUM_DOMAINS]; | 200 | atomic_t wait_index[KYBER_NUM_DOMAINS]; |
201 | }; | 201 | }; |
@@ -501,10 +501,11 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) | |||
501 | 501 | ||
502 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { | 502 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
503 | INIT_LIST_HEAD(&khd->rqs[i]); | 503 | INIT_LIST_HEAD(&khd->rqs[i]); |
504 | init_waitqueue_func_entry(&khd->domain_wait[i], | 504 | khd->domain_wait[i].sbq = NULL; |
505 | init_waitqueue_func_entry(&khd->domain_wait[i].wait, | ||
505 | kyber_domain_wake); | 506 | kyber_domain_wake); |
506 | khd->domain_wait[i].private = hctx; | 507 | khd->domain_wait[i].wait.private = hctx; |
507 | INIT_LIST_HEAD(&khd->domain_wait[i].entry); | 508 | INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); |
508 | atomic_set(&khd->wait_index[i], 0); | 509 | atomic_set(&khd->wait_index[i], 0); |
509 | } | 510 | } |
510 | 511 | ||
@@ -576,7 +577,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | |||
576 | { | 577 | { |
577 | struct kyber_hctx_data *khd = hctx->sched_data; | 578 | struct kyber_hctx_data *khd = hctx->sched_data; |
578 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); | 579 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); |
579 | struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; | 580 | struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; |
580 | unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); | 581 | unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); |
581 | struct list_head *rq_list = &kcq->rq_list[sched_domain]; | 582 | struct list_head *rq_list = &kcq->rq_list[sched_domain]; |
582 | bool merged; | 583 | bool merged; |
@@ -602,7 +603,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, | |||
602 | 603 | ||
603 | list_for_each_entry_safe(rq, next, rq_list, queuelist) { | 604 | list_for_each_entry_safe(rq, next, rq_list, queuelist) { |
604 | unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); | 605 | unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); |
605 | struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; | 606 | struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; |
606 | struct list_head *head = &kcq->rq_list[sched_domain]; | 607 | struct list_head *head = &kcq->rq_list[sched_domain]; |
607 | 608 | ||
608 | spin_lock(&kcq->lock); | 609 | spin_lock(&kcq->lock); |
@@ -611,7 +612,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, | |||
611 | else | 612 | else |
612 | list_move_tail(&rq->queuelist, head); | 613 | list_move_tail(&rq->queuelist, head); |
613 | sbitmap_set_bit(&khd->kcq_map[sched_domain], | 614 | sbitmap_set_bit(&khd->kcq_map[sched_domain], |
614 | rq->mq_ctx->index_hw); | 615 | rq->mq_ctx->index_hw[hctx->type]); |
615 | blk_mq_sched_request_inserted(rq); | 616 | blk_mq_sched_request_inserted(rq); |
616 | spin_unlock(&kcq->lock); | 617 | spin_unlock(&kcq->lock); |
617 | } | 618 | } |
@@ -698,12 +699,13 @@ static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd, | |||
698 | flush_busy_kcq, &data); | 699 | flush_busy_kcq, &data); |
699 | } | 700 | } |
700 | 701 | ||
701 | static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, | 702 | static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags, |
702 | void *key) | 703 | void *key) |
703 | { | 704 | { |
704 | struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); | 705 | struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); |
706 | struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait); | ||
705 | 707 | ||
706 | list_del_init(&wait->entry); | 708 | sbitmap_del_wait_queue(wait); |
707 | blk_mq_run_hw_queue(hctx, true); | 709 | blk_mq_run_hw_queue(hctx, true); |
708 | return 1; | 710 | return 1; |
709 | } | 711 | } |
@@ -714,7 +716,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, | |||
714 | { | 716 | { |
715 | unsigned int sched_domain = khd->cur_domain; | 717 | unsigned int sched_domain = khd->cur_domain; |
716 | struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; | 718 | struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; |
717 | wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; | 719 | struct sbq_wait *wait = &khd->domain_wait[sched_domain]; |
718 | struct sbq_wait_state *ws; | 720 | struct sbq_wait_state *ws; |
719 | int nr; | 721 | int nr; |
720 | 722 | ||
@@ -725,11 +727,11 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, | |||
725 | * run when one becomes available. Note that this is serialized on | 727 | * run when one becomes available. Note that this is serialized on |
726 | * khd->lock, but we still need to be careful about the waker. | 728 | * khd->lock, but we still need to be careful about the waker. |
727 | */ | 729 | */ |
728 | if (nr < 0 && list_empty_careful(&wait->entry)) { | 730 | if (nr < 0 && list_empty_careful(&wait->wait.entry)) { |
729 | ws = sbq_wait_ptr(domain_tokens, | 731 | ws = sbq_wait_ptr(domain_tokens, |
730 | &khd->wait_index[sched_domain]); | 732 | &khd->wait_index[sched_domain]); |
731 | khd->domain_ws[sched_domain] = ws; | 733 | khd->domain_ws[sched_domain] = ws; |
732 | add_wait_queue(&ws->wait, wait); | 734 | sbitmap_add_wait_queue(domain_tokens, ws, wait); |
733 | 735 | ||
734 | /* | 736 | /* |
735 | * Try again in case a token was freed before we got on the wait | 737 | * Try again in case a token was freed before we got on the wait |
@@ -745,10 +747,10 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, | |||
745 | * between the !list_empty_careful() check and us grabbing the lock, but | 747 | * between the !list_empty_careful() check and us grabbing the lock, but |
746 | * list_del_init() is okay with that. | 748 | * list_del_init() is okay with that. |
747 | */ | 749 | */ |
748 | if (nr >= 0 && !list_empty_careful(&wait->entry)) { | 750 | if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { |
749 | ws = khd->domain_ws[sched_domain]; | 751 | ws = khd->domain_ws[sched_domain]; |
750 | spin_lock_irq(&ws->wait.lock); | 752 | spin_lock_irq(&ws->wait.lock); |
751 | list_del_init(&wait->entry); | 753 | sbitmap_del_wait_queue(wait); |
752 | spin_unlock_irq(&ws->wait.lock); | 754 | spin_unlock_irq(&ws->wait.lock); |
753 | } | 755 | } |
754 | 756 | ||
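
The kyber changes above replace a bare wait_queue_entry_t with struct sbq_wait plus the sbitmap_add_wait_queue()/sbitmap_del_wait_queue() helpers, so the sbitmap_queue can keep count of its active waiters. A hedged sketch of the same pattern outside kyber (demo_waiter, demo_waiter_init and demo_get_token are made-up names): the wake callback passed to demo_waiter_init() is expected to call sbitmap_del_wait_queue() on the embedded sbq_wait, as kyber_domain_wake() now does.

#include <linux/sbitmap.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/atomic.h>

struct demo_waiter {
	struct sbq_wait wait;		/* wraps the wait_queue_entry */
	atomic_t wait_index;
};

static void demo_waiter_init(struct demo_waiter *w, wait_queue_func_t wake_fn,
			     void *private)
{
	w->wait.sbq = NULL;
	init_waitqueue_func_entry(&w->wait.wait, wake_fn);
	w->wait.wait.private = private;
	INIT_LIST_HEAD(&w->wait.wait.entry);
	atomic_set(&w->wait_index, 0);
}

static int demo_get_token(struct sbitmap_queue *tokens, struct demo_waiter *w)
{
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(tokens);
	if (nr < 0 && list_empty_careful(&w->wait.wait.entry)) {
		/* register as a waiter through the sbitmap helpers ... */
		ws = sbq_wait_ptr(tokens, &w->wait_index);
		sbitmap_add_wait_queue(tokens, ws, &w->wait);
		/* ... and retry in case a token was freed meanwhile */
		nr = __sbitmap_queue_get(tokens);
	}
	return nr;
}
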
@@ -951,7 +953,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ | |||
951 | { \ | 953 | { \ |
952 | struct blk_mq_hw_ctx *hctx = data; \ | 954 | struct blk_mq_hw_ctx *hctx = data; \ |
953 | struct kyber_hctx_data *khd = hctx->sched_data; \ | 955 | struct kyber_hctx_data *khd = hctx->sched_data; \ |
954 | wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ | 956 | wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \ |
955 | \ | 957 | \ |
956 | seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ | 958 | seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ |
957 | return 0; \ | 959 | return 0; \ |
@@ -1017,7 +1019,7 @@ static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { | |||
1017 | #endif | 1019 | #endif |
1018 | 1020 | ||
1019 | static struct elevator_type kyber_sched = { | 1021 | static struct elevator_type kyber_sched = { |
1020 | .ops.mq = { | 1022 | .ops = { |
1021 | .init_sched = kyber_init_sched, | 1023 | .init_sched = kyber_init_sched, |
1022 | .exit_sched = kyber_exit_sched, | 1024 | .exit_sched = kyber_exit_sched, |
1023 | .init_hctx = kyber_init_hctx, | 1025 | .init_hctx = kyber_init_hctx, |
@@ -1032,7 +1034,6 @@ static struct elevator_type kyber_sched = { | |||
1032 | .dispatch_request = kyber_dispatch_request, | 1034 | .dispatch_request = kyber_dispatch_request, |
1033 | .has_work = kyber_has_work, | 1035 | .has_work = kyber_has_work, |
1034 | }, | 1036 | }, |
1035 | .uses_mq = true, | ||
1036 | #ifdef CONFIG_BLK_DEBUG_FS | 1037 | #ifdef CONFIG_BLK_DEBUG_FS |
1037 | .queue_debugfs_attrs = kyber_queue_debugfs_attrs, | 1038 | .queue_debugfs_attrs = kyber_queue_debugfs_attrs, |
1038 | .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, | 1039 | .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, |
diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 099a9e05854c..14288f864e94 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c | |||
@@ -373,9 +373,16 @@ done: | |||
373 | 373 | ||
374 | /* | 374 | /* |
375 | * One confusing aspect here is that we get called for a specific | 375 | * One confusing aspect here is that we get called for a specific |
376 | * hardware queue, but we return a request that may not be for a | 376 | * hardware queue, but we may return a request that is for a |
377 | * different hardware queue. This is because mq-deadline has shared | 377 | * different hardware queue. This is because mq-deadline has shared |
378 | * state for all hardware queues, in terms of sorting, FIFOs, etc. | 378 | * state for all hardware queues, in terms of sorting, FIFOs, etc. |
379 | * | ||
380 | * For a zoned block device, __dd_dispatch_request() may return NULL | ||
381 | * if all the queued write requests are directed at zones that are already | ||
382 | * locked due to on-going write requests. In this case, make sure to mark | ||
383 | * the queue as needing a restart to ensure that the queue is run again | ||
384 | * and the pending writes dispatched once the target zones for the ongoing | ||
385 | * write requests are unlocked in dd_finish_request(). | ||
379 | */ | 386 | */ |
380 | static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) | 387 | static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) |
381 | { | 388 | { |
@@ -384,6 +391,9 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) | |||
384 | 391 | ||
385 | spin_lock(&dd->lock); | 392 | spin_lock(&dd->lock); |
386 | rq = __dd_dispatch_request(dd); | 393 | rq = __dd_dispatch_request(dd); |
394 | if (!rq && blk_queue_is_zoned(hctx->queue) && | ||
395 | !list_empty(&dd->fifo_list[WRITE])) | ||
396 | blk_mq_sched_mark_restart_hctx(hctx); | ||
387 | spin_unlock(&dd->lock); | 397 | spin_unlock(&dd->lock); |
388 | 398 | ||
389 | return rq; | 399 | return rq; |
@@ -761,7 +771,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { | |||
761 | #endif | 771 | #endif |
762 | 772 | ||
763 | static struct elevator_type mq_deadline = { | 773 | static struct elevator_type mq_deadline = { |
764 | .ops.mq = { | 774 | .ops = { |
765 | .insert_requests = dd_insert_requests, | 775 | .insert_requests = dd_insert_requests, |
766 | .dispatch_request = dd_dispatch_request, | 776 | .dispatch_request = dd_dispatch_request, |
767 | .prepare_request = dd_prepare_request, | 777 | .prepare_request = dd_prepare_request, |
@@ -777,7 +787,6 @@ static struct elevator_type mq_deadline = { | |||
777 | .exit_sched = dd_exit_queue, | 787 | .exit_sched = dd_exit_queue, |
778 | }, | 788 | }, |
779 | 789 | ||
780 | .uses_mq = true, | ||
781 | #ifdef CONFIG_BLK_DEBUG_FS | 790 | #ifdef CONFIG_BLK_DEBUG_FS |
782 | .queue_debugfs_attrs = deadline_queue_debugfs_attrs, | 791 | .queue_debugfs_attrs = deadline_queue_debugfs_attrs, |
783 | #endif | 792 | #endif |
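
Both kyber and mq-deadline drop .uses_mq and move their callbacks from .ops.mq to .ops, since struct elevator_type now only describes blk-mq schedulers. A hedged skeleton of what a registration looks like after this change (the demo_* scheduler is hypothetical and, with only init/exit callbacks, not a usable scheduler; a real one also needs insert_requests, dispatch_request and friends, as in the hunks above):

#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/module.h>

static int demo_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct elevator_queue *eq = elevator_alloc(q, e);

	if (!eq)
		return -ENOMEM;
	q->elevator = eq;
	return 0;
}

static void demo_exit_sched(struct elevator_queue *e)
{
	/* free per-scheduler data hung off e->elevator_data here */
}

static struct elevator_type demo_sched = {
	.ops = {			/* was .ops.mq = { ... }, no .uses_mq */
		.init_sched	= demo_init_sched,
		.exit_sched	= demo_exit_sched,
	},
	.elevator_name	= "demo",
	.elevator_owner	= THIS_MODULE,
};

static int __init demo_sched_init(void)
{
	return elv_register(&demo_sched);
}

static void __exit demo_sched_exit(void)
{
	elv_unregister(&demo_sched);
}

module_init(demo_sched_init);
module_exit(demo_sched_exit);
MODULE_LICENSE("GPL");
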
diff --git a/block/noop-iosched.c b/block/noop-iosched.c deleted file mode 100644 index 2d1b15d89b45..000000000000 --- a/block/noop-iosched.c +++ /dev/null | |||
@@ -1,124 +0,0 @@ | |||
1 | /* | ||
2 | * elevator noop | ||
3 | */ | ||
4 | #include <linux/blkdev.h> | ||
5 | #include <linux/elevator.h> | ||
6 | #include <linux/bio.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/init.h> | ||
10 | |||
11 | struct noop_data { | ||
12 | struct list_head queue; | ||
13 | }; | ||
14 | |||
15 | static void noop_merged_requests(struct request_queue *q, struct request *rq, | ||
16 | struct request *next) | ||
17 | { | ||
18 | list_del_init(&next->queuelist); | ||
19 | } | ||
20 | |||
21 | static int noop_dispatch(struct request_queue *q, int force) | ||
22 | { | ||
23 | struct noop_data *nd = q->elevator->elevator_data; | ||
24 | struct request *rq; | ||
25 | |||
26 | rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); | ||
27 | if (rq) { | ||
28 | list_del_init(&rq->queuelist); | ||
29 | elv_dispatch_sort(q, rq); | ||
30 | return 1; | ||
31 | } | ||
32 | return 0; | ||
33 | } | ||
34 | |||
35 | static void noop_add_request(struct request_queue *q, struct request *rq) | ||
36 | { | ||
37 | struct noop_data *nd = q->elevator->elevator_data; | ||
38 | |||
39 | list_add_tail(&rq->queuelist, &nd->queue); | ||
40 | } | ||
41 | |||
42 | static struct request * | ||
43 | noop_former_request(struct request_queue *q, struct request *rq) | ||
44 | { | ||
45 | struct noop_data *nd = q->elevator->elevator_data; | ||
46 | |||
47 | if (rq->queuelist.prev == &nd->queue) | ||
48 | return NULL; | ||
49 | return list_prev_entry(rq, queuelist); | ||
50 | } | ||
51 | |||
52 | static struct request * | ||
53 | noop_latter_request(struct request_queue *q, struct request *rq) | ||
54 | { | ||
55 | struct noop_data *nd = q->elevator->elevator_data; | ||
56 | |||
57 | if (rq->queuelist.next == &nd->queue) | ||
58 | return NULL; | ||
59 | return list_next_entry(rq, queuelist); | ||
60 | } | ||
61 | |||
62 | static int noop_init_queue(struct request_queue *q, struct elevator_type *e) | ||
63 | { | ||
64 | struct noop_data *nd; | ||
65 | struct elevator_queue *eq; | ||
66 | |||
67 | eq = elevator_alloc(q, e); | ||
68 | if (!eq) | ||
69 | return -ENOMEM; | ||
70 | |||
71 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); | ||
72 | if (!nd) { | ||
73 | kobject_put(&eq->kobj); | ||
74 | return -ENOMEM; | ||
75 | } | ||
76 | eq->elevator_data = nd; | ||
77 | |||
78 | INIT_LIST_HEAD(&nd->queue); | ||
79 | |||
80 | spin_lock_irq(q->queue_lock); | ||
81 | q->elevator = eq; | ||
82 | spin_unlock_irq(q->queue_lock); | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | static void noop_exit_queue(struct elevator_queue *e) | ||
87 | { | ||
88 | struct noop_data *nd = e->elevator_data; | ||
89 | |||
90 | BUG_ON(!list_empty(&nd->queue)); | ||
91 | kfree(nd); | ||
92 | } | ||
93 | |||
94 | static struct elevator_type elevator_noop = { | ||
95 | .ops.sq = { | ||
96 | .elevator_merge_req_fn = noop_merged_requests, | ||
97 | .elevator_dispatch_fn = noop_dispatch, | ||
98 | .elevator_add_req_fn = noop_add_request, | ||
99 | .elevator_former_req_fn = noop_former_request, | ||
100 | .elevator_latter_req_fn = noop_latter_request, | ||
101 | .elevator_init_fn = noop_init_queue, | ||
102 | .elevator_exit_fn = noop_exit_queue, | ||
103 | }, | ||
104 | .elevator_name = "noop", | ||
105 | .elevator_owner = THIS_MODULE, | ||
106 | }; | ||
107 | |||
108 | static int __init noop_init(void) | ||
109 | { | ||
110 | return elv_register(&elevator_noop); | ||
111 | } | ||
112 | |||
113 | static void __exit noop_exit(void) | ||
114 | { | ||
115 | elv_unregister(&elevator_noop); | ||
116 | } | ||
117 | |||
118 | module_init(noop_init); | ||
119 | module_exit(noop_exit); | ||
120 | |||
121 | |||
122 | MODULE_AUTHOR("Jens Axboe"); | ||
123 | MODULE_LICENSE("GPL"); | ||
124 | MODULE_DESCRIPTION("No-op IO scheduler"); | ||
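
noop's role on blk-mq devices is covered by selecting no scheduler at all, which the "none" special case in __elevator_change() above handles. A small hedged userspace sketch of doing that through sysfs (the sda device name is only an example):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "w");

	if (!f) {
		perror("scheduler attribute");
		return 1;
	}
	fputs("none", f);	/* blk-mq: run without an I/O scheduler */
	return fclose(f) == 0 ? 0 : 1;
}
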
diff --git a/block/partition-generic.c b/block/partition-generic.c index d3d14e81fb12..8e596a8dff32 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c | |||
@@ -120,13 +120,9 @@ ssize_t part_stat_show(struct device *dev, | |||
120 | { | 120 | { |
121 | struct hd_struct *p = dev_to_part(dev); | 121 | struct hd_struct *p = dev_to_part(dev); |
122 | struct request_queue *q = part_to_disk(p)->queue; | 122 | struct request_queue *q = part_to_disk(p)->queue; |
123 | unsigned int inflight[2]; | 123 | unsigned int inflight; |
124 | int cpu; | ||
125 | 124 | ||
126 | cpu = part_stat_lock(); | 125 | inflight = part_in_flight(q, p); |
127 | part_round_stats(q, cpu, p); | ||
128 | part_stat_unlock(); | ||
129 | part_in_flight(q, p, inflight); | ||
130 | return sprintf(buf, | 126 | return sprintf(buf, |
131 | "%8lu %8lu %8llu %8u " | 127 | "%8lu %8lu %8llu %8u " |
132 | "%8lu %8lu %8llu %8u " | 128 | "%8lu %8lu %8llu %8u " |
@@ -141,7 +137,7 @@ ssize_t part_stat_show(struct device *dev, | |||
141 | part_stat_read(p, merges[STAT_WRITE]), | 137 | part_stat_read(p, merges[STAT_WRITE]), |
142 | (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), | 138 | (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), |
143 | (unsigned int)part_stat_read_msecs(p, STAT_WRITE), | 139 | (unsigned int)part_stat_read_msecs(p, STAT_WRITE), |
144 | inflight[0], | 140 | inflight, |
145 | jiffies_to_msecs(part_stat_read(p, io_ticks)), | 141 | jiffies_to_msecs(part_stat_read(p, io_ticks)), |
146 | jiffies_to_msecs(part_stat_read(p, time_in_queue)), | 142 | jiffies_to_msecs(part_stat_read(p, time_in_queue)), |
147 | part_stat_read(p, ios[STAT_DISCARD]), | 143 | part_stat_read(p, ios[STAT_DISCARD]), |
@@ -249,9 +245,10 @@ struct device_type part_type = { | |||
249 | .uevent = part_uevent, | 245 | .uevent = part_uevent, |
250 | }; | 246 | }; |
251 | 247 | ||
252 | static void delete_partition_rcu_cb(struct rcu_head *head) | 248 | static void delete_partition_work_fn(struct work_struct *work) |
253 | { | 249 | { |
254 | struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); | 250 | struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct, |
251 | rcu_work); | ||
255 | 252 | ||
256 | part->start_sect = 0; | 253 | part->start_sect = 0; |
257 | part->nr_sects = 0; | 254 | part->nr_sects = 0; |
@@ -262,7 +259,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head) | |||
262 | void __delete_partition(struct percpu_ref *ref) | 259 | void __delete_partition(struct percpu_ref *ref) |
263 | { | 260 | { |
264 | struct hd_struct *part = container_of(ref, struct hd_struct, ref); | 261 | struct hd_struct *part = container_of(ref, struct hd_struct, ref); |
265 | call_rcu(&part->rcu_head, delete_partition_rcu_cb); | 262 | INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn); |
263 | queue_rcu_work(system_wq, &part->rcu_work); | ||
266 | } | 264 | } |
267 | 265 | ||
268 | /* | 266 | /* |
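
The partition teardown switches from call_rcu() to the rcu_work machinery: queue_rcu_work() still waits for an RCU grace period, but then runs the handler from a workqueue, i.e. in process context where sleeping is allowed, unlike a plain call_rcu() callback. A hedged, self-contained sketch of the same pattern (demo_obj and the other demo_* names are illustrative):

#include <linux/workqueue.h>
#include <linux/slab.h>

struct demo_obj {
	struct rcu_work rcu_work;
	/* ... RCU-protected payload ... */
};

static void demo_free_fn(struct work_struct *work)
{
	struct demo_obj *obj =
		container_of(to_rcu_work(work), struct demo_obj, rcu_work);

	/* runs after a grace period, from a workqueue, so it may sleep */
	kfree(obj);
}

static void demo_release(struct demo_obj *obj)
{
	INIT_RCU_WORK(&obj->rcu_work, demo_free_fn);
	queue_rcu_work(system_wq, &obj->rcu_work);
}
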