Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig              |     6
-rw-r--r--  block/Kconfig.iosched      |    61
-rw-r--r--  block/Makefile             |     5
-rw-r--r--  block/bfq-cgroup.c         |     6
-rw-r--r--  block/bfq-iosched.c        |    97
-rw-r--r--  block/bfq-iosched.h        |    51
-rw-r--r--  block/bfq-wf2q.c           |     5
-rw-r--r--  block/bio-integrity.c      |     2
-rw-r--r--  block/bio.c                |   205
-rw-r--r--  block/blk-cgroup.c         |   272
-rw-r--r--  block/blk-core.c           |  2071
-rw-r--r--  block/blk-exec.c           |    20
-rw-r--r--  block/blk-flush.c          |   188
-rw-r--r--  block/blk-ioc.c            |    54
-rw-r--r--  block/blk-iolatency.c      |    75
-rw-r--r--  block/blk-lib.c            |    26
-rw-r--r--  block/blk-merge.c          |    60
-rw-r--r--  block/blk-mq-cpumap.c      |    19
-rw-r--r--  block/blk-mq-debugfs.c     |   147
-rw-r--r--  block/blk-mq-debugfs.h     |    17
-rw-r--r--  block/blk-mq-pci.c         |    10
-rw-r--r--  block/blk-mq-rdma.c        |     8
-rw-r--r--  block/blk-mq-sched.c       |    82
-rw-r--r--  block/blk-mq-sched.h       |    25
-rw-r--r--  block/blk-mq-sysfs.c       |    35
-rw-r--r--  block/blk-mq-tag.c         |    41
-rw-r--r--  block/blk-mq-virtio.c      |     8
-rw-r--r--  block/blk-mq.c             |   758
-rw-r--r--  block/blk-mq.h             |    70
-rw-r--r--  block/blk-pm.c             |    20
-rw-r--r--  block/blk-pm.h             |     6
-rw-r--r--  block/blk-rq-qos.c         |   154
-rw-r--r--  block/blk-rq-qos.h         |    96
-rw-r--r--  block/blk-settings.c       |    65
-rw-r--r--  block/blk-softirq.c        |    27
-rw-r--r--  block/blk-stat.c           |     4
-rw-r--r--  block/blk-stat.h           |     5
-rw-r--r--  block/blk-sysfs.c          |   107
-rw-r--r--  block/blk-tag.c            |   378
-rw-r--r--  block/blk-throttle.c       |    39
-rw-r--r--  block/blk-timeout.c        |   117
-rw-r--r--  block/blk-wbt.c            |   176
-rw-r--r--  block/blk-zoned.c          |     4
-rw-r--r--  block/blk.h                |   190
-rw-r--r--  block/bounce.c             |     4
-rw-r--r--  block/bsg-lib.c            |   146
-rw-r--r--  block/bsg.c                |     2
-rw-r--r--  block/cfq-iosched.c        |  4916
-rw-r--r--  block/deadline-iosched.c   |   560
-rw-r--r--  block/elevator.c           |   477
-rw-r--r--  block/genhd.c              |    63
-rw-r--r--  block/kyber-iosched.c      |    37
-rw-r--r--  block/mq-deadline.c        |    15
-rw-r--r--  block/noop-iosched.c       |   124
-rw-r--r--  block/partition-generic.c  |    18
55 files changed, 1946 insertions, 10228 deletions
diff --git a/block/Kconfig b/block/Kconfig
index f7045aa47edb..8044452a4fd3 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -155,12 +155,6 @@ config BLK_CGROUP_IOLATENCY
155 155
156 Note, this is an experimental interface and could be changed someday. 156 Note, this is an experimental interface and could be changed someday.
157 157
158config BLK_WBT_SQ
159 bool "Single queue writeback throttling"
160 depends on BLK_WBT
161 ---help---
162 Enable writeback throttling by default on legacy single queue devices
163
164config BLK_WBT_MQ 158config BLK_WBT_MQ
165 bool "Multiqueue writeback throttling" 159 bool "Multiqueue writeback throttling"
166 default y 160 default y
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f95a48b0d7b2..4626b88b2d5a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -3,67 +3,6 @@ if BLOCK
3 3
4menu "IO Schedulers" 4menu "IO Schedulers"
5 5
6config IOSCHED_NOOP
7 bool
8 default y
9 ---help---
10 The no-op I/O scheduler is a minimal scheduler that does basic merging
11 and sorting. Its main uses include non-disk based block devices like
12 memory devices, and specialised software or hardware environments
13 that do their own scheduling and require only minimal assistance from
14 the kernel.
15
16config IOSCHED_DEADLINE
17 tristate "Deadline I/O scheduler"
18 default y
19 ---help---
20 The deadline I/O scheduler is simple and compact. It will provide
21 CSCAN service with FIFO expiration of requests, switching to
22 a new point in the service tree and doing a batch of IO from there
23 in case of expiry.
24
25config IOSCHED_CFQ
26 tristate "CFQ I/O scheduler"
27 default y
28 ---help---
29 The CFQ I/O scheduler tries to distribute bandwidth equally
30 among all processes in the system. It should provide a fair
31 and low latency working environment, suitable for both desktop
32 and server systems.
33
34 This is the default I/O scheduler.
35
36config CFQ_GROUP_IOSCHED
37 bool "CFQ Group Scheduling support"
38 depends on IOSCHED_CFQ && BLK_CGROUP
39 ---help---
40 Enable group IO scheduling in CFQ.
41
42choice
43
44 prompt "Default I/O scheduler"
45 default DEFAULT_CFQ
46 help
47 Select the I/O scheduler which will be used by default for all
48 block devices.
49
50 config DEFAULT_DEADLINE
51 bool "Deadline" if IOSCHED_DEADLINE=y
52
53 config DEFAULT_CFQ
54 bool "CFQ" if IOSCHED_CFQ=y
55
56 config DEFAULT_NOOP
57 bool "No-op"
58
59endchoice
60
61config DEFAULT_IOSCHED
62 string
63 default "deadline" if DEFAULT_DEADLINE
64 default "cfq" if DEFAULT_CFQ
65 default "noop" if DEFAULT_NOOP
66
67config MQ_IOSCHED_DEADLINE 6config MQ_IOSCHED_DEADLINE
68 tristate "MQ deadline I/O scheduler" 7 tristate "MQ deadline I/O scheduler"
69 default y 8 default y
diff --git a/block/Makefile b/block/Makefile
index 27eac600474f..eee1b4ceecf9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,7 +3,7 @@
3# Makefile for the kernel block layer 3# Makefile for the kernel block layer
4# 4#
5 5
6obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 6obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
7 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 7 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
8 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 8 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
9 blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ 9 blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
18obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 18obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
19obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o 19obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
20obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o 20obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
21obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
22obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
23obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
24obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o 21obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
25obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o 22obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
26bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o 23bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fe5952d117d..c6113af31960 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -334,7 +334,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
334 334
335 parent = bfqg_parent(bfqg); 335 parent = bfqg_parent(bfqg);
336 336
337 lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); 337 lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock);
338 338
339 if (unlikely(!parent)) 339 if (unlikely(!parent))
340 return; 340 return;
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
642 uint64_t serial_nr; 642 uint64_t serial_nr;
643 643
644 rcu_read_lock(); 644 rcu_read_lock();
645 serial_nr = bio_blkcg(bio)->css.serial_nr; 645 serial_nr = __bio_blkcg(bio)->css.serial_nr;
646 646
647 /* 647 /*
648 * Check whether blkcg has changed. The condition may trigger 648 * Check whether blkcg has changed. The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
651 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) 651 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
652 goto out; 652 goto out;
653 653
654 bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); 654 bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
655 /* 655 /*
656 * Update blkg_path for bfq_log_* functions. We cache this 656 * Update blkg_path for bfq_log_* functions. We cache this
657 * path, and update it here, for the following 657 * path, and update it here, for the following
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3a27d31fcda6..cd307767a134 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -399,9 +399,9 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
399 unsigned long flags; 399 unsigned long flags;
400 struct bfq_io_cq *icq; 400 struct bfq_io_cq *icq;
401 401
402 spin_lock_irqsave(q->queue_lock, flags); 402 spin_lock_irqsave(&q->queue_lock, flags);
403 icq = icq_to_bic(ioc_lookup_icq(ioc, q)); 403 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
404 spin_unlock_irqrestore(q->queue_lock, flags); 404 spin_unlock_irqrestore(&q->queue_lock, flags);
405 405
406 return icq; 406 return icq;
407 } 407 }
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
638 bfqd->queue_weights_tree.rb_node->rb_right) 638 bfqd->queue_weights_tree.rb_node->rb_right)
639#ifdef CONFIG_BFQ_GROUP_IOSCHED 639#ifdef CONFIG_BFQ_GROUP_IOSCHED
640 ) || 640 ) ||
641 (bfqd->num_active_groups > 0 641 (bfqd->num_groups_with_pending_reqs > 0
642#endif 642#endif
643 ); 643 );
644} 644}
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
802 */ 802 */
803 break; 803 break;
804 } 804 }
805 bfqd->num_active_groups--; 805
806 /*
807 * The decrement of num_groups_with_pending_reqs is
808 * not performed immediately upon the deactivation of
809 * entity, but it is delayed to when it also happens
810 * that the first leaf descendant bfqq of entity gets
811 * all its pending requests completed. The following
812 * instructions perform this delayed decrement, if
813 * needed. See the comments on
814 * num_groups_with_pending_reqs for details.
815 */
816 if (entity->in_groups_with_pending_reqs) {
817 entity->in_groups_with_pending_reqs = false;
818 bfqd->num_groups_with_pending_reqs--;
819 }
806 } 820 }
807} 821}
808 822
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
3529 * fact, if there are active groups, then, for condition (i) 3543 * fact, if there are active groups, then, for condition (i)
3530 * to become false, it is enough that an active group contains 3544 * to become false, it is enough that an active group contains
3531 * more active processes or sub-groups than some other active 3545 * more active processes or sub-groups than some other active
3532 * group. We address this issue with the following bi-modal 3546 * group. More precisely, for condition (i) to hold because of
3533 * behavior, implemented in the function 3547 * such a group, it is not even necessary that the group is
3548 * (still) active: it is sufficient that, even if the group
3549 * has become inactive, some of its descendant processes still
3550 * have some request already dispatched but still waiting for
3551 * completion. In fact, requests have still to be guaranteed
3552 * their share of the throughput even after being
3553 * dispatched. In this respect, it is easy to show that, if a
3554 * group frequently becomes inactive while still having
3555 * in-flight requests, and if, when this happens, the group is
3556 * not considered in the calculation of whether the scenario
3557 * is asymmetric, then the group may fail to be guaranteed its
3558 * fair share of the throughput (basically because idling may
3559 * not be performed for the descendant processes of the group,
3560 * but it had to be). We address this issue with the
3561 * following bi-modal behavior, implemented in the function
3534 * bfq_symmetric_scenario(). 3562 * bfq_symmetric_scenario().
3535 * 3563 *
3536 * If there are active groups, then the scenario is tagged as 3564 * If there are groups with requests waiting for completion
3565 * (as commented above, some of these groups may even be
3566 * already inactive), then the scenario is tagged as
3537 * asymmetric, conservatively, without checking any of the 3567 * asymmetric, conservatively, without checking any of the
3538 * conditions (i) and (ii). So the device is idled for bfqq. 3568 * conditions (i) and (ii). So the device is idled for bfqq.
3539 * This behavior matches also the fact that groups are created 3569 * This behavior matches also the fact that groups are created
3540 * exactly if controlling I/O (to preserve bandwidth and 3570 * exactly if controlling I/O is a primary concern (to
3541 * latency guarantees) is a primary concern. 3571 * preserve bandwidth and latency guarantees).
3542 * 3572 *
3543 * On the opposite end, if there are no active groups, then 3573 * On the opposite end, if there are no groups with requests
3544 * only condition (i) is actually controlled, i.e., provided 3574 * waiting for completion, then only condition (i) is actually
3545 * that condition (i) holds, idling is not performed, 3575 * controlled, i.e., provided that condition (i) holds, idling
3546 * regardless of whether condition (ii) holds. In other words, 3576 * is not performed, regardless of whether condition (ii)
3547 * only if condition (i) does not hold, then idling is 3577 * holds. In other words, only if condition (i) does not hold,
3548 * allowed, and the device tends to be prevented from queueing 3578 * then idling is allowed, and the device tends to be
3549 * many requests, possibly of several processes. Since there 3579 * prevented from queueing many requests, possibly of several
3550 * are no active groups, then, to control condition (i) it is 3580 * processes. Since there are no groups with requests waiting
3551 * enough to check whether all active queues have the same 3581 * for completion, then, to control condition (i) it is enough
3552 * weight. 3582 * to check just whether all the queues with requests waiting
3583 * for completion also have the same weight.
3553 * 3584 *
3554 * Not checking condition (ii) evidently exposes bfqq to the 3585 * Not checking condition (ii) evidently exposes bfqq to the
3555 * risk of getting less throughput than its fair share. 3586 * risk of getting less throughput than its fair share.
@@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
3607 * bfqq is weight-raised is checked explicitly here. More 3638 * bfqq is weight-raised is checked explicitly here. More
3608 * precisely, the compound condition below takes into account 3639 * precisely, the compound condition below takes into account
3609 * also the fact that, even if bfqq is being weight-raised, 3640 * also the fact that, even if bfqq is being weight-raised,
3610 * the scenario is still symmetric if all active queues happen 3641 * the scenario is still symmetric if all queues with requests
3611 * to be weight-raised. Actually, we should be even more 3642 * waiting for completion happen to be
3612 * precise here, and differentiate between interactive weight 3643 * weight-raised. Actually, we should be even more precise
3613 * raising and soft real-time weight raising. 3644 * here, and differentiate between interactive weight raising
3645 * and soft real-time weight raising.
3614 * 3646 *
3615 * As a side note, it is worth considering that the above 3647 * As a side note, it is worth considering that the above
3616 * device-idling countermeasures may however fail in the 3648 * device-idling countermeasures may however fail in the
@@ -4034,7 +4066,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
4034 * In addition, the following queue lock guarantees that 4066 * In addition, the following queue lock guarantees that
4035 * bfqq_group(bfqq) exists as well. 4067 * bfqq_group(bfqq) exists as well.
4036 */ 4068 */
4037 spin_lock_irq(q->queue_lock); 4069 spin_lock_irq(&q->queue_lock);
4038 if (idle_timer_disabled) 4070 if (idle_timer_disabled)
4039 /* 4071 /*
4040 * Since the idle timer has been disabled, 4072 * Since the idle timer has been disabled,
@@ -4053,7 +4085,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
4053 bfqg_stats_set_start_empty_time(bfqg); 4085 bfqg_stats_set_start_empty_time(bfqg);
4054 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); 4086 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
4055 } 4087 }
4056 spin_unlock_irq(q->queue_lock); 4088 spin_unlock_irq(&q->queue_lock);
4057} 4089}
4058#else 4090#else
4059static inline void bfq_update_dispatch_stats(struct request_queue *q, 4091static inline void bfq_update_dispatch_stats(struct request_queue *q,
@@ -4384,7 +4416,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
4384 4416
4385 rcu_read_lock(); 4417 rcu_read_lock();
4386 4418
4387 bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); 4419 bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
4388 if (!bfqg) { 4420 if (!bfqg) {
4389 bfqq = &bfqd->oom_bfqq; 4421 bfqq = &bfqd->oom_bfqq;
4390 goto out; 4422 goto out;
@@ -4637,11 +4669,11 @@ static void bfq_update_insert_stats(struct request_queue *q,
4637 * In addition, the following queue lock guarantees that 4669 * In addition, the following queue lock guarantees that
4638 * bfqq_group(bfqq) exists as well. 4670 * bfqq_group(bfqq) exists as well.
4639 */ 4671 */
4640 spin_lock_irq(q->queue_lock); 4672 spin_lock_irq(&q->queue_lock);
4641 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); 4673 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
4642 if (idle_timer_disabled) 4674 if (idle_timer_disabled)
4643 bfqg_stats_update_idle_time(bfqq_group(bfqq)); 4675 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4644 spin_unlock_irq(q->queue_lock); 4676 spin_unlock_irq(&q->queue_lock);
4645} 4677}
4646#else 4678#else
4647static inline void bfq_update_insert_stats(struct request_queue *q, 4679static inline void bfq_update_insert_stats(struct request_queue *q,
@@ -5382,9 +5414,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
5382 } 5414 }
5383 eq->elevator_data = bfqd; 5415 eq->elevator_data = bfqd;
5384 5416
5385 spin_lock_irq(q->queue_lock); 5417 spin_lock_irq(&q->queue_lock);
5386 q->elevator = eq; 5418 q->elevator = eq;
5387 spin_unlock_irq(q->queue_lock); 5419 spin_unlock_irq(&q->queue_lock);
5388 5420
5389 /* 5421 /*
5390 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 5422 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
@@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
5417 bfqd->idle_slice_timer.function = bfq_idle_slice_timer; 5449 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
5418 5450
5419 bfqd->queue_weights_tree = RB_ROOT; 5451 bfqd->queue_weights_tree = RB_ROOT;
5420 bfqd->num_active_groups = 0; 5452 bfqd->num_groups_with_pending_reqs = 0;
5421 5453
5422 INIT_LIST_HEAD(&bfqd->active_list); 5454 INIT_LIST_HEAD(&bfqd->active_list);
5423 INIT_LIST_HEAD(&bfqd->idle_list); 5455 INIT_LIST_HEAD(&bfqd->idle_list);
@@ -5724,7 +5756,7 @@ static struct elv_fs_entry bfq_attrs[] = {
5724}; 5756};
5725 5757
5726static struct elevator_type iosched_bfq_mq = { 5758static struct elevator_type iosched_bfq_mq = {
5727 .ops.mq = { 5759 .ops = {
5728 .limit_depth = bfq_limit_depth, 5760 .limit_depth = bfq_limit_depth,
5729 .prepare_request = bfq_prepare_request, 5761 .prepare_request = bfq_prepare_request,
5730 .requeue_request = bfq_finish_requeue_request, 5762 .requeue_request = bfq_finish_requeue_request,
@@ -5745,7 +5777,6 @@ static struct elevator_type iosched_bfq_mq = {
5745 .exit_sched = bfq_exit_queue, 5777 .exit_sched = bfq_exit_queue,
5746 }, 5778 },
5747 5779
5748 .uses_mq = true,
5749 .icq_size = sizeof(struct bfq_io_cq), 5780 .icq_size = sizeof(struct bfq_io_cq),
5750 .icq_align = __alignof__(struct bfq_io_cq), 5781 .icq_align = __alignof__(struct bfq_io_cq),
5751 .elevator_attrs = bfq_attrs, 5782 .elevator_attrs = bfq_attrs,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 77651d817ecd..0b02bf302de0 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -196,6 +196,9 @@ struct bfq_entity {
196 196
197 /* flag, set to request a weight, ioprio or ioprio_class change */ 197 /* flag, set to request a weight, ioprio or ioprio_class change */
198 int prio_changed; 198 int prio_changed;
199
200 /* flag, set if the entity is counted in groups_with_pending_reqs */
201 bool in_groups_with_pending_reqs;
199}; 202};
200 203
201struct bfq_group; 204struct bfq_group;
@@ -448,10 +451,54 @@ struct bfq_data {
448 * bfq_weights_tree_[add|remove] for further details). 451 * bfq_weights_tree_[add|remove] for further details).
449 */ 452 */
450 struct rb_root queue_weights_tree; 453 struct rb_root queue_weights_tree;
454
451 /* 455 /*
452 * number of groups with requests still waiting for completion 456 * Number of groups with at least one descendant process that
457 * has at least one request waiting for completion. Note that
458 * this accounts for also requests already dispatched, but not
459 * yet completed. Therefore this number of groups may differ
460 * (be larger) than the number of active groups, as a group is
461 * considered active only if its corresponding entity has
462 * descendant queues with at least one request queued. This
463 * number is used to decide whether a scenario is symmetric.
464 * For a detailed explanation see comments on the computation
465 * of the variable asymmetric_scenario in the function
466 * bfq_better_to_idle().
467 *
468 * However, it is hard to compute this number exactly, for
469 * groups with multiple descendant processes. Consider a group
470 * that is inactive, i.e., that has no descendant process with
471 * pending I/O inside BFQ queues. Then suppose that
472 * num_groups_with_pending_reqs is still accounting for this
473 * group, because the group has descendant processes with some
474 * I/O request still in flight. num_groups_with_pending_reqs
475 * should be decremented when the in-flight request of the
476 * last descendant process is finally completed (assuming that
477 * nothing else has changed for the group in the meantime, in
478 * terms of composition of the group and active/inactive state of child
479 * groups and processes). To accomplish this, an additional
480 * pending-request counter must be added to entities, and must
481 * be updated correctly. To avoid this additional field and operations,
482 * we resort to the following tradeoff between simplicity and
483 * accuracy: for an inactive group that is still counted in
484 * num_groups_with_pending_reqs, we decrement
485 * num_groups_with_pending_reqs when the first descendant
486 * process of the group remains with no request waiting for
487 * completion.
488 *
489 * Even this simpler decrement strategy requires a little
490 * carefulness: to avoid multiple decrements, we flag a group,
491 * more precisely an entity representing a group, as still
492 * counted in num_groups_with_pending_reqs when it becomes
493 * inactive. Then, when the first descendant queue of the
494 * entity remains with no request waiting for completion,
495 * num_groups_with_pending_reqs is decremented, and this flag
496 * is reset. After this flag is reset for the entity,
497 * num_groups_with_pending_reqs won't be decremented any
498 * longer in case a new descendant queue of the entity remains
499 * with no request waiting for completion.
453 */ 500 */
454 unsigned int num_active_groups; 501 unsigned int num_groups_with_pending_reqs;
455 502
456 /* 503 /*
457 * Number of bfq_queues containing requests (including the 504 * Number of bfq_queues containing requests (including the
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4b0d5fb69160..63e0f12be7c9 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
1012 container_of(entity, struct bfq_group, entity); 1012 container_of(entity, struct bfq_group, entity);
1013 struct bfq_data *bfqd = bfqg->bfqd; 1013 struct bfq_data *bfqd = bfqg->bfqd;
1014 1014
1015 bfqd->num_active_groups++; 1015 if (!entity->in_groups_with_pending_reqs) {
1016 entity->in_groups_with_pending_reqs = true;
1017 bfqd->num_groups_with_pending_reqs++;
1018 }
1016 } 1019 }
1017#endif 1020#endif
1018 1021
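
The three bfq hunks above (bfq-iosched.c, bfq-iosched.h and bfq-wf2q.c) implement one pattern: a group is counted in num_groups_with_pending_reqs at most once, the entity carries an in_groups_with_pending_reqs flag, and the decrement is deferred until the group's last pending request completes. A minimal userspace sketch of that flag-guarded counter, with struct and field names borrowed from the patch but otherwise hypothetical, not the kernel code:

/* Illustrative sketch only: a group is counted at most once, and the
 * decrement is deferred to the completion of its last pending request. */
#include <stdbool.h>
#include <stdio.h>

struct bfq_entity_sketch {
	bool in_groups_with_pending_reqs;
};

struct bfqd_sketch {
	unsigned int num_groups_with_pending_reqs;
};

/* a queue of the group gets backlogged (cf. __bfq_activate_entity) */
static void group_becomes_pending(struct bfqd_sketch *bfqd,
				  struct bfq_entity_sketch *entity)
{
	if (!entity->in_groups_with_pending_reqs) {
		entity->in_groups_with_pending_reqs = true;
		bfqd->num_groups_with_pending_reqs++;
	}
}

/* the last pending request of the group completes
 * (cf. bfq_weights_tree_remove) */
static void group_last_req_completed(struct bfqd_sketch *bfqd,
				     struct bfq_entity_sketch *entity)
{
	if (entity->in_groups_with_pending_reqs) {
		entity->in_groups_with_pending_reqs = false;
		bfqd->num_groups_with_pending_reqs--;
	}
}

int main(void)
{
	struct bfqd_sketch bfqd = { 0 };
	struct bfq_entity_sketch g = { false };

	group_becomes_pending(&bfqd, &g);
	group_becomes_pending(&bfqd, &g);	/* counted only once */
	group_last_req_completed(&bfqd, &g);
	printf("groups with pending reqs: %u\n",
	       bfqd.num_groups_with_pending_reqs);	/* prints 0 */
	return 0;
}
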
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 290af497997b..1b633a3526d4 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -390,7 +390,6 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
390 bip->bip_iter.bi_sector += bytes_done >> 9; 390 bip->bip_iter.bi_sector += bytes_done >> 9;
391 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); 391 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
392} 392}
393EXPORT_SYMBOL(bio_integrity_advance);
394 393
395/** 394/**
396 * bio_integrity_trim - Trim integrity vector 395 * bio_integrity_trim - Trim integrity vector
@@ -460,7 +459,6 @@ void bioset_integrity_free(struct bio_set *bs)
460 mempool_exit(&bs->bio_integrity_pool); 459 mempool_exit(&bs->bio_integrity_pool);
461 mempool_exit(&bs->bvec_integrity_pool); 460 mempool_exit(&bs->bvec_integrity_pool);
462} 461}
463EXPORT_SYMBOL(bioset_integrity_free);
464 462
465void __init bio_integrity_init(void) 463void __init bio_integrity_init(void)
466{ 464{
diff --git a/block/bio.c b/block/bio.c
index d5368a445561..8281bfcbc265 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -244,7 +244,7 @@ fallback:
244 244
245void bio_uninit(struct bio *bio) 245void bio_uninit(struct bio *bio)
246{ 246{
247 bio_disassociate_task(bio); 247 bio_disassociate_blkg(bio);
248} 248}
249EXPORT_SYMBOL(bio_uninit); 249EXPORT_SYMBOL(bio_uninit);
250 250
@@ -571,14 +571,13 @@ void bio_put(struct bio *bio)
571} 571}
572EXPORT_SYMBOL(bio_put); 572EXPORT_SYMBOL(bio_put);
573 573
574inline int bio_phys_segments(struct request_queue *q, struct bio *bio) 574int bio_phys_segments(struct request_queue *q, struct bio *bio)
575{ 575{
576 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 576 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
577 blk_recount_segments(q, bio); 577 blk_recount_segments(q, bio);
578 578
579 return bio->bi_phys_segments; 579 return bio->bi_phys_segments;
580} 580}
581EXPORT_SYMBOL(bio_phys_segments);
582 581
583/** 582/**
584 * __bio_clone_fast - clone a bio that shares the original bio's biovec 583 * __bio_clone_fast - clone a bio that shares the original bio's biovec
@@ -605,11 +604,13 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
605 if (bio_flagged(bio_src, BIO_THROTTLED)) 604 if (bio_flagged(bio_src, BIO_THROTTLED))
606 bio_set_flag(bio, BIO_THROTTLED); 605 bio_set_flag(bio, BIO_THROTTLED);
607 bio->bi_opf = bio_src->bi_opf; 606 bio->bi_opf = bio_src->bi_opf;
607 bio->bi_ioprio = bio_src->bi_ioprio;
608 bio->bi_write_hint = bio_src->bi_write_hint; 608 bio->bi_write_hint = bio_src->bi_write_hint;
609 bio->bi_iter = bio_src->bi_iter; 609 bio->bi_iter = bio_src->bi_iter;
610 bio->bi_io_vec = bio_src->bi_io_vec; 610 bio->bi_io_vec = bio_src->bi_io_vec;
611 611
612 bio_clone_blkcg_association(bio, bio_src); 612 bio_clone_blkg_association(bio, bio_src);
613 blkcg_bio_issue_init(bio);
613} 614}
614EXPORT_SYMBOL(__bio_clone_fast); 615EXPORT_SYMBOL(__bio_clone_fast);
615 616
@@ -900,7 +901,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
900 901
901 return 0; 902 return 0;
902} 903}
903EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
904 904
905static void submit_bio_wait_endio(struct bio *bio) 905static void submit_bio_wait_endio(struct bio *bio)
906{ 906{
@@ -1260,6 +1260,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
1260 if (ret) 1260 if (ret)
1261 goto cleanup; 1261 goto cleanup;
1262 } else { 1262 } else {
1263 if (bmd->is_our_pages)
1264 zero_fill_bio(bio);
1263 iov_iter_advance(iter, bio->bi_iter.bi_size); 1265 iov_iter_advance(iter, bio->bi_iter.bi_size);
1264 } 1266 }
1265 1267
@@ -1589,7 +1591,6 @@ void bio_set_pages_dirty(struct bio *bio)
1589 set_page_dirty_lock(bvec->bv_page); 1591 set_page_dirty_lock(bvec->bv_page);
1590 } 1592 }
1591} 1593}
1592EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
1593 1594
1594static void bio_release_pages(struct bio *bio) 1595static void bio_release_pages(struct bio *bio)
1595{ 1596{
@@ -1659,17 +1660,33 @@ defer:
1659 spin_unlock_irqrestore(&bio_dirty_lock, flags); 1660 spin_unlock_irqrestore(&bio_dirty_lock, flags);
1660 schedule_work(&bio_dirty_work); 1661 schedule_work(&bio_dirty_work);
1661} 1662}
1662EXPORT_SYMBOL_GPL(bio_check_pages_dirty); 1663
1664void update_io_ticks(struct hd_struct *part, unsigned long now)
1665{
1666 unsigned long stamp;
1667again:
1668 stamp = READ_ONCE(part->stamp);
1669 if (unlikely(stamp != now)) {
1670 if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
1671 __part_stat_add(part, io_ticks, 1);
1672 }
1673 }
1674 if (part->partno) {
1675 part = &part_to_disk(part)->part0;
1676 goto again;
1677 }
1678}
1663 1679
1664void generic_start_io_acct(struct request_queue *q, int op, 1680void generic_start_io_acct(struct request_queue *q, int op,
1665 unsigned long sectors, struct hd_struct *part) 1681 unsigned long sectors, struct hd_struct *part)
1666{ 1682{
1667 const int sgrp = op_stat_group(op); 1683 const int sgrp = op_stat_group(op);
1668 int cpu = part_stat_lock();
1669 1684
1670 part_round_stats(q, cpu, part); 1685 part_stat_lock();
1671 part_stat_inc(cpu, part, ios[sgrp]); 1686
1672 part_stat_add(cpu, part, sectors[sgrp], sectors); 1687 update_io_ticks(part, jiffies);
1688 part_stat_inc(part, ios[sgrp]);
1689 part_stat_add(part, sectors[sgrp], sectors);
1673 part_inc_in_flight(q, part, op_is_write(op)); 1690 part_inc_in_flight(q, part, op_is_write(op));
1674 1691
1675 part_stat_unlock(); 1692 part_stat_unlock();
@@ -1679,12 +1696,15 @@ EXPORT_SYMBOL(generic_start_io_acct);
1679void generic_end_io_acct(struct request_queue *q, int req_op, 1696void generic_end_io_acct(struct request_queue *q, int req_op,
1680 struct hd_struct *part, unsigned long start_time) 1697 struct hd_struct *part, unsigned long start_time)
1681{ 1698{
1682 unsigned long duration = jiffies - start_time; 1699 unsigned long now = jiffies;
1700 unsigned long duration = now - start_time;
1683 const int sgrp = op_stat_group(req_op); 1701 const int sgrp = op_stat_group(req_op);
1684 int cpu = part_stat_lock();
1685 1702
1686 part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); 1703 part_stat_lock();
1687 part_round_stats(q, cpu, part); 1704
1705 update_io_ticks(part, now);
1706 part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
1707 part_stat_add(part, time_in_queue, duration);
1688 part_dec_in_flight(q, part, op_is_write(req_op)); 1708 part_dec_in_flight(q, part, op_is_write(req_op));
1689 1709
1690 part_stat_unlock(); 1710 part_stat_unlock();
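
The new update_io_ticks() helper above replaces the part_round_stats() calls in both accounting paths: io_ticks is advanced at most once per jiffy, and whichever caller wins a cmpxchg on the partition's stamp does the increment (the kernel version then repeats the step for the whole-disk part0, which this sketch omits). A compilable userspace analogue of that lockless pattern, using GCC atomic builtins and made-up names, looks roughly like:

/* Userspace analogue of the update_io_ticks() idea; not kernel API. */
#include <stdbool.h>
#include <stdio.h>

struct part_sketch {
	unsigned long stamp;	/* last jiffy for which io_ticks was bumped */
	unsigned long io_ticks;	/* number of jiffies that saw I/O */
};

static void update_io_ticks_sketch(struct part_sketch *part, unsigned long now)
{
	unsigned long stamp = __atomic_load_n(&part->stamp, __ATOMIC_RELAXED);

	/* only the caller that wins the cmpxchg for this jiffy increments */
	if (stamp != now &&
	    __atomic_compare_exchange_n(&part->stamp, &stamp, now, false,
					__ATOMIC_RELAXED, __ATOMIC_RELAXED))
		__atomic_fetch_add(&part->io_ticks, 1, __ATOMIC_RELAXED);
}

int main(void)
{
	struct part_sketch p = { 0, 0 };
	unsigned long jiffy;

	for (jiffy = 1; jiffy <= 3; jiffy++) {
		update_io_ticks_sketch(&p, jiffy);	/* wins the cmpxchg */
		update_io_ticks_sketch(&p, jiffy);	/* same jiffy: no-op */
	}
	printf("io_ticks = %lu\n", p.io_ticks);		/* prints 3 */
	return 0;
}
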
@@ -1954,102 +1974,133 @@ EXPORT_SYMBOL(bioset_init_from_src);
1954 1974
1955#ifdef CONFIG_BLK_CGROUP 1975#ifdef CONFIG_BLK_CGROUP
1956 1976
1957#ifdef CONFIG_MEMCG
1958/** 1977/**
1959 * bio_associate_blkcg_from_page - associate a bio with the page's blkcg 1978 * bio_disassociate_blkg - puts back the blkg reference if associated
1960 * @bio: target bio 1979 * @bio: target bio
1961 * @page: the page to lookup the blkcg from
1962 * 1980 *
1963 * Associate @bio with the blkcg from @page's owning memcg. This works like 1981 * Helper to disassociate the blkg from @bio if a blkg is associated.
1964 * every other associate function wrt references.
1965 */ 1982 */
1966int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) 1983void bio_disassociate_blkg(struct bio *bio)
1967{ 1984{
1968 struct cgroup_subsys_state *blkcg_css; 1985 if (bio->bi_blkg) {
1969 1986 blkg_put(bio->bi_blkg);
1970 if (unlikely(bio->bi_css)) 1987 bio->bi_blkg = NULL;
1971 return -EBUSY; 1988 }
1972 if (!page->mem_cgroup)
1973 return 0;
1974 blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
1975 &io_cgrp_subsys);
1976 bio->bi_css = blkcg_css;
1977 return 0;
1978} 1989}
1979#endif /* CONFIG_MEMCG */ 1990EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
1980 1991
1981/** 1992/**
1982 * bio_associate_blkcg - associate a bio with the specified blkcg 1993 * __bio_associate_blkg - associate a bio with the a blkg
1983 * @bio: target bio 1994 * @bio: target bio
1984 * @blkcg_css: css of the blkcg to associate 1995 * @blkg: the blkg to associate
1985 * 1996 *
1986 * Associate @bio with the blkcg specified by @blkcg_css. Block layer will 1997 * This tries to associate @bio with the specified @blkg. Association failure
1987 * treat @bio as if it were issued by a task which belongs to the blkcg. 1998 * is handled by walking up the blkg tree. Therefore, the blkg associated can
1999 * be anything between @blkg and the root_blkg. This situation only happens
2000 * when a cgroup is dying and then the remaining bios will spill to the closest
2001 * alive blkg.
1988 * 2002 *
1989 * This function takes an extra reference of @blkcg_css which will be put 2003 * A reference will be taken on the @blkg and will be released when @bio is
1990 * when @bio is released. The caller must own @bio and is responsible for 2004 * freed.
1991 * synchronizing calls to this function.
1992 */ 2005 */
1993int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) 2006static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
1994{ 2007{
1995 if (unlikely(bio->bi_css)) 2008 bio_disassociate_blkg(bio);
1996 return -EBUSY; 2009
1997 css_get(blkcg_css); 2010 bio->bi_blkg = blkg_tryget_closest(blkg);
1998 bio->bi_css = blkcg_css;
1999 return 0;
2000} 2011}
2001EXPORT_SYMBOL_GPL(bio_associate_blkcg);
2002 2012
2003/** 2013/**
2004 * bio_associate_blkg - associate a bio with the specified blkg 2014 * bio_associate_blkg_from_css - associate a bio with a specified css
2005 * @bio: target bio 2015 * @bio: target bio
2006 * @blkg: the blkg to associate 2016 * @css: target css
2007 * 2017 *
2008 * Associate @bio with the blkg specified by @blkg. This is the queue specific 2018 * Associate @bio with the blkg found by combining the css's blkg and the
2009 * blkcg information associated with the @bio, a reference will be taken on the 2019 * request_queue of the @bio. This falls back to the queue's root_blkg if
2010 * @blkg and will be freed when the bio is freed. 2020 * the association fails with the css.
2011 */ 2021 */
2012int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) 2022void bio_associate_blkg_from_css(struct bio *bio,
2023 struct cgroup_subsys_state *css)
2013{ 2024{
2014 if (unlikely(bio->bi_blkg)) 2025 struct request_queue *q = bio->bi_disk->queue;
2015 return -EBUSY; 2026 struct blkcg_gq *blkg;
2016 if (!blkg_try_get(blkg)) 2027
2017 return -ENODEV; 2028 rcu_read_lock();
2018 bio->bi_blkg = blkg; 2029
2019 return 0; 2030 if (!css || !css->parent)
2031 blkg = q->root_blkg;
2032 else
2033 blkg = blkg_lookup_create(css_to_blkcg(css), q);
2034
2035 __bio_associate_blkg(bio, blkg);
2036
2037 rcu_read_unlock();
2020} 2038}
2039EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
2021 2040
2041#ifdef CONFIG_MEMCG
2022/** 2042/**
2023 * bio_disassociate_task - undo bio_associate_current() 2043 * bio_associate_blkg_from_page - associate a bio with the page's blkg
2024 * @bio: target bio 2044 * @bio: target bio
2045 * @page: the page to lookup the blkcg from
2046 *
2047 * Associate @bio with the blkg from @page's owning memcg and the respective
2048 * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's
2049 * root_blkg.
2025 */ 2050 */
2026void bio_disassociate_task(struct bio *bio) 2051void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
2027{ 2052{
2028 if (bio->bi_ioc) { 2053 struct cgroup_subsys_state *css;
2029 put_io_context(bio->bi_ioc); 2054
2030 bio->bi_ioc = NULL; 2055 if (!page->mem_cgroup)
2031 } 2056 return;
2032 if (bio->bi_css) { 2057
2033 css_put(bio->bi_css); 2058 rcu_read_lock();
2034 bio->bi_css = NULL; 2059
2035 } 2060 css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
2036 if (bio->bi_blkg) { 2061 bio_associate_blkg_from_css(bio, css);
2037 blkg_put(bio->bi_blkg); 2062
2038 bio->bi_blkg = NULL; 2063 rcu_read_unlock();
2039 } 2064}
2065#endif /* CONFIG_MEMCG */
2066
2067/**
2068 * bio_associate_blkg - associate a bio with a blkg
2069 * @bio: target bio
2070 *
2071 * Associate @bio with the blkg found from the bio's css and request_queue.
2072 * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
2073 * already associated, the css is reused and association redone as the
2074 * request_queue may have changed.
2075 */
2076void bio_associate_blkg(struct bio *bio)
2077{
2078 struct cgroup_subsys_state *css;
2079
2080 rcu_read_lock();
2081
2082 if (bio->bi_blkg)
2083 css = &bio_blkcg(bio)->css;
2084 else
2085 css = blkcg_css();
2086
2087 bio_associate_blkg_from_css(bio, css);
2088
2089 rcu_read_unlock();
2040} 2090}
2091EXPORT_SYMBOL_GPL(bio_associate_blkg);
2041 2092
2042/** 2093/**
2043 * bio_clone_blkcg_association - clone blkcg association from src to dst bio 2094 * bio_clone_blkg_association - clone blkg association from src to dst bio
2044 * @dst: destination bio 2095 * @dst: destination bio
2045 * @src: source bio 2096 * @src: source bio
2046 */ 2097 */
2047void bio_clone_blkcg_association(struct bio *dst, struct bio *src) 2098void bio_clone_blkg_association(struct bio *dst, struct bio *src)
2048{ 2099{
2049 if (src->bi_css) 2100 if (src->bi_blkg)
2050 WARN_ON(bio_associate_blkcg(dst, src->bi_css)); 2101 __bio_associate_blkg(dst, src->bi_blkg);
2051} 2102}
2052EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); 2103EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
2053#endif /* CONFIG_BLK_CGROUP */ 2104#endif /* CONFIG_BLK_CGROUP */
2054 2105
2055static void __init biovec_init_slabs(void) 2106static void __init biovec_init_slabs(void)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c630e02836a8..c8cc1cbb6370 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -76,14 +76,42 @@ static void blkg_free(struct blkcg_gq *blkg)
76 if (blkg->pd[i]) 76 if (blkg->pd[i])
77 blkcg_policy[i]->pd_free_fn(blkg->pd[i]); 77 blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
78 78
79 if (blkg->blkcg != &blkcg_root)
80 blk_exit_rl(blkg->q, &blkg->rl);
81
82 blkg_rwstat_exit(&blkg->stat_ios); 79 blkg_rwstat_exit(&blkg->stat_ios);
83 blkg_rwstat_exit(&blkg->stat_bytes); 80 blkg_rwstat_exit(&blkg->stat_bytes);
84 kfree(blkg); 81 kfree(blkg);
85} 82}
86 83
84static void __blkg_release(struct rcu_head *rcu)
85{
86 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
87
88 percpu_ref_exit(&blkg->refcnt);
89
90 /* release the blkcg and parent blkg refs this blkg has been holding */
91 css_put(&blkg->blkcg->css);
92 if (blkg->parent)
93 blkg_put(blkg->parent);
94
95 wb_congested_put(blkg->wb_congested);
96
97 blkg_free(blkg);
98}
99
100/*
101 * A group is RCU protected, but having an rcu lock does not mean that one
102 * can access all the fields of blkg and assume these are valid. For
103 * example, don't try to follow throtl_data and request queue links.
104 *
105 * Having a reference to blkg under an rcu allows accesses to only values
106 * local to groups like group stats and group rate limits.
107 */
108static void blkg_release(struct percpu_ref *ref)
109{
110 struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
111
112 call_rcu(&blkg->rcu_head, __blkg_release);
113}
114
87/** 115/**
88 * blkg_alloc - allocate a blkg 116 * blkg_alloc - allocate a blkg
89 * @blkcg: block cgroup the new blkg is associated with 117 * @blkcg: block cgroup the new blkg is associated with
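
The hunk above moves blkg reference counting from an atomic_t to a percpu_ref whose release callback defers the real teardown through call_rcu(): __blkg_release() drops the css and parent references, puts the wb_congested ref, and only then frees the blkg. A much-simplified userspace sketch of that two-stage teardown, with a plain atomic refcount standing in for the percpu_ref and a direct call standing in for call_rcu(), might look like:

/* Simplified sketch: last reference drop triggers a release hook, and the
 * actual freeing is deferred to a second stage (call_rcu() in the kernel). */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct blkg_sketch {
	atomic_int refcnt;
	struct blkg_sketch *parent;
};

static void blkg_put_sketch(struct blkg_sketch *blkg);

/* second stage: runs after an RCU grace period in the kernel */
static void blkg_release_deferred(struct blkg_sketch *blkg)
{
	if (blkg->parent)
		blkg_put_sketch(blkg->parent);	/* drop the ref held on the parent */
	printf("freeing blkg %p\n", (void *)blkg);
	free(blkg);
}

/* first stage: refcount hits zero (the percpu_ref release hook) */
static void blkg_put_sketch(struct blkg_sketch *blkg)
{
	if (atomic_fetch_sub(&blkg->refcnt, 1) == 1)
		blkg_release_deferred(blkg);	/* kernel: call_rcu(&blkg->rcu_head, ...) */
}

int main(void)
{
	struct blkg_sketch *parent = calloc(1, sizeof(*parent));
	struct blkg_sketch *child = calloc(1, sizeof(*child));

	atomic_store(&parent->refcnt, 2);	/* one for us, one for the child */
	atomic_store(&child->refcnt, 1);
	child->parent = parent;

	blkg_put_sketch(parent);	/* our ref; the child still pins the parent */
	blkg_put_sketch(child);		/* frees the child, then the parent */
	return 0;
}
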
@@ -110,14 +138,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
110 blkg->q = q; 138 blkg->q = q;
111 INIT_LIST_HEAD(&blkg->q_node); 139 INIT_LIST_HEAD(&blkg->q_node);
112 blkg->blkcg = blkcg; 140 blkg->blkcg = blkcg;
113 atomic_set(&blkg->refcnt, 1);
114
115 /* root blkg uses @q->root_rl, init rl only for !root blkgs */
116 if (blkcg != &blkcg_root) {
117 if (blk_init_rl(&blkg->rl, q, gfp_mask))
118 goto err_free;
119 blkg->rl.blkg = blkg;
120 }
121 141
122 for (i = 0; i < BLKCG_MAX_POLS; i++) { 142 for (i = 0; i < BLKCG_MAX_POLS; i++) {
123 struct blkcg_policy *pol = blkcg_policy[i]; 143 struct blkcg_policy *pol = blkcg_policy[i];
@@ -157,7 +177,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
157 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); 177 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
158 if (blkg && blkg->q == q) { 178 if (blkg && blkg->q == q) {
159 if (update_hint) { 179 if (update_hint) {
160 lockdep_assert_held(q->queue_lock); 180 lockdep_assert_held(&q->queue_lock);
161 rcu_assign_pointer(blkcg->blkg_hint, blkg); 181 rcu_assign_pointer(blkcg->blkg_hint, blkg);
162 } 182 }
163 return blkg; 183 return blkg;
@@ -180,7 +200,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
180 int i, ret; 200 int i, ret;
181 201
182 WARN_ON_ONCE(!rcu_read_lock_held()); 202 WARN_ON_ONCE(!rcu_read_lock_held());
183 lockdep_assert_held(q->queue_lock); 203 lockdep_assert_held(&q->queue_lock);
204
205 /* request_queue is dying, do not create/recreate a blkg */
206 if (blk_queue_dying(q)) {
207 ret = -ENODEV;
208 goto err_free_blkg;
209 }
184 210
185 /* blkg holds a reference to blkcg */ 211 /* blkg holds a reference to blkcg */
186 if (!css_tryget_online(&blkcg->css)) { 212 if (!css_tryget_online(&blkcg->css)) {
@@ -217,6 +243,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
217 blkg_get(blkg->parent); 243 blkg_get(blkg->parent);
218 } 244 }
219 245
246 ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
247 GFP_NOWAIT | __GFP_NOWARN);
248 if (ret)
249 goto err_cancel_ref;
250
220 /* invoke per-policy init */ 251 /* invoke per-policy init */
221 for (i = 0; i < BLKCG_MAX_POLS; i++) { 252 for (i = 0; i < BLKCG_MAX_POLS; i++) {
222 struct blkcg_policy *pol = blkcg_policy[i]; 253 struct blkcg_policy *pol = blkcg_policy[i];
@@ -249,6 +280,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
249 blkg_put(blkg); 280 blkg_put(blkg);
250 return ERR_PTR(ret); 281 return ERR_PTR(ret);
251 282
283err_cancel_ref:
284 percpu_ref_exit(&blkg->refcnt);
252err_put_congested: 285err_put_congested:
253 wb_congested_put(wb_congested); 286 wb_congested_put(wb_congested);
254err_put_css: 287err_put_css:
@@ -259,7 +292,7 @@ err_free_blkg:
259} 292}
260 293
261/** 294/**
262 * blkg_lookup_create - lookup blkg, try to create one if not there 295 * __blkg_lookup_create - lookup blkg, try to create one if not there
263 * @blkcg: blkcg of interest 296 * @blkcg: blkcg of interest
264 * @q: request_queue of interest 297 * @q: request_queue of interest
265 * 298 *
@@ -268,24 +301,16 @@ err_free_blkg:
268 * that all non-root blkg's have access to the parent blkg. This function 301 * that all non-root blkg's have access to the parent blkg. This function
269 * should be called under RCU read lock and @q->queue_lock. 302 * should be called under RCU read lock and @q->queue_lock.
270 * 303 *
271 * Returns pointer to the looked up or created blkg on success, ERR_PTR() 304 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
272 * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not 305 * down from root.
273 * dead and bypassing, returns ERR_PTR(-EBUSY).
274 */ 306 */
275struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 307struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
276 struct request_queue *q) 308 struct request_queue *q)
277{ 309{
278 struct blkcg_gq *blkg; 310 struct blkcg_gq *blkg;
279 311
280 WARN_ON_ONCE(!rcu_read_lock_held()); 312 WARN_ON_ONCE(!rcu_read_lock_held());
281 lockdep_assert_held(q->queue_lock); 313 lockdep_assert_held(&q->queue_lock);
282
283 /*
284 * This could be the first entry point of blkcg implementation and
285 * we shouldn't allow anything to go through for a bypassing queue.
286 */
287 if (unlikely(blk_queue_bypass(q)))
288 return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
289 314
290 blkg = __blkg_lookup(blkcg, q, true); 315 blkg = __blkg_lookup(blkcg, q, true);
291 if (blkg) 316 if (blkg)
@@ -293,30 +318,64 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
293 318
294 /* 319 /*
295 * Create blkgs walking down from blkcg_root to @blkcg, so that all 320 * Create blkgs walking down from blkcg_root to @blkcg, so that all
296 * non-root blkgs have access to their parents. 321 * non-root blkgs have access to their parents. Returns the closest
322 * blkg to the intended blkg should blkg_create() fail.
297 */ 323 */
298 while (true) { 324 while (true) {
299 struct blkcg *pos = blkcg; 325 struct blkcg *pos = blkcg;
300 struct blkcg *parent = blkcg_parent(blkcg); 326 struct blkcg *parent = blkcg_parent(blkcg);
301 327 struct blkcg_gq *ret_blkg = q->root_blkg;
302 while (parent && !__blkg_lookup(parent, q, false)) { 328
329 while (parent) {
330 blkg = __blkg_lookup(parent, q, false);
331 if (blkg) {
332 /* remember closest blkg */
333 ret_blkg = blkg;
334 break;
335 }
303 pos = parent; 336 pos = parent;
304 parent = blkcg_parent(parent); 337 parent = blkcg_parent(parent);
305 } 338 }
306 339
307 blkg = blkg_create(pos, q, NULL); 340 blkg = blkg_create(pos, q, NULL);
308 if (pos == blkcg || IS_ERR(blkg)) 341 if (IS_ERR(blkg))
342 return ret_blkg;
343 if (pos == blkcg)
309 return blkg; 344 return blkg;
310 } 345 }
311} 346}
312 347
348/**
349 * blkg_lookup_create - find or create a blkg
350 * @blkcg: target block cgroup
351 * @q: target request_queue
352 *
353 * This looks up or creates the blkg representing the unique pair
354 * of the blkcg and the request_queue.
355 */
356struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
357 struct request_queue *q)
358{
359 struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
360
361 if (unlikely(!blkg)) {
362 unsigned long flags;
363
364 spin_lock_irqsave(&q->queue_lock, flags);
365 blkg = __blkg_lookup_create(blkcg, q);
366 spin_unlock_irqrestore(&q->queue_lock, flags);
367 }
368
369 return blkg;
370}
371
313static void blkg_destroy(struct blkcg_gq *blkg) 372static void blkg_destroy(struct blkcg_gq *blkg)
314{ 373{
315 struct blkcg *blkcg = blkg->blkcg; 374 struct blkcg *blkcg = blkg->blkcg;
316 struct blkcg_gq *parent = blkg->parent; 375 struct blkcg_gq *parent = blkg->parent;
317 int i; 376 int i;
318 377
319 lockdep_assert_held(blkg->q->queue_lock); 378 lockdep_assert_held(&blkg->q->queue_lock);
320 lockdep_assert_held(&blkcg->lock); 379 lockdep_assert_held(&blkcg->lock);
321 380
322 /* Something wrong if we are trying to remove same group twice */ 381 /* Something wrong if we are trying to remove same group twice */
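
__blkg_lookup_create() above changes the creation policy: instead of returning an error, it walks up the blkcg ancestry, remembers the closest existing blkg (defaulting to q->root_blkg), and returns that if blkg_create() fails, so a bio always ends up associated with some blkg. The gist of that "closest ancestor or root" lookup, in a hypothetical userspace form with made-up types:

/* Illustrative only: resolve to the nearest ancestor that has a blkg,
 * falling back to the root blkg. */
#include <stdio.h>
#include <stddef.h>

struct cg_sketch {
	const char *name;
	struct cg_sketch *parent;
	void *blkg;			/* NULL if this cgroup has no blkg yet */
};

static void *root_blkg = "root-blkg";

static void *closest_blkg(struct cg_sketch *cg)
{
	for (; cg; cg = cg->parent)
		if (cg->blkg)
			return cg->blkg;	/* closest ancestor with a blkg */
	return root_blkg;			/* last resort: the root blkg */
}

int main(void)
{
	struct cg_sketch root = { "root", NULL, root_blkg };
	struct cg_sketch mid  = { "mid",  &root, NULL };
	struct cg_sketch leaf = { "leaf", &mid,  NULL };

	printf("leaf resolves to %s\n", (char *)closest_blkg(&leaf)); /* root-blkg */
	mid.blkg = "mid-blkg";
	printf("leaf resolves to %s\n", (char *)closest_blkg(&leaf)); /* mid-blkg */
	return 0;
}
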
@@ -353,7 +412,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
353 * Put the reference taken at the time of creation so that when all 412 * Put the reference taken at the time of creation so that when all
354 * queues are gone, group can be destroyed. 413 * queues are gone, group can be destroyed.
355 */ 414 */
356 blkg_put(blkg); 415 percpu_ref_kill(&blkg->refcnt);
357} 416}
358 417
359/** 418/**
@@ -366,8 +425,7 @@ static void blkg_destroy_all(struct request_queue *q)
366{ 425{
367 struct blkcg_gq *blkg, *n; 426 struct blkcg_gq *blkg, *n;
368 427
369 lockdep_assert_held(q->queue_lock); 428 spin_lock_irq(&q->queue_lock);
370
371 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { 429 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
372 struct blkcg *blkcg = blkg->blkcg; 430 struct blkcg *blkcg = blkg->blkcg;
373 431
@@ -377,7 +435,7 @@ static void blkg_destroy_all(struct request_queue *q)
377 } 435 }
378 436
379 q->root_blkg = NULL; 437 q->root_blkg = NULL;
380 q->root_rl.blkg = NULL; 438 spin_unlock_irq(&q->queue_lock);
381} 439}
382 440
383/* 441/*
@@ -403,41 +461,6 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
403} 461}
404EXPORT_SYMBOL_GPL(__blkg_release_rcu); 462EXPORT_SYMBOL_GPL(__blkg_release_rcu);
405 463
406/*
407 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
408 * because the root blkg uses @q->root_rl instead of its own rl.
409 */
410struct request_list *__blk_queue_next_rl(struct request_list *rl,
411 struct request_queue *q)
412{
413 struct list_head *ent;
414 struct blkcg_gq *blkg;
415
416 /*
417 * Determine the current blkg list_head. The first entry is
418 * root_rl which is off @q->blkg_list and mapped to the head.
419 */
420 if (rl == &q->root_rl) {
421 ent = &q->blkg_list;
422 /* There are no more block groups, hence no request lists */
423 if (list_empty(ent))
424 return NULL;
425 } else {
426 blkg = container_of(rl, struct blkcg_gq, rl);
427 ent = &blkg->q_node;
428 }
429
430 /* walk to the next list_head, skip root blkcg */
431 ent = ent->next;
432 if (ent == &q->root_blkg->q_node)
433 ent = ent->next;
434 if (ent == &q->blkg_list)
435 return NULL;
436
437 blkg = container_of(ent, struct blkcg_gq, q_node);
438 return &blkg->rl;
439}
440
441static int blkcg_reset_stats(struct cgroup_subsys_state *css, 464static int blkcg_reset_stats(struct cgroup_subsys_state *css,
442 struct cftype *cftype, u64 val) 465 struct cftype *cftype, u64 val)
443{ 466{
@@ -477,7 +500,6 @@ const char *blkg_dev_name(struct blkcg_gq *blkg)
477 return dev_name(blkg->q->backing_dev_info->dev); 500 return dev_name(blkg->q->backing_dev_info->dev);
478 return NULL; 501 return NULL;
479} 502}
480EXPORT_SYMBOL_GPL(blkg_dev_name);
481 503
482/** 504/**
483 * blkcg_print_blkgs - helper for printing per-blkg data 505 * blkcg_print_blkgs - helper for printing per-blkg data
@@ -508,10 +530,10 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
508 530
509 rcu_read_lock(); 531 rcu_read_lock();
510 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { 532 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
511 spin_lock_irq(blkg->q->queue_lock); 533 spin_lock_irq(&blkg->q->queue_lock);
512 if (blkcg_policy_enabled(blkg->q, pol)) 534 if (blkcg_policy_enabled(blkg->q, pol))
513 total += prfill(sf, blkg->pd[pol->plid], data); 535 total += prfill(sf, blkg->pd[pol->plid], data);
514 spin_unlock_irq(blkg->q->queue_lock); 536 spin_unlock_irq(&blkg->q->queue_lock);
515 } 537 }
516 rcu_read_unlock(); 538 rcu_read_unlock();
517 539
@@ -709,7 +731,7 @@ u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
709 struct cgroup_subsys_state *pos_css; 731 struct cgroup_subsys_state *pos_css;
710 u64 sum = 0; 732 u64 sum = 0;
711 733
712 lockdep_assert_held(blkg->q->queue_lock); 734 lockdep_assert_held(&blkg->q->queue_lock);
713 735
714 rcu_read_lock(); 736 rcu_read_lock();
715 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { 737 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
@@ -752,7 +774,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
752 struct blkg_rwstat sum = { }; 774 struct blkg_rwstat sum = { };
753 int i; 775 int i;
754 776
755 lockdep_assert_held(blkg->q->queue_lock); 777 lockdep_assert_held(&blkg->q->queue_lock);
756 778
757 rcu_read_lock(); 779 rcu_read_lock();
758 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { 780 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
@@ -783,18 +805,10 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
783 struct request_queue *q) 805 struct request_queue *q)
784{ 806{
785 WARN_ON_ONCE(!rcu_read_lock_held()); 807 WARN_ON_ONCE(!rcu_read_lock_held());
786 lockdep_assert_held(q->queue_lock); 808 lockdep_assert_held(&q->queue_lock);
787 809
788 if (!blkcg_policy_enabled(q, pol)) 810 if (!blkcg_policy_enabled(q, pol))
789 return ERR_PTR(-EOPNOTSUPP); 811 return ERR_PTR(-EOPNOTSUPP);
790
791 /*
792 * This could be the first entry point of blkcg implementation and
793 * we shouldn't allow anything to go through for a bypassing queue.
794 */
795 if (unlikely(blk_queue_bypass(q)))
796 return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
797
798 return __blkg_lookup(blkcg, q, true /* update_hint */); 812 return __blkg_lookup(blkcg, q, true /* update_hint */);
799} 813}
800 814
@@ -812,7 +826,7 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
812 */ 826 */
813int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 827int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
814 char *input, struct blkg_conf_ctx *ctx) 828 char *input, struct blkg_conf_ctx *ctx)
815 __acquires(rcu) __acquires(disk->queue->queue_lock) 829 __acquires(rcu) __acquires(&disk->queue->queue_lock)
816{ 830{
817 struct gendisk *disk; 831 struct gendisk *disk;
818 struct request_queue *q; 832 struct request_queue *q;
@@ -840,7 +854,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
840 q = disk->queue; 854 q = disk->queue;
841 855
842 rcu_read_lock(); 856 rcu_read_lock();
843 spin_lock_irq(q->queue_lock); 857 spin_lock_irq(&q->queue_lock);
844 858
845 blkg = blkg_lookup_check(blkcg, pol, q); 859 blkg = blkg_lookup_check(blkcg, pol, q);
846 if (IS_ERR(blkg)) { 860 if (IS_ERR(blkg)) {
@@ -867,7 +881,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
867 } 881 }
868 882
869 /* Drop locks to do new blkg allocation with GFP_KERNEL. */ 883 /* Drop locks to do new blkg allocation with GFP_KERNEL. */
870 spin_unlock_irq(q->queue_lock); 884 spin_unlock_irq(&q->queue_lock);
871 rcu_read_unlock(); 885 rcu_read_unlock();
872 886
873 new_blkg = blkg_alloc(pos, q, GFP_KERNEL); 887 new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
@@ -877,7 +891,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
877 } 891 }
878 892
879 rcu_read_lock(); 893 rcu_read_lock();
880 spin_lock_irq(q->queue_lock); 894 spin_lock_irq(&q->queue_lock);
881 895
882 blkg = blkg_lookup_check(pos, pol, q); 896 blkg = blkg_lookup_check(pos, pol, q);
883 if (IS_ERR(blkg)) { 897 if (IS_ERR(blkg)) {
@@ -905,7 +919,7 @@ success:
905 return 0; 919 return 0;
906 920
907fail_unlock: 921fail_unlock:
908 spin_unlock_irq(q->queue_lock); 922 spin_unlock_irq(&q->queue_lock);
909 rcu_read_unlock(); 923 rcu_read_unlock();
910fail: 924fail:
911 put_disk_and_module(disk); 925 put_disk_and_module(disk);
@@ -921,7 +935,6 @@ fail:
921 } 935 }
922 return ret; 936 return ret;
923} 937}
924EXPORT_SYMBOL_GPL(blkg_conf_prep);
925 938
926/** 939/**
927 * blkg_conf_finish - finish up per-blkg config update 940 * blkg_conf_finish - finish up per-blkg config update
@@ -931,13 +944,12 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
931 * with blkg_conf_prep(). 944 * with blkg_conf_prep().
932 */ 945 */
933void blkg_conf_finish(struct blkg_conf_ctx *ctx) 946void blkg_conf_finish(struct blkg_conf_ctx *ctx)
934 __releases(ctx->disk->queue->queue_lock) __releases(rcu) 947 __releases(&ctx->disk->queue->queue_lock) __releases(rcu)
935{ 948{
936 spin_unlock_irq(ctx->disk->queue->queue_lock); 949 spin_unlock_irq(&ctx->disk->queue->queue_lock);
937 rcu_read_unlock(); 950 rcu_read_unlock();
938 put_disk_and_module(ctx->disk); 951 put_disk_and_module(ctx->disk);
939} 952}
940EXPORT_SYMBOL_GPL(blkg_conf_finish);
941 953
942static int blkcg_print_stat(struct seq_file *sf, void *v) 954static int blkcg_print_stat(struct seq_file *sf, void *v)
943{ 955{
@@ -967,7 +979,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
967 */ 979 */
968 off += scnprintf(buf+off, size-off, "%s ", dname); 980 off += scnprintf(buf+off, size-off, "%s ", dname);
969 981
970 spin_lock_irq(blkg->q->queue_lock); 982 spin_lock_irq(&blkg->q->queue_lock);
971 983
972 rwstat = blkg_rwstat_recursive_sum(blkg, NULL, 984 rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
973 offsetof(struct blkcg_gq, stat_bytes)); 985 offsetof(struct blkcg_gq, stat_bytes));
@@ -981,7 +993,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
981 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); 993 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
982 dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); 994 dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
983 995
984 spin_unlock_irq(blkg->q->queue_lock); 996 spin_unlock_irq(&blkg->q->queue_lock);
985 997
986 if (rbytes || wbytes || rios || wios) { 998 if (rbytes || wbytes || rios || wios) {
987 has_stats = true; 999 has_stats = true;
@@ -1102,9 +1114,9 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg)
1102 struct blkcg_gq, blkcg_node); 1114 struct blkcg_gq, blkcg_node);
1103 struct request_queue *q = blkg->q; 1115 struct request_queue *q = blkg->q;
1104 1116
1105 if (spin_trylock(q->queue_lock)) { 1117 if (spin_trylock(&q->queue_lock)) {
1106 blkg_destroy(blkg); 1118 blkg_destroy(blkg);
1107 spin_unlock(q->queue_lock); 1119 spin_unlock(&q->queue_lock);
1108 } else { 1120 } else {
1109 spin_unlock_irq(&blkcg->lock); 1121 spin_unlock_irq(&blkcg->lock);
1110 cpu_relax(); 1122 cpu_relax();
@@ -1225,36 +1237,31 @@ int blkcg_init_queue(struct request_queue *q)
1225 1237
1226 /* Make sure the root blkg exists. */ 1238 /* Make sure the root blkg exists. */
1227 rcu_read_lock(); 1239 rcu_read_lock();
1228 spin_lock_irq(q->queue_lock); 1240 spin_lock_irq(&q->queue_lock);
1229 blkg = blkg_create(&blkcg_root, q, new_blkg); 1241 blkg = blkg_create(&blkcg_root, q, new_blkg);
1230 if (IS_ERR(blkg)) 1242 if (IS_ERR(blkg))
1231 goto err_unlock; 1243 goto err_unlock;
1232 q->root_blkg = blkg; 1244 q->root_blkg = blkg;
1233 q->root_rl.blkg = blkg; 1245 spin_unlock_irq(&q->queue_lock);
1234 spin_unlock_irq(q->queue_lock);
1235 rcu_read_unlock(); 1246 rcu_read_unlock();
1236 1247
1237 if (preloaded) 1248 if (preloaded)
1238 radix_tree_preload_end(); 1249 radix_tree_preload_end();
1239 1250
1240 ret = blk_iolatency_init(q); 1251 ret = blk_iolatency_init(q);
1241 if (ret) { 1252 if (ret)
1242 spin_lock_irq(q->queue_lock); 1253 goto err_destroy_all;
1243 blkg_destroy_all(q);
1244 spin_unlock_irq(q->queue_lock);
1245 return ret;
1246 }
1247 1254
1248 ret = blk_throtl_init(q); 1255 ret = blk_throtl_init(q);
1249 if (ret) { 1256 if (ret)
1250 spin_lock_irq(q->queue_lock); 1257 goto err_destroy_all;
1251 blkg_destroy_all(q); 1258 return 0;
1252 spin_unlock_irq(q->queue_lock);
1253 }
1254 return ret;
1255 1259
1260err_destroy_all:
1261 blkg_destroy_all(q);
1262 return ret;
1256err_unlock: 1263err_unlock:
1257 spin_unlock_irq(q->queue_lock); 1264 spin_unlock_irq(&q->queue_lock);
1258 rcu_read_unlock(); 1265 rcu_read_unlock();
1259 if (preloaded) 1266 if (preloaded)
1260 radix_tree_preload_end(); 1267 radix_tree_preload_end();
@@ -1269,7 +1276,7 @@ err_unlock:
1269 */ 1276 */
1270void blkcg_drain_queue(struct request_queue *q) 1277void blkcg_drain_queue(struct request_queue *q)
1271{ 1278{
1272 lockdep_assert_held(q->queue_lock); 1279 lockdep_assert_held(&q->queue_lock);
1273 1280
1274 /* 1281 /*
1275 * @q could be exiting and already have destroyed all blkgs as 1282 * @q could be exiting and already have destroyed all blkgs as
@@ -1289,10 +1296,7 @@ void blkcg_drain_queue(struct request_queue *q)
1289 */ 1296 */
1290void blkcg_exit_queue(struct request_queue *q) 1297void blkcg_exit_queue(struct request_queue *q)
1291{ 1298{
1292 spin_lock_irq(q->queue_lock);
1293 blkg_destroy_all(q); 1299 blkg_destroy_all(q);
1294 spin_unlock_irq(q->queue_lock);
1295
1296 blk_throtl_exit(q); 1300 blk_throtl_exit(q);
1297} 1301}
1298 1302
@@ -1396,10 +1400,8 @@ int blkcg_activate_policy(struct request_queue *q,
1396 if (blkcg_policy_enabled(q, pol)) 1400 if (blkcg_policy_enabled(q, pol))
1397 return 0; 1401 return 0;
1398 1402
1399 if (q->mq_ops) 1403 if (queue_is_mq(q))
1400 blk_mq_freeze_queue(q); 1404 blk_mq_freeze_queue(q);
1401 else
1402 blk_queue_bypass_start(q);
1403pd_prealloc: 1405pd_prealloc:
1404 if (!pd_prealloc) { 1406 if (!pd_prealloc) {
1405 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); 1407 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1409,7 +1411,7 @@ pd_prealloc:
1409 } 1411 }
1410 } 1412 }
1411 1413
1412 spin_lock_irq(q->queue_lock); 1414 spin_lock_irq(&q->queue_lock);
1413 1415
1414 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1416 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1415 struct blkg_policy_data *pd; 1417 struct blkg_policy_data *pd;
@@ -1421,7 +1423,7 @@ pd_prealloc:
1421 if (!pd) 1423 if (!pd)
1422 swap(pd, pd_prealloc); 1424 swap(pd, pd_prealloc);
1423 if (!pd) { 1425 if (!pd) {
1424 spin_unlock_irq(q->queue_lock); 1426 spin_unlock_irq(&q->queue_lock);
1425 goto pd_prealloc; 1427 goto pd_prealloc;
1426 } 1428 }
1427 1429
@@ -1435,12 +1437,10 @@ pd_prealloc:
1435 __set_bit(pol->plid, q->blkcg_pols); 1437 __set_bit(pol->plid, q->blkcg_pols);
1436 ret = 0; 1438 ret = 0;
1437 1439
1438 spin_unlock_irq(q->queue_lock); 1440 spin_unlock_irq(&q->queue_lock);
1439out_bypass_end: 1441out_bypass_end:
1440 if (q->mq_ops) 1442 if (queue_is_mq(q))
1441 blk_mq_unfreeze_queue(q); 1443 blk_mq_unfreeze_queue(q);
1442 else
1443 blk_queue_bypass_end(q);
1444 if (pd_prealloc) 1444 if (pd_prealloc)
1445 pol->pd_free_fn(pd_prealloc); 1445 pol->pd_free_fn(pd_prealloc);
1446 return ret; 1446 return ret;
@@ -1463,12 +1463,10 @@ void blkcg_deactivate_policy(struct request_queue *q,
1463 if (!blkcg_policy_enabled(q, pol)) 1463 if (!blkcg_policy_enabled(q, pol))
1464 return; 1464 return;
1465 1465
1466 if (q->mq_ops) 1466 if (queue_is_mq(q))
1467 blk_mq_freeze_queue(q); 1467 blk_mq_freeze_queue(q);
1468 else
1469 blk_queue_bypass_start(q);
1470 1468
1471 spin_lock_irq(q->queue_lock); 1469 spin_lock_irq(&q->queue_lock);
1472 1470
1473 __clear_bit(pol->plid, q->blkcg_pols); 1471 __clear_bit(pol->plid, q->blkcg_pols);
1474 1472
@@ -1481,12 +1479,10 @@ void blkcg_deactivate_policy(struct request_queue *q,
1481 } 1479 }
1482 } 1480 }
1483 1481
1484 spin_unlock_irq(q->queue_lock); 1482 spin_unlock_irq(&q->queue_lock);
1485 1483
1486 if (q->mq_ops) 1484 if (queue_is_mq(q))
1487 blk_mq_unfreeze_queue(q); 1485 blk_mq_unfreeze_queue(q);
1488 else
1489 blk_queue_bypass_end(q);
1490} 1486}
1491EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); 1487EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1492 1488
@@ -1748,8 +1744,7 @@ void blkcg_maybe_throttle_current(void)
1748 blkg = blkg_lookup(blkcg, q); 1744 blkg = blkg_lookup(blkcg, q);
1749 if (!blkg) 1745 if (!blkg)
1750 goto out; 1746 goto out;
1751 blkg = blkg_try_get(blkg); 1747 if (!blkg_tryget(blkg))
1752 if (!blkg)
1753 goto out; 1748 goto out;
1754 rcu_read_unlock(); 1749 rcu_read_unlock();
1755 1750
@@ -1761,7 +1756,6 @@ out:
1761 rcu_read_unlock(); 1756 rcu_read_unlock();
1762 blk_put_queue(q); 1757 blk_put_queue(q);
1763} 1758}
1764EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
1765 1759
1766/** 1760/**
1767 * blkcg_schedule_throttle - this task needs to check for throttling 1761 * blkcg_schedule_throttle - this task needs to check for throttling
@@ -1795,7 +1789,6 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1795 current->use_memdelay = use_memdelay; 1789 current->use_memdelay = use_memdelay;
1796 set_notify_resume(current); 1790 set_notify_resume(current);
1797} 1791}
1798EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
1799 1792
1800/** 1793/**
1801 * blkcg_add_delay - add delay to this blkg 1794 * blkcg_add_delay - add delay to this blkg
@@ -1810,7 +1803,6 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1810 blkcg_scale_delay(blkg, now); 1803 blkcg_scale_delay(blkg, now);
1811 atomic64_add(delta, &blkg->delay_nsec); 1804 atomic64_add(delta, &blkg->delay_nsec);
1812} 1805}
1813EXPORT_SYMBOL_GPL(blkcg_add_delay);
1814 1806
1815module_param(blkcg_debug_stats, bool, 0644); 1807module_param(blkcg_debug_stats, bool, 0644);
1816MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); 1808MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
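The blk-cgroup changes above follow two themes: q->queue_lock is now a spinlock embedded in struct request_queue (so every user takes &q->queue_lock), and the legacy bypass path is replaced by freezing the queue through blk-mq. A minimal sketch of that pattern, mirroring blkcg_activate_policy(); the function name and the "update state" body are illustrative, not from the patch:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static void example_update_blkcg_state(struct request_queue *q)
{
	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);		/* replaces blk_queue_bypass_start() */

	spin_lock_irq(&q->queue_lock);		/* queue_lock is embedded, no longer a pointer */
	/* ... update per-queue blkcg state here ... */
	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);	/* replaces blk_queue_bypass_end() */
}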
diff --git a/block/blk-core.c b/block/blk-core.c
index ce12515f9b9b..c78042975737 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -58,11 +58,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
58DEFINE_IDA(blk_queue_ida); 58DEFINE_IDA(blk_queue_ida);
59 59
60/* 60/*
61 * For the allocated request tables
62 */
63struct kmem_cache *request_cachep;
64
65/*
66 * For queue allocation 61 * For queue allocation
67 */ 62 */
68struct kmem_cache *blk_requestq_cachep; 63struct kmem_cache *blk_requestq_cachep;
@@ -79,11 +74,7 @@ static struct workqueue_struct *kblockd_workqueue;
79 */ 74 */
80void blk_queue_flag_set(unsigned int flag, struct request_queue *q) 75void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
81{ 76{
82 unsigned long flags; 77 set_bit(flag, &q->queue_flags);
83
84 spin_lock_irqsave(q->queue_lock, flags);
85 queue_flag_set(flag, q);
86 spin_unlock_irqrestore(q->queue_lock, flags);
87} 78}
88EXPORT_SYMBOL(blk_queue_flag_set); 79EXPORT_SYMBOL(blk_queue_flag_set);
89 80
@@ -94,11 +85,7 @@ EXPORT_SYMBOL(blk_queue_flag_set);
94 */ 85 */
95void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) 86void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
96{ 87{
97 unsigned long flags; 88 clear_bit(flag, &q->queue_flags);
98
99 spin_lock_irqsave(q->queue_lock, flags);
100 queue_flag_clear(flag, q);
101 spin_unlock_irqrestore(q->queue_lock, flags);
102} 89}
103EXPORT_SYMBOL(blk_queue_flag_clear); 90EXPORT_SYMBOL(blk_queue_flag_clear);
104 91
@@ -112,85 +99,15 @@ EXPORT_SYMBOL(blk_queue_flag_clear);
112 */ 99 */
113bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) 100bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
114{ 101{
115 unsigned long flags; 102 return test_and_set_bit(flag, &q->queue_flags);
116 bool res;
117
118 spin_lock_irqsave(q->queue_lock, flags);
119 res = queue_flag_test_and_set(flag, q);
120 spin_unlock_irqrestore(q->queue_lock, flags);
121
122 return res;
123} 103}
124EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); 104EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
125 105
126/**
127 * blk_queue_flag_test_and_clear - atomically test and clear a queue flag
128 * @flag: flag to be cleared
129 * @q: request queue
130 *
131 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
132 * the flag was set.
133 */
134bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
135{
136 unsigned long flags;
137 bool res;
138
139 spin_lock_irqsave(q->queue_lock, flags);
140 res = queue_flag_test_and_clear(flag, q);
141 spin_unlock_irqrestore(q->queue_lock, flags);
142
143 return res;
144}
145EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
146
147static void blk_clear_congested(struct request_list *rl, int sync)
148{
149#ifdef CONFIG_CGROUP_WRITEBACK
150 clear_wb_congested(rl->blkg->wb_congested, sync);
151#else
152 /*
153 * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
154 * flip its congestion state for events on other blkcgs.
155 */
156 if (rl == &rl->q->root_rl)
157 clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
158#endif
159}
160
161static void blk_set_congested(struct request_list *rl, int sync)
162{
163#ifdef CONFIG_CGROUP_WRITEBACK
164 set_wb_congested(rl->blkg->wb_congested, sync);
165#else
166 /* see blk_clear_congested() */
167 if (rl == &rl->q->root_rl)
168 set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
169#endif
170}
171
172void blk_queue_congestion_threshold(struct request_queue *q)
173{
174 int nr;
175
176 nr = q->nr_requests - (q->nr_requests / 8) + 1;
177 if (nr > q->nr_requests)
178 nr = q->nr_requests;
179 q->nr_congestion_on = nr;
180
181 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
182 if (nr < 1)
183 nr = 1;
184 q->nr_congestion_off = nr;
185}
186
187void blk_rq_init(struct request_queue *q, struct request *rq) 106void blk_rq_init(struct request_queue *q, struct request *rq)
188{ 107{
189 memset(rq, 0, sizeof(*rq)); 108 memset(rq, 0, sizeof(*rq));
190 109
191 INIT_LIST_HEAD(&rq->queuelist); 110 INIT_LIST_HEAD(&rq->queuelist);
192 INIT_LIST_HEAD(&rq->timeout_list);
193 rq->cpu = -1;
194 rq->q = q; 111 rq->q = q;
195 rq->__sector = (sector_t) -1; 112 rq->__sector = (sector_t) -1;
196 INIT_HLIST_NODE(&rq->hash); 113 INIT_HLIST_NODE(&rq->hash);
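With the legacy request path gone, the queue-flag helpers above no longer need queue_lock; they are plain atomic bitops on q->queue_flags. A short sketch of how callers use them after this hunk (kernel context assumed; the chosen flags are only examples):

#include <linux/blkdev.h>

static void example_toggle_merge_flags(struct request_queue *q)
{
	/* Lock-free: these now compile down to set_bit()/clear_bit(). */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);

	if (blk_queue_noxmerges(q))		/* test helpers read the same bitmap */
		blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);

	/* test-and-set still returns the previous value, also without the lock */
	if (!blk_queue_flag_test_and_set(QUEUE_FLAG_NOMERGES, q))
		pr_debug("NOMERGES was not set before\n");
}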
@@ -256,10 +173,11 @@ static void print_req_error(struct request *req, blk_status_t status)
256 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) 173 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
257 return; 174 return;
258 175
259 printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", 176 printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n",
260 __func__, blk_errors[idx].name, req->rq_disk ? 177 __func__, blk_errors[idx].name,
261 req->rq_disk->disk_name : "?", 178 req->rq_disk ? req->rq_disk->disk_name : "?",
262 (unsigned long long)blk_rq_pos(req)); 179 (unsigned long long)blk_rq_pos(req),
180 req->cmd_flags);
263} 181}
264 182
265static void req_bio_endio(struct request *rq, struct bio *bio, 183static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -292,99 +210,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
292} 210}
293EXPORT_SYMBOL(blk_dump_rq_flags); 211EXPORT_SYMBOL(blk_dump_rq_flags);
294 212
295static void blk_delay_work(struct work_struct *work)
296{
297 struct request_queue *q;
298
299 q = container_of(work, struct request_queue, delay_work.work);
300 spin_lock_irq(q->queue_lock);
301 __blk_run_queue(q);
302 spin_unlock_irq(q->queue_lock);
303}
304
305/**
306 * blk_delay_queue - restart queueing after defined interval
307 * @q: The &struct request_queue in question
308 * @msecs: Delay in msecs
309 *
310 * Description:
311 * Sometimes queueing needs to be postponed for a little while, to allow
312 * resources to come back. This function will make sure that queueing is
313 * restarted around the specified time.
314 */
315void blk_delay_queue(struct request_queue *q, unsigned long msecs)
316{
317 lockdep_assert_held(q->queue_lock);
318 WARN_ON_ONCE(q->mq_ops);
319
320 if (likely(!blk_queue_dead(q)))
321 queue_delayed_work(kblockd_workqueue, &q->delay_work,
322 msecs_to_jiffies(msecs));
323}
324EXPORT_SYMBOL(blk_delay_queue);
325
326/**
327 * blk_start_queue_async - asynchronously restart a previously stopped queue
328 * @q: The &struct request_queue in question
329 *
330 * Description:
331 * blk_start_queue_async() will clear the stop flag on the queue, and
332 * ensure that the request_fn for the queue is run from an async
333 * context.
334 **/
335void blk_start_queue_async(struct request_queue *q)
336{
337 lockdep_assert_held(q->queue_lock);
338 WARN_ON_ONCE(q->mq_ops);
339
340 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
341 blk_run_queue_async(q);
342}
343EXPORT_SYMBOL(blk_start_queue_async);
344
345/**
346 * blk_start_queue - restart a previously stopped queue
347 * @q: The &struct request_queue in question
348 *
349 * Description:
350 * blk_start_queue() will clear the stop flag on the queue, and call
351 * the request_fn for the queue if it was in a stopped state when
352 * entered. Also see blk_stop_queue().
353 **/
354void blk_start_queue(struct request_queue *q)
355{
356 lockdep_assert_held(q->queue_lock);
357 WARN_ON_ONCE(q->mq_ops);
358
359 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
360 __blk_run_queue(q);
361}
362EXPORT_SYMBOL(blk_start_queue);
363
364/**
365 * blk_stop_queue - stop a queue
366 * @q: The &struct request_queue in question
367 *
368 * Description:
369 * The Linux block layer assumes that a block driver will consume all
370 * entries on the request queue when the request_fn strategy is called.
371 * Often this will not happen, because of hardware limitations (queue
372 * depth settings). If a device driver gets a 'queue full' response,
373 * or if it simply chooses not to queue more I/O at one point, it can
374 * call this function to prevent the request_fn from being called until
375 * the driver has signalled it's ready to go again. This happens by calling
376 * blk_start_queue() to restart queue operations.
377 **/
378void blk_stop_queue(struct request_queue *q)
379{
380 lockdep_assert_held(q->queue_lock);
381 WARN_ON_ONCE(q->mq_ops);
382
383 cancel_delayed_work(&q->delay_work);
384 queue_flag_set(QUEUE_FLAG_STOPPED, q);
385}
386EXPORT_SYMBOL(blk_stop_queue);
387
388/** 213/**
389 * blk_sync_queue - cancel any pending callbacks on a queue 214 * blk_sync_queue - cancel any pending callbacks on a queue
390 * @q: the queue 215 * @q: the queue
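The hunk above deletes blk_delay_queue(), blk_start_queue*() and blk_stop_queue(), which only made sense for ->request_fn drivers. On blk-mq the comparable "hold off dispatch, then resume" pattern is queue quiescing; a hedged sketch (the recovery step is a placeholder):

#include <linux/blk-mq.h>

static void example_pause_and_resume(struct request_queue *q)
{
	/*
	 * Stop new ->queue_rq() invocations and wait for ones already in
	 * flight - roughly the role blk_stop_queue() played for legacy queues.
	 */
	blk_mq_quiesce_queue(q);

	/* ... driver recovers whatever resource it ran out of ... */

	/* Re-enable dispatch, the counterpart of blk_start_queue(). */
	blk_mq_unquiesce_queue(q);
}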
@@ -408,15 +233,13 @@ void blk_sync_queue(struct request_queue *q)
408 del_timer_sync(&q->timeout); 233 del_timer_sync(&q->timeout);
409 cancel_work_sync(&q->timeout_work); 234 cancel_work_sync(&q->timeout_work);
410 235
411 if (q->mq_ops) { 236 if (queue_is_mq(q)) {
412 struct blk_mq_hw_ctx *hctx; 237 struct blk_mq_hw_ctx *hctx;
413 int i; 238 int i;
414 239
415 cancel_delayed_work_sync(&q->requeue_work); 240 cancel_delayed_work_sync(&q->requeue_work);
416 queue_for_each_hw_ctx(q, hctx, i) 241 queue_for_each_hw_ctx(q, hctx, i)
417 cancel_delayed_work_sync(&hctx->run_work); 242 cancel_delayed_work_sync(&hctx->run_work);
418 } else {
419 cancel_delayed_work_sync(&q->delay_work);
420 } 243 }
421} 244}
422EXPORT_SYMBOL(blk_sync_queue); 245EXPORT_SYMBOL(blk_sync_queue);
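blk_sync_queue() and the other converted sites test queue_is_mq(q) instead of q->mq_ops. The helper lives in include/linux/blkdev.h; treat the sketch below as an approximation of its definition after this series:

static inline bool queue_is_mq(struct request_queue *q)
{
	/* a request_queue is now either bio-based or blk-mq */
	return q->mq_ops;
}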
@@ -442,250 +265,12 @@ void blk_clear_pm_only(struct request_queue *q)
442} 265}
443EXPORT_SYMBOL_GPL(blk_clear_pm_only); 266EXPORT_SYMBOL_GPL(blk_clear_pm_only);
444 267
445/**
446 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
447 * @q: The queue to run
448 *
449 * Description:
450 * Invoke request handling on a queue if there are any pending requests.
451 * May be used to restart request handling after a request has completed.
452 * This variant runs the queue whether or not the queue has been
453 * stopped. Must be called with the queue lock held and interrupts
454 * disabled. See also @blk_run_queue.
455 */
456inline void __blk_run_queue_uncond(struct request_queue *q)
457{
458 lockdep_assert_held(q->queue_lock);
459 WARN_ON_ONCE(q->mq_ops);
460
461 if (unlikely(blk_queue_dead(q)))
462 return;
463
464 /*
465 * Some request_fn implementations, e.g. scsi_request_fn(), unlock
466 * the queue lock internally. As a result multiple threads may be
467 * running such a request function concurrently. Keep track of the
468 * number of active request_fn invocations such that blk_drain_queue()
469 * can wait until all these request_fn calls have finished.
470 */
471 q->request_fn_active++;
472 q->request_fn(q);
473 q->request_fn_active--;
474}
475EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
476
477/**
478 * __blk_run_queue - run a single device queue
479 * @q: The queue to run
480 *
481 * Description:
482 * See @blk_run_queue.
483 */
484void __blk_run_queue(struct request_queue *q)
485{
486 lockdep_assert_held(q->queue_lock);
487 WARN_ON_ONCE(q->mq_ops);
488
489 if (unlikely(blk_queue_stopped(q)))
490 return;
491
492 __blk_run_queue_uncond(q);
493}
494EXPORT_SYMBOL(__blk_run_queue);
495
496/**
497 * blk_run_queue_async - run a single device queue in workqueue context
498 * @q: The queue to run
499 *
500 * Description:
501 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
502 * of us.
503 *
504 * Note:
505 * Since it is not allowed to run q->delay_work after blk_cleanup_queue()
506 * has canceled q->delay_work, callers must hold the queue lock to avoid
507 * race conditions between blk_cleanup_queue() and blk_run_queue_async().
508 */
509void blk_run_queue_async(struct request_queue *q)
510{
511 lockdep_assert_held(q->queue_lock);
512 WARN_ON_ONCE(q->mq_ops);
513
514 if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
515 mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
516}
517EXPORT_SYMBOL(blk_run_queue_async);
518
519/**
520 * blk_run_queue - run a single device queue
521 * @q: The queue to run
522 *
523 * Description:
524 * Invoke request handling on this queue, if it has pending work to do.
525 * May be used to restart queueing when a request has completed.
526 */
527void blk_run_queue(struct request_queue *q)
528{
529 unsigned long flags;
530
531 WARN_ON_ONCE(q->mq_ops);
532
533 spin_lock_irqsave(q->queue_lock, flags);
534 __blk_run_queue(q);
535 spin_unlock_irqrestore(q->queue_lock, flags);
536}
537EXPORT_SYMBOL(blk_run_queue);
538
539void blk_put_queue(struct request_queue *q) 268void blk_put_queue(struct request_queue *q)
540{ 269{
541 kobject_put(&q->kobj); 270 kobject_put(&q->kobj);
542} 271}
543EXPORT_SYMBOL(blk_put_queue); 272EXPORT_SYMBOL(blk_put_queue);
544 273
545/**
546 * __blk_drain_queue - drain requests from request_queue
547 * @q: queue to drain
548 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
549 *
550 * Drain requests from @q. If @drain_all is set, all requests are drained.
551 * If not, only ELVPRIV requests are drained. The caller is responsible
552 * for ensuring that no new requests which need to be drained are queued.
553 */
554static void __blk_drain_queue(struct request_queue *q, bool drain_all)
555 __releases(q->queue_lock)
556 __acquires(q->queue_lock)
557{
558 int i;
559
560 lockdep_assert_held(q->queue_lock);
561 WARN_ON_ONCE(q->mq_ops);
562
563 while (true) {
564 bool drain = false;
565
566 /*
567 * The caller might be trying to drain @q before its
568 * elevator is initialized.
569 */
570 if (q->elevator)
571 elv_drain_elevator(q);
572
573 blkcg_drain_queue(q);
574
575 /*
576 * This function might be called on a queue which failed
577 * driver init after queue creation or is not yet fully
578 * active yet. Some drivers (e.g. fd and loop) get unhappy
579 * in such cases. Kick queue iff dispatch queue has
580 * something on it and @q has request_fn set.
581 */
582 if (!list_empty(&q->queue_head) && q->request_fn)
583 __blk_run_queue(q);
584
585 drain |= q->nr_rqs_elvpriv;
586 drain |= q->request_fn_active;
587
588 /*
589 * Unfortunately, requests are queued at and tracked from
590 * multiple places and there's no single counter which can
591 * be drained. Check all the queues and counters.
592 */
593 if (drain_all) {
594 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
595 drain |= !list_empty(&q->queue_head);
596 for (i = 0; i < 2; i++) {
597 drain |= q->nr_rqs[i];
598 drain |= q->in_flight[i];
599 if (fq)
600 drain |= !list_empty(&fq->flush_queue[i]);
601 }
602 }
603
604 if (!drain)
605 break;
606
607 spin_unlock_irq(q->queue_lock);
608
609 msleep(10);
610
611 spin_lock_irq(q->queue_lock);
612 }
613
614 /*
615 * With queue marked dead, any woken up waiter will fail the
616 * allocation path, so the wakeup chaining is lost and we're
617 * left with hung waiters. We need to wake up those waiters.
618 */
619 if (q->request_fn) {
620 struct request_list *rl;
621
622 blk_queue_for_each_rl(rl, q)
623 for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
624 wake_up_all(&rl->wait[i]);
625 }
626}
627
628void blk_drain_queue(struct request_queue *q)
629{
630 spin_lock_irq(q->queue_lock);
631 __blk_drain_queue(q, true);
632 spin_unlock_irq(q->queue_lock);
633}
634
635/**
636 * blk_queue_bypass_start - enter queue bypass mode
637 * @q: queue of interest
638 *
639 * In bypass mode, only the dispatch FIFO queue of @q is used. This
640 * function makes @q enter bypass mode and drains all requests which were
641 * throttled or issued before. On return, it's guaranteed that no request
642 * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
643 * inside queue or RCU read lock.
644 */
645void blk_queue_bypass_start(struct request_queue *q)
646{
647 WARN_ON_ONCE(q->mq_ops);
648
649 spin_lock_irq(q->queue_lock);
650 q->bypass_depth++;
651 queue_flag_set(QUEUE_FLAG_BYPASS, q);
652 spin_unlock_irq(q->queue_lock);
653
654 /*
655 * Queues start drained. Skip actual draining till init is
656 * complete. This avoids lengthy delays during queue init which
657 * can happen many times during boot.
658 */
659 if (blk_queue_init_done(q)) {
660 spin_lock_irq(q->queue_lock);
661 __blk_drain_queue(q, false);
662 spin_unlock_irq(q->queue_lock);
663
664 /* ensure blk_queue_bypass() is %true inside RCU read lock */
665 synchronize_rcu();
666 }
667}
668EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
669
670/**
671 * blk_queue_bypass_end - leave queue bypass mode
672 * @q: queue of interest
673 *
674 * Leave bypass mode and restore the normal queueing behavior.
675 *
676 * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
677 * this function is called for both blk-sq and blk-mq queues.
678 */
679void blk_queue_bypass_end(struct request_queue *q)
680{
681 spin_lock_irq(q->queue_lock);
682 if (!--q->bypass_depth)
683 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
684 WARN_ON_ONCE(q->bypass_depth < 0);
685 spin_unlock_irq(q->queue_lock);
686}
687EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
688
689void blk_set_queue_dying(struct request_queue *q) 274void blk_set_queue_dying(struct request_queue *q)
690{ 275{
691 blk_queue_flag_set(QUEUE_FLAG_DYING, q); 276 blk_queue_flag_set(QUEUE_FLAG_DYING, q);
@@ -697,20 +282,8 @@ void blk_set_queue_dying(struct request_queue *q)
697 */ 282 */
698 blk_freeze_queue_start(q); 283 blk_freeze_queue_start(q);
699 284
700 if (q->mq_ops) 285 if (queue_is_mq(q))
701 blk_mq_wake_waiters(q); 286 blk_mq_wake_waiters(q);
702 else {
703 struct request_list *rl;
704
705 spin_lock_irq(q->queue_lock);
706 blk_queue_for_each_rl(rl, q) {
707 if (rl->rq_pool) {
708 wake_up_all(&rl->wait[BLK_RW_SYNC]);
709 wake_up_all(&rl->wait[BLK_RW_ASYNC]);
710 }
711 }
712 spin_unlock_irq(q->queue_lock);
713 }
714 287
715 /* Make blk_queue_enter() reexamine the DYING flag. */ 288 /* Make blk_queue_enter() reexamine the DYING flag. */
716 wake_up_all(&q->mq_freeze_wq); 289 wake_up_all(&q->mq_freeze_wq);
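The block above removes __blk_run_queue(), __blk_drain_queue() and the bypass machinery. Their job, keeping requests out of the queue while per-queue state changes, is handled by the blk-mq freeze counter instead. A hedged sketch of the pattern callers now use:

#include <linux/blk-mq.h>

static void example_change_queue_state(struct request_queue *q)
{
	/*
	 * Wait for q_usage_counter to drain; no new requests can enter the
	 * queue until the matching unfreeze. This is what replaced
	 * blk_queue_bypass_start()/__blk_drain_queue().
	 */
	blk_mq_freeze_queue(q);

	/* ... safely modify scheduler, policy or limits here ... */

	blk_mq_unfreeze_queue(q);
}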
@@ -755,29 +328,13 @@ void blk_exit_queue(struct request_queue *q)
755 */ 328 */
756void blk_cleanup_queue(struct request_queue *q) 329void blk_cleanup_queue(struct request_queue *q)
757{ 330{
758 spinlock_t *lock = q->queue_lock;
759
760 /* mark @q DYING, no new request or merges will be allowed afterwards */ 331 /* mark @q DYING, no new request or merges will be allowed afterwards */
761 mutex_lock(&q->sysfs_lock); 332 mutex_lock(&q->sysfs_lock);
762 blk_set_queue_dying(q); 333 blk_set_queue_dying(q);
763 spin_lock_irq(lock);
764
765 /*
766 * A dying queue is permanently in bypass mode till released. Note
767 * that, unlike blk_queue_bypass_start(), we aren't performing
768 * synchronize_rcu() after entering bypass mode to avoid the delay
769 * as some drivers create and destroy a lot of queues while
770 * probing. This is still safe because blk_release_queue() will be
771 * called only after the queue refcnt drops to zero and nothing,
772 * RCU or not, would be traversing the queue by then.
773 */
774 q->bypass_depth++;
775 queue_flag_set(QUEUE_FLAG_BYPASS, q);
776 334
777 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 335 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
778 queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 336 blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
779 queue_flag_set(QUEUE_FLAG_DYING, q); 337 blk_queue_flag_set(QUEUE_FLAG_DYING, q);
780 spin_unlock_irq(lock);
781 mutex_unlock(&q->sysfs_lock); 338 mutex_unlock(&q->sysfs_lock);
782 339
783 /* 340 /*
@@ -788,9 +345,7 @@ void blk_cleanup_queue(struct request_queue *q)
788 345
789 rq_qos_exit(q); 346 rq_qos_exit(q);
790 347
791 spin_lock_irq(lock); 348 blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
792 queue_flag_set(QUEUE_FLAG_DEAD, q);
793 spin_unlock_irq(lock);
794 349
795 /* 350 /*
796 * make sure all in-progress dispatch are completed because 351 * make sure all in-progress dispatch are completed because
@@ -798,11 +353,10 @@ void blk_cleanup_queue(struct request_queue *q)
798 * dispatch may still be in-progress since we dispatch requests 353 * dispatch may still be in-progress since we dispatch requests
799 * from more than one contexts. 354 * from more than one contexts.
800 * 355 *
801 * No need to quiesce queue if it isn't initialized yet since 356 * We rely on driver to deal with the race in case that queue
802 * blk_freeze_queue() should be enough for cases of passthrough 357 * initialization isn't done.
803 * request.
804 */ 358 */
805 if (q->mq_ops && blk_queue_init_done(q)) 359 if (queue_is_mq(q) && blk_queue_init_done(q))
806 blk_mq_quiesce_queue(q); 360 blk_mq_quiesce_queue(q);
807 361
808 /* for synchronous bio-based driver finish in-flight integrity i/o */ 362 /* for synchronous bio-based driver finish in-flight integrity i/o */
@@ -820,98 +374,19 @@ void blk_cleanup_queue(struct request_queue *q)
820 374
821 blk_exit_queue(q); 375 blk_exit_queue(q);
822 376
823 if (q->mq_ops) 377 if (queue_is_mq(q))
824 blk_mq_free_queue(q); 378 blk_mq_free_queue(q);
825 percpu_ref_exit(&q->q_usage_counter);
826 379
827 spin_lock_irq(lock); 380 percpu_ref_exit(&q->q_usage_counter);
828 if (q->queue_lock != &q->__queue_lock)
829 q->queue_lock = &q->__queue_lock;
830 spin_unlock_irq(lock);
831 381
832 /* @q is and will stay empty, shutdown and put */ 382 /* @q is and will stay empty, shutdown and put */
833 blk_put_queue(q); 383 blk_put_queue(q);
834} 384}
835EXPORT_SYMBOL(blk_cleanup_queue); 385EXPORT_SYMBOL(blk_cleanup_queue);
836 386
837/* Allocate memory local to the request queue */
838static void *alloc_request_simple(gfp_t gfp_mask, void *data)
839{
840 struct request_queue *q = data;
841
842 return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
843}
844
845static void free_request_simple(void *element, void *data)
846{
847 kmem_cache_free(request_cachep, element);
848}
849
850static void *alloc_request_size(gfp_t gfp_mask, void *data)
851{
852 struct request_queue *q = data;
853 struct request *rq;
854
855 rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
856 q->node);
857 if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
858 kfree(rq);
859 rq = NULL;
860 }
861 return rq;
862}
863
864static void free_request_size(void *element, void *data)
865{
866 struct request_queue *q = data;
867
868 if (q->exit_rq_fn)
869 q->exit_rq_fn(q, element);
870 kfree(element);
871}
872
873int blk_init_rl(struct request_list *rl, struct request_queue *q,
874 gfp_t gfp_mask)
875{
876 if (unlikely(rl->rq_pool) || q->mq_ops)
877 return 0;
878
879 rl->q = q;
880 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
881 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
882 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
883 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
884
885 if (q->cmd_size) {
886 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
887 alloc_request_size, free_request_size,
888 q, gfp_mask, q->node);
889 } else {
890 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
891 alloc_request_simple, free_request_simple,
892 q, gfp_mask, q->node);
893 }
894 if (!rl->rq_pool)
895 return -ENOMEM;
896
897 if (rl != &q->root_rl)
898 WARN_ON_ONCE(!blk_get_queue(q));
899
900 return 0;
901}
902
903void blk_exit_rl(struct request_queue *q, struct request_list *rl)
904{
905 if (rl->rq_pool) {
906 mempool_destroy(rl->rq_pool);
907 if (rl != &q->root_rl)
908 blk_put_queue(q);
909 }
910}
911
912struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 387struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
913{ 388{
914 return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL); 389 return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
915} 390}
916EXPORT_SYMBOL(blk_alloc_queue); 391EXPORT_SYMBOL(blk_alloc_queue);
917 392
@@ -991,17 +466,8 @@ static void blk_rq_timed_out_timer(struct timer_list *t)
991 * blk_alloc_queue_node - allocate a request queue 466 * blk_alloc_queue_node - allocate a request queue
992 * @gfp_mask: memory allocation flags 467 * @gfp_mask: memory allocation flags
993 * @node_id: NUMA node to allocate memory from 468 * @node_id: NUMA node to allocate memory from
994 * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
995 * serialize calls to the legacy .request_fn() callback. Ignored for
996 * blk-mq request queues.
997 *
998 * Note: pass the queue lock as the third argument to this function instead of
999 * setting the queue lock pointer explicitly to avoid triggering a sporadic
1000 * crash in the blkcg code. This function namely calls blkcg_init_queue() and
1001 * the queue lock pointer must be set before blkcg_init_queue() is called.
1002 */ 469 */
1003struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, 470struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1004 spinlock_t *lock)
1005{ 471{
1006 struct request_queue *q; 472 struct request_queue *q;
1007 int ret; 473 int ret;
@@ -1013,8 +479,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1013 479
1014 INIT_LIST_HEAD(&q->queue_head); 480 INIT_LIST_HEAD(&q->queue_head);
1015 q->last_merge = NULL; 481 q->last_merge = NULL;
1016 q->end_sector = 0;
1017 q->boundary_rq = NULL;
1018 482
1019 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 483 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
1020 if (q->id < 0) 484 if (q->id < 0)
@@ -1042,12 +506,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1042 laptop_mode_timer_fn, 0); 506 laptop_mode_timer_fn, 0);
1043 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); 507 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
1044 INIT_WORK(&q->timeout_work, NULL); 508 INIT_WORK(&q->timeout_work, NULL);
1045 INIT_LIST_HEAD(&q->timeout_list);
1046 INIT_LIST_HEAD(&q->icq_list); 509 INIT_LIST_HEAD(&q->icq_list);
1047#ifdef CONFIG_BLK_CGROUP 510#ifdef CONFIG_BLK_CGROUP
1048 INIT_LIST_HEAD(&q->blkg_list); 511 INIT_LIST_HEAD(&q->blkg_list);
1049#endif 512#endif
1050 INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
1051 513
1052 kobject_init(&q->kobj, &blk_queue_ktype); 514 kobject_init(&q->kobj, &blk_queue_ktype);
1053 515
@@ -1055,18 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1055 mutex_init(&q->blk_trace_mutex); 517 mutex_init(&q->blk_trace_mutex);
1056#endif 518#endif
1057 mutex_init(&q->sysfs_lock); 519 mutex_init(&q->sysfs_lock);
1058 spin_lock_init(&q->__queue_lock); 520 spin_lock_init(&q->queue_lock);
1059
1060 q->queue_lock = lock ? : &q->__queue_lock;
1061
1062 /*
1063 * A queue starts its life with bypass turned on to avoid
1064 * unnecessary bypass on/off overhead and nasty surprises during
1065 * init. The initial bypass will be finished when the queue is
1066 * registered by blk_register_queue().
1067 */
1068 q->bypass_depth = 1;
1069 queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
1070 521
1071 init_waitqueue_head(&q->mq_freeze_wq); 522 init_waitqueue_head(&q->mq_freeze_wq);
1072 523
@@ -1100,105 +551,6 @@ fail_q:
1100} 551}
1101EXPORT_SYMBOL(blk_alloc_queue_node); 552EXPORT_SYMBOL(blk_alloc_queue_node);
1102 553
1103/**
1104 * blk_init_queue - prepare a request queue for use with a block device
1105 * @rfn: The function to be called to process requests that have been
1106 * placed on the queue.
1107 * @lock: Request queue spin lock
1108 *
1109 * Description:
1110 * If a block device wishes to use the standard request handling procedures,
1111 * which sorts requests and coalesces adjacent requests, then it must
1112 * call blk_init_queue(). The function @rfn will be called when there
1113 * are requests on the queue that need to be processed. If the device
1114 * supports plugging, then @rfn may not be called immediately when requests
1115 * are available on the queue, but may be called at some time later instead.
1116 * Plugged queues are generally unplugged when a buffer belonging to one
1117 * of the requests on the queue is needed, or due to memory pressure.
1118 *
1119 * @rfn is not required, or even expected, to remove all requests off the
1120 * queue, but only as many as it can handle at a time. If it does leave
1121 * requests on the queue, it is responsible for arranging that the requests
1122 * get dealt with eventually.
1123 *
1124 * The queue spin lock must be held while manipulating the requests on the
1125 * request queue; this lock will be taken also from interrupt context, so irq
1126 * disabling is needed for it.
1127 *
1128 * Function returns a pointer to the initialized request queue, or %NULL if
1129 * it didn't succeed.
1130 *
1131 * Note:
1132 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1133 * when the block device is deactivated (such as at module unload).
1134 **/
1135
1136struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1137{
1138 return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
1139}
1140EXPORT_SYMBOL(blk_init_queue);
1141
1142struct request_queue *
1143blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1144{
1145 struct request_queue *q;
1146
1147 q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
1148 if (!q)
1149 return NULL;
1150
1151 q->request_fn = rfn;
1152 if (blk_init_allocated_queue(q) < 0) {
1153 blk_cleanup_queue(q);
1154 return NULL;
1155 }
1156
1157 return q;
1158}
1159EXPORT_SYMBOL(blk_init_queue_node);
1160
1161static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
1162
1163
1164int blk_init_allocated_queue(struct request_queue *q)
1165{
1166 WARN_ON_ONCE(q->mq_ops);
1167
1168 q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL);
1169 if (!q->fq)
1170 return -ENOMEM;
1171
1172 if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
1173 goto out_free_flush_queue;
1174
1175 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
1176 goto out_exit_flush_rq;
1177
1178 INIT_WORK(&q->timeout_work, blk_timeout_work);
1179 q->queue_flags |= QUEUE_FLAG_DEFAULT;
1180
1181 /*
1182 * This also sets hw/phys segments, boundary and size
1183 */
1184 blk_queue_make_request(q, blk_queue_bio);
1185
1186 q->sg_reserved_size = INT_MAX;
1187
1188 if (elevator_init(q))
1189 goto out_exit_flush_rq;
1190 return 0;
1191
1192out_exit_flush_rq:
1193 if (q->exit_rq_fn)
1194 q->exit_rq_fn(q, q->fq->flush_rq);
1195out_free_flush_queue:
1196 blk_free_flush_queue(q->fq);
1197 q->fq = NULL;
1198 return -ENOMEM;
1199}
1200EXPORT_SYMBOL(blk_init_allocated_queue);
1201
1202bool blk_get_queue(struct request_queue *q) 554bool blk_get_queue(struct request_queue *q)
1203{ 555{
1204 if (likely(!blk_queue_dying(q))) { 556 if (likely(!blk_queue_dying(q))) {
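blk_alloc_queue_node() loses its spinlock_t argument because the lock is now part of the queue itself, and blk_init_queue()/blk_init_allocated_queue() disappear with the ->request_fn path. A sketch of the two allocation styles that remain; the function names are illustrative and a real driver fills in its tag set and make_request handler:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* bio-based driver: allocate a bare queue, then attach a make_request fn */
static struct request_queue *example_alloc_bio_queue(int node)
{
	struct request_queue *q;

	q = blk_alloc_queue_node(GFP_KERNEL, node);	/* no spinlock_t * any more */
	if (!q)
		return NULL;
	/* blk_queue_make_request(q, my_make_request); */
	return q;
}

/* request-based driver: go through a blk-mq tag set instead of blk_init_queue() */
static struct request_queue *example_alloc_mq_queue(struct blk_mq_tag_set *set)
{
	return blk_mq_init_queue(set);	/* may return ERR_PTR() */
}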
@@ -1210,406 +562,6 @@ bool blk_get_queue(struct request_queue *q)
1210} 562}
1211EXPORT_SYMBOL(blk_get_queue); 563EXPORT_SYMBOL(blk_get_queue);
1212 564
1213static inline void blk_free_request(struct request_list *rl, struct request *rq)
1214{
1215 if (rq->rq_flags & RQF_ELVPRIV) {
1216 elv_put_request(rl->q, rq);
1217 if (rq->elv.icq)
1218 put_io_context(rq->elv.icq->ioc);
1219 }
1220
1221 mempool_free(rq, rl->rq_pool);
1222}
1223
1224/*
1225 * ioc_batching returns true if the ioc is a valid batching request and
1226 * should be given priority access to a request.
1227 */
1228static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
1229{
1230 if (!ioc)
1231 return 0;
1232
1233 /*
1234 * Make sure the process is able to allocate at least 1 request
1235 * even if the batch times out, otherwise we could theoretically
1236 * lose wakeups.
1237 */
1238 return ioc->nr_batch_requests == q->nr_batching ||
1239 (ioc->nr_batch_requests > 0
1240 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1241}
1242
1243/*
1244 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1245 * will cause the process to be a "batcher" on all queues in the system. This
1246 * is the behaviour we want though - once it gets a wakeup it should be given
1247 * a nice run.
1248 */
1249static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
1250{
1251 if (!ioc || ioc_batching(q, ioc))
1252 return;
1253
1254 ioc->nr_batch_requests = q->nr_batching;
1255 ioc->last_waited = jiffies;
1256}
1257
1258static void __freed_request(struct request_list *rl, int sync)
1259{
1260 struct request_queue *q = rl->q;
1261
1262 if (rl->count[sync] < queue_congestion_off_threshold(q))
1263 blk_clear_congested(rl, sync);
1264
1265 if (rl->count[sync] + 1 <= q->nr_requests) {
1266 if (waitqueue_active(&rl->wait[sync]))
1267 wake_up(&rl->wait[sync]);
1268
1269 blk_clear_rl_full(rl, sync);
1270 }
1271}
1272
1273/*
1274 * A request has just been released. Account for it, update the full and
1275 * congestion status, wake up any waiters. Called under q->queue_lock.
1276 */
1277static void freed_request(struct request_list *rl, bool sync,
1278 req_flags_t rq_flags)
1279{
1280 struct request_queue *q = rl->q;
1281
1282 q->nr_rqs[sync]--;
1283 rl->count[sync]--;
1284 if (rq_flags & RQF_ELVPRIV)
1285 q->nr_rqs_elvpriv--;
1286
1287 __freed_request(rl, sync);
1288
1289 if (unlikely(rl->starved[sync ^ 1]))
1290 __freed_request(rl, sync ^ 1);
1291}
1292
1293int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
1294{
1295 struct request_list *rl;
1296 int on_thresh, off_thresh;
1297
1298 WARN_ON_ONCE(q->mq_ops);
1299
1300 spin_lock_irq(q->queue_lock);
1301 q->nr_requests = nr;
1302 blk_queue_congestion_threshold(q);
1303 on_thresh = queue_congestion_on_threshold(q);
1304 off_thresh = queue_congestion_off_threshold(q);
1305
1306 blk_queue_for_each_rl(rl, q) {
1307 if (rl->count[BLK_RW_SYNC] >= on_thresh)
1308 blk_set_congested(rl, BLK_RW_SYNC);
1309 else if (rl->count[BLK_RW_SYNC] < off_thresh)
1310 blk_clear_congested(rl, BLK_RW_SYNC);
1311
1312 if (rl->count[BLK_RW_ASYNC] >= on_thresh)
1313 blk_set_congested(rl, BLK_RW_ASYNC);
1314 else if (rl->count[BLK_RW_ASYNC] < off_thresh)
1315 blk_clear_congested(rl, BLK_RW_ASYNC);
1316
1317 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
1318 blk_set_rl_full(rl, BLK_RW_SYNC);
1319 } else {
1320 blk_clear_rl_full(rl, BLK_RW_SYNC);
1321 wake_up(&rl->wait[BLK_RW_SYNC]);
1322 }
1323
1324 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
1325 blk_set_rl_full(rl, BLK_RW_ASYNC);
1326 } else {
1327 blk_clear_rl_full(rl, BLK_RW_ASYNC);
1328 wake_up(&rl->wait[BLK_RW_ASYNC]);
1329 }
1330 }
1331
1332 spin_unlock_irq(q->queue_lock);
1333 return 0;
1334}
1335
1336/**
1337 * __get_request - get a free request
1338 * @rl: request list to allocate from
1339 * @op: operation and flags
1340 * @bio: bio to allocate request for (can be %NULL)
1341 * @flags: BLQ_MQ_REQ_* flags
1342 * @gfp_mask: allocator flags
1343 *
1344 * Get a free request from @q. This function may fail under memory
1345 * pressure or if @q is dead.
1346 *
1347 * Must be called with @q->queue_lock held and,
1348 * Returns ERR_PTR on failure, with @q->queue_lock held.
1349 * Returns request pointer on success, with @q->queue_lock *not held*.
1350 */
1351static struct request *__get_request(struct request_list *rl, unsigned int op,
1352 struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
1353{
1354 struct request_queue *q = rl->q;
1355 struct request *rq;
1356 struct elevator_type *et = q->elevator->type;
1357 struct io_context *ioc = rq_ioc(bio);
1358 struct io_cq *icq = NULL;
1359 const bool is_sync = op_is_sync(op);
1360 int may_queue;
1361 req_flags_t rq_flags = RQF_ALLOCED;
1362
1363 lockdep_assert_held(q->queue_lock);
1364
1365 if (unlikely(blk_queue_dying(q)))
1366 return ERR_PTR(-ENODEV);
1367
1368 may_queue = elv_may_queue(q, op);
1369 if (may_queue == ELV_MQUEUE_NO)
1370 goto rq_starved;
1371
1372 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
1373 if (rl->count[is_sync]+1 >= q->nr_requests) {
1374 /*
1375 * The queue will fill after this allocation, so set
1376 * it as full, and mark this process as "batching".
1377 * This process will be allowed to complete a batch of
1378 * requests, others will be blocked.
1379 */
1380 if (!blk_rl_full(rl, is_sync)) {
1381 ioc_set_batching(q, ioc);
1382 blk_set_rl_full(rl, is_sync);
1383 } else {
1384 if (may_queue != ELV_MQUEUE_MUST
1385 && !ioc_batching(q, ioc)) {
1386 /*
1387 * The queue is full and the allocating
1388 * process is not a "batcher", and not
1389 * exempted by the IO scheduler
1390 */
1391 return ERR_PTR(-ENOMEM);
1392 }
1393 }
1394 }
1395 blk_set_congested(rl, is_sync);
1396 }
1397
1398 /*
1399 * Only allow batching queuers to allocate up to 50% over the defined
1400 * limit of requests, otherwise we could have thousands of requests
1401 * allocated with any setting of ->nr_requests
1402 */
1403 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
1404 return ERR_PTR(-ENOMEM);
1405
1406 q->nr_rqs[is_sync]++;
1407 rl->count[is_sync]++;
1408 rl->starved[is_sync] = 0;
1409
1410 /*
1411 * Decide whether the new request will be managed by elevator. If
1412 * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will
1413 * prevent the current elevator from being destroyed until the new
1414 * request is freed. This guarantees icq's won't be destroyed and
1415 * makes creating new ones safe.
1416 *
1417 * Flush requests do not use the elevator so skip initialization.
1418 * This allows a request to share the flush and elevator data.
1419 *
1420 * Also, lookup icq while holding queue_lock. If it doesn't exist,
1421 * it will be created after releasing queue_lock.
1422 */
1423 if (!op_is_flush(op) && !blk_queue_bypass(q)) {
1424 rq_flags |= RQF_ELVPRIV;
1425 q->nr_rqs_elvpriv++;
1426 if (et->icq_cache && ioc)
1427 icq = ioc_lookup_icq(ioc, q);
1428 }
1429
1430 if (blk_queue_io_stat(q))
1431 rq_flags |= RQF_IO_STAT;
1432 spin_unlock_irq(q->queue_lock);
1433
1434 /* allocate and init request */
1435 rq = mempool_alloc(rl->rq_pool, gfp_mask);
1436 if (!rq)
1437 goto fail_alloc;
1438
1439 blk_rq_init(q, rq);
1440 blk_rq_set_rl(rq, rl);
1441 rq->cmd_flags = op;
1442 rq->rq_flags = rq_flags;
1443 if (flags & BLK_MQ_REQ_PREEMPT)
1444 rq->rq_flags |= RQF_PREEMPT;
1445
1446 /* init elvpriv */
1447 if (rq_flags & RQF_ELVPRIV) {
1448 if (unlikely(et->icq_cache && !icq)) {
1449 if (ioc)
1450 icq = ioc_create_icq(ioc, q, gfp_mask);
1451 if (!icq)
1452 goto fail_elvpriv;
1453 }
1454
1455 rq->elv.icq = icq;
1456 if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
1457 goto fail_elvpriv;
1458
1459 /* @rq->elv.icq holds io_context until @rq is freed */
1460 if (icq)
1461 get_io_context(icq->ioc);
1462 }
1463out:
1464 /*
1465 * ioc may be NULL here, and ioc_batching will be false. That's
1466 * OK, if the queue is under the request limit then requests need
1467 * not count toward the nr_batch_requests limit. There will always
1468 * be some limit enforced by BLK_BATCH_TIME.
1469 */
1470 if (ioc_batching(q, ioc))
1471 ioc->nr_batch_requests--;
1472
1473 trace_block_getrq(q, bio, op);
1474 return rq;
1475
1476fail_elvpriv:
1477 /*
1478 * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
1479 * and may fail indefinitely under memory pressure and thus
1480 * shouldn't stall IO. Treat this request as !elvpriv. This will
1481 * disturb iosched and blkcg but weird is better than dead.
1482 */
1483 printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
1484 __func__, dev_name(q->backing_dev_info->dev));
1485
1486 rq->rq_flags &= ~RQF_ELVPRIV;
1487 rq->elv.icq = NULL;
1488
1489 spin_lock_irq(q->queue_lock);
1490 q->nr_rqs_elvpriv--;
1491 spin_unlock_irq(q->queue_lock);
1492 goto out;
1493
1494fail_alloc:
1495 /*
1496 * Allocation failed presumably due to memory. Undo anything we
1497 * might have messed up.
1498 *
1499 * Allocating task should really be put onto the front of the wait
1500 * queue, but this is pretty rare.
1501 */
1502 spin_lock_irq(q->queue_lock);
1503 freed_request(rl, is_sync, rq_flags);
1504
1505 /*
1506 * in the very unlikely event that allocation failed and no
1507 * requests for this direction was pending, mark us starved so that
1508 * freeing of a request in the other direction will notice
1509 * us. another possible fix would be to split the rq mempool into
1510 * READ and WRITE
1511 */
1512rq_starved:
1513 if (unlikely(rl->count[is_sync] == 0))
1514 rl->starved[is_sync] = 1;
1515 return ERR_PTR(-ENOMEM);
1516}
1517
1518/**
1519 * get_request - get a free request
1520 * @q: request_queue to allocate request from
1521 * @op: operation and flags
1522 * @bio: bio to allocate request for (can be %NULL)
1523 * @flags: BLK_MQ_REQ_* flags.
1524 * @gfp: allocator flags
1525 *
1526 * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags,
1527 * this function keeps retrying under memory pressure and fails iff @q is dead.
1528 *
1529 * Must be called with @q->queue_lock held and,
1530 * Returns ERR_PTR on failure, with @q->queue_lock held.
1531 * Returns request pointer on success, with @q->queue_lock *not held*.
1532 */
1533static struct request *get_request(struct request_queue *q, unsigned int op,
1534 struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
1535{
1536 const bool is_sync = op_is_sync(op);
1537 DEFINE_WAIT(wait);
1538 struct request_list *rl;
1539 struct request *rq;
1540
1541 lockdep_assert_held(q->queue_lock);
1542 WARN_ON_ONCE(q->mq_ops);
1543
1544 rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1545retry:
1546 rq = __get_request(rl, op, bio, flags, gfp);
1547 if (!IS_ERR(rq))
1548 return rq;
1549
1550 if (op & REQ_NOWAIT) {
1551 blk_put_rl(rl);
1552 return ERR_PTR(-EAGAIN);
1553 }
1554
1555 if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
1556 blk_put_rl(rl);
1557 return rq;
1558 }
1559
1560 /* wait on @rl and retry */
1561 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1562 TASK_UNINTERRUPTIBLE);
1563
1564 trace_block_sleeprq(q, bio, op);
1565
1566 spin_unlock_irq(q->queue_lock);
1567 io_schedule();
1568
1569 /*
1570 * After sleeping, we become a "batching" process and will be able
1571 * to allocate at least one request, and up to a big batch of them
1572 * for a small period time. See ioc_batching, ioc_set_batching
1573 */
1574 ioc_set_batching(q, current->io_context);
1575
1576 spin_lock_irq(q->queue_lock);
1577 finish_wait(&rl->wait[is_sync], &wait);
1578
1579 goto retry;
1580}
1581
1582/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
1583static struct request *blk_old_get_request(struct request_queue *q,
1584 unsigned int op, blk_mq_req_flags_t flags)
1585{
1586 struct request *rq;
1587 gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
1588 int ret = 0;
1589
1590 WARN_ON_ONCE(q->mq_ops);
1591
1592 /* create ioc upfront */
1593 create_io_context(gfp_mask, q->node);
1594
1595 ret = blk_queue_enter(q, flags);
1596 if (ret)
1597 return ERR_PTR(ret);
1598 spin_lock_irq(q->queue_lock);
1599 rq = get_request(q, op, NULL, flags, gfp_mask);
1600 if (IS_ERR(rq)) {
1601 spin_unlock_irq(q->queue_lock);
1602 blk_queue_exit(q);
1603 return rq;
1604 }
1605
1606 /* q->queue_lock is unlocked at this point */
1607 rq->__data_len = 0;
1608 rq->__sector = (sector_t) -1;
1609 rq->bio = rq->biotail = NULL;
1610 return rq;
1611}
1612
1613/** 565/**
1614 * blk_get_request - allocate a request 566 * blk_get_request - allocate a request
1615 * @q: request queue to allocate a request for 567 * @q: request queue to allocate a request for
@@ -1624,170 +576,17 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op,
1624 WARN_ON_ONCE(op & REQ_NOWAIT); 576 WARN_ON_ONCE(op & REQ_NOWAIT);
1625 WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); 577 WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
1626 578
1627 if (q->mq_ops) { 579 req = blk_mq_alloc_request(q, op, flags);
1628 req = blk_mq_alloc_request(q, op, flags); 580 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
1629 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) 581 q->mq_ops->initialize_rq_fn(req);
1630 q->mq_ops->initialize_rq_fn(req);
1631 } else {
1632 req = blk_old_get_request(q, op, flags);
1633 if (!IS_ERR(req) && q->initialize_rq_fn)
1634 q->initialize_rq_fn(req);
1635 }
1636 582
1637 return req; 583 return req;
1638} 584}
1639EXPORT_SYMBOL(blk_get_request); 585EXPORT_SYMBOL(blk_get_request);
1640 586
1641/**
1642 * blk_requeue_request - put a request back on queue
1643 * @q: request queue where request should be inserted
1644 * @rq: request to be inserted
1645 *
1646 * Description:
1647 * Drivers often keep queueing requests until the hardware cannot accept
1648 * more, when that condition happens we need to put the request back
1649 * on the queue. Must be called with queue lock held.
1650 */
1651void blk_requeue_request(struct request_queue *q, struct request *rq)
1652{
1653 lockdep_assert_held(q->queue_lock);
1654 WARN_ON_ONCE(q->mq_ops);
1655
1656 blk_delete_timer(rq);
1657 blk_clear_rq_complete(rq);
1658 trace_block_rq_requeue(q, rq);
1659 rq_qos_requeue(q, rq);
1660
1661 if (rq->rq_flags & RQF_QUEUED)
1662 blk_queue_end_tag(q, rq);
1663
1664 BUG_ON(blk_queued_rq(rq));
1665
1666 elv_requeue_request(q, rq);
1667}
1668EXPORT_SYMBOL(blk_requeue_request);
1669
1670static void add_acct_request(struct request_queue *q, struct request *rq,
1671 int where)
1672{
1673 blk_account_io_start(rq, true);
1674 __elv_add_request(q, rq, where);
1675}
1676
1677static void part_round_stats_single(struct request_queue *q, int cpu,
1678 struct hd_struct *part, unsigned long now,
1679 unsigned int inflight)
1680{
1681 if (inflight) {
1682 __part_stat_add(cpu, part, time_in_queue,
1683 inflight * (now - part->stamp));
1684 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1685 }
1686 part->stamp = now;
1687}
1688
1689/**
1690 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1691 * @q: target block queue
1692 * @cpu: cpu number for stats access
1693 * @part: target partition
1694 *
1695 * The average IO queue length and utilisation statistics are maintained
1696 * by observing the current state of the queue length and the amount of
1697 * time it has been in this state for.
1698 *
1699 * Normally, that accounting is done on IO completion, but that can result
1700 * in more than a second's worth of IO being accounted for within any one
1701 * second, leading to >100% utilisation. To deal with that, we call this
1702 * function to do a round-off before returning the results when reading
1703 * /proc/diskstats. This accounts immediately for all queue usage up to
1704 * the current jiffies and restarts the counters again.
1705 */
1706void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
1707{
1708 struct hd_struct *part2 = NULL;
1709 unsigned long now = jiffies;
1710 unsigned int inflight[2];
1711 int stats = 0;
1712
1713 if (part->stamp != now)
1714 stats |= 1;
1715
1716 if (part->partno) {
1717 part2 = &part_to_disk(part)->part0;
1718 if (part2->stamp != now)
1719 stats |= 2;
1720 }
1721
1722 if (!stats)
1723 return;
1724
1725 part_in_flight(q, part, inflight);
1726
1727 if (stats & 2)
1728 part_round_stats_single(q, cpu, part2, now, inflight[1]);
1729 if (stats & 1)
1730 part_round_stats_single(q, cpu, part, now, inflight[0]);
1731}
1732EXPORT_SYMBOL_GPL(part_round_stats);
1733
1734void __blk_put_request(struct request_queue *q, struct request *req)
1735{
1736 req_flags_t rq_flags = req->rq_flags;
1737
1738 if (unlikely(!q))
1739 return;
1740
1741 if (q->mq_ops) {
1742 blk_mq_free_request(req);
1743 return;
1744 }
1745
1746 lockdep_assert_held(q->queue_lock);
1747
1748 blk_req_zone_write_unlock(req);
1749 blk_pm_put_request(req);
1750 blk_pm_mark_last_busy(req);
1751
1752 elv_completed_request(q, req);
1753
1754 /* this is a bio leak */
1755 WARN_ON(req->bio != NULL);
1756
1757 rq_qos_done(q, req);
1758
1759 /*
1760 * Request may not have originated from ll_rw_blk. if not,
1761 * it didn't come out of our reserved rq pools
1762 */
1763 if (rq_flags & RQF_ALLOCED) {
1764 struct request_list *rl = blk_rq_rl(req);
1765 bool sync = op_is_sync(req->cmd_flags);
1766
1767 BUG_ON(!list_empty(&req->queuelist));
1768 BUG_ON(ELV_ON_HASH(req));
1769
1770 blk_free_request(rl, req);
1771 freed_request(rl, sync, rq_flags);
1772 blk_put_rl(rl);
1773 blk_queue_exit(q);
1774 }
1775}
1776EXPORT_SYMBOL_GPL(__blk_put_request);
1777
1778void blk_put_request(struct request *req) 587void blk_put_request(struct request *req)
1779{ 588{
1780 struct request_queue *q = req->q; 589 blk_mq_free_request(req);
1781
1782 if (q->mq_ops)
1783 blk_mq_free_request(req);
1784 else {
1785 unsigned long flags;
1786
1787 spin_lock_irqsave(q->queue_lock, flags);
1788 __blk_put_request(q, req);
1789 spin_unlock_irqrestore(q->queue_lock, flags);
1790 }
1791} 590}
1792EXPORT_SYMBOL(blk_put_request); 591EXPORT_SYMBOL(blk_put_request);
1793 592
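After this hunk blk_get_request() is a thin wrapper around blk_mq_alloc_request() and blk_put_request() around blk_mq_free_request(); the request_list mempools, ioc batching and congestion accounting removed above have no replacement. A sketch of a caller that needs a passthrough-style request (error handling abbreviated, the opcode is just an example):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>

static int example_alloc_and_free_request(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);	/* always served by blk-mq now */
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* ... fill in the driver-private payload and issue it ... */

	blk_put_request(rq);	/* ends up in blk_mq_free_request() */
	return 0;
}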
@@ -1807,7 +606,6 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1807 req->biotail->bi_next = bio; 606 req->biotail->bi_next = bio;
1808 req->biotail = bio; 607 req->biotail = bio;
1809 req->__data_len += bio->bi_iter.bi_size; 608 req->__data_len += bio->bi_iter.bi_size;
1810 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1811 609
1812 blk_account_io_start(req, false); 610 blk_account_io_start(req, false);
1813 return true; 611 return true;
@@ -1831,7 +629,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1831 629
1832 req->__sector = bio->bi_iter.bi_sector; 630 req->__sector = bio->bi_iter.bi_sector;
1833 req->__data_len += bio->bi_iter.bi_size; 631 req->__data_len += bio->bi_iter.bi_size;
1834 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1835 632
1836 blk_account_io_start(req, false); 633 blk_account_io_start(req, false);
1837 return true; 634 return true;
@@ -1851,7 +648,6 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
1851 req->biotail->bi_next = bio; 648 req->biotail->bi_next = bio;
1852 req->biotail = bio; 649 req->biotail = bio;
1853 req->__data_len += bio->bi_iter.bi_size; 650 req->__data_len += bio->bi_iter.bi_size;
1854 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1855 req->nr_phys_segments = segments + 1; 651 req->nr_phys_segments = segments + 1;
1856 652
1857 blk_account_io_start(req, false); 653 blk_account_io_start(req, false);
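The three merge helpers above stop recomputing req->ioprio with ioprio_best() on every merge; the request now simply inherits the bio's priority when it is set up (blk_init_request_from_bio() further down copies bio_prio(bio) into req->ioprio). A small sketch of how a submitter attaches a priority so it survives into the request, assuming the usual bio helpers:

#include <linux/bio.h>
#include <linux/ioprio.h>

static void example_tag_bio_priority(struct bio *bio)
{
	/*
	 * Whatever is stored in bio->bi_ioprio is copied verbatim into
	 * req->ioprio; merging no longer blends the priorities of merged bios.
	 */
	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0));
}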
@@ -1884,7 +680,6 @@ no_merge:
1884 * Caller must ensure !blk_queue_nomerges(q) beforehand. 680 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1885 */ 681 */
1886bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 682bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1887 unsigned int *request_count,
1888 struct request **same_queue_rq) 683 struct request **same_queue_rq)
1889{ 684{
1890 struct blk_plug *plug; 685 struct blk_plug *plug;
@@ -1894,25 +689,19 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1894 plug = current->plug; 689 plug = current->plug;
1895 if (!plug) 690 if (!plug)
1896 return false; 691 return false;
1897 *request_count = 0;
1898 692
1899 if (q->mq_ops) 693 plug_list = &plug->mq_list;
1900 plug_list = &plug->mq_list;
1901 else
1902 plug_list = &plug->list;
1903 694
1904 list_for_each_entry_reverse(rq, plug_list, queuelist) { 695 list_for_each_entry_reverse(rq, plug_list, queuelist) {
1905 bool merged = false; 696 bool merged = false;
1906 697
1907 if (rq->q == q) { 698 if (rq->q == q && same_queue_rq) {
1908 (*request_count)++;
1909 /* 699 /*
1910 * Only blk-mq multiple hardware queues case checks the 700 * Only blk-mq multiple hardware queues case checks the
1911 * rq in the same queue, there should be only one such 701 * rq in the same queue, there should be only one such
1912 * rq in a queue 702 * rq in a queue
1913 **/ 703 **/
1914 if (same_queue_rq) 704 *same_queue_rq = rq;
1915 *same_queue_rq = rq;
1916 } 705 }
1917 706
1918 if (rq->q != q || !blk_rq_merge_ok(rq, bio)) 707 if (rq->q != q || !blk_rq_merge_ok(rq, bio))
@@ -1939,176 +728,18 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1939 return false; 728 return false;
1940} 729}
1941 730
1942unsigned int blk_plug_queued_count(struct request_queue *q)
1943{
1944 struct blk_plug *plug;
1945 struct request *rq;
1946 struct list_head *plug_list;
1947 unsigned int ret = 0;
1948
1949 plug = current->plug;
1950 if (!plug)
1951 goto out;
1952
1953 if (q->mq_ops)
1954 plug_list = &plug->mq_list;
1955 else
1956 plug_list = &plug->list;
1957
1958 list_for_each_entry(rq, plug_list, queuelist) {
1959 if (rq->q == q)
1960 ret++;
1961 }
1962out:
1963 return ret;
1964}
1965
1966void blk_init_request_from_bio(struct request *req, struct bio *bio) 731void blk_init_request_from_bio(struct request *req, struct bio *bio)
1967{ 732{
1968 struct io_context *ioc = rq_ioc(bio);
1969
1970 if (bio->bi_opf & REQ_RAHEAD) 733 if (bio->bi_opf & REQ_RAHEAD)
1971 req->cmd_flags |= REQ_FAILFAST_MASK; 734 req->cmd_flags |= REQ_FAILFAST_MASK;
1972 735
1973 req->__sector = bio->bi_iter.bi_sector; 736 req->__sector = bio->bi_iter.bi_sector;
1974 if (ioprio_valid(bio_prio(bio))) 737 req->ioprio = bio_prio(bio);
1975 req->ioprio = bio_prio(bio);
1976 else if (ioc)
1977 req->ioprio = ioc->ioprio;
1978 else
1979 req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
1980 req->write_hint = bio->bi_write_hint; 738 req->write_hint = bio->bi_write_hint;
1981 blk_rq_bio_prep(req->q, req, bio); 739 blk_rq_bio_prep(req->q, req, bio);
1982} 740}
1983EXPORT_SYMBOL_GPL(blk_init_request_from_bio); 741EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
1984 742
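blk_init_request_from_bio() no longer falls back to the submitter's io_context: the request priority is whatever the bio carries. Submitters that care about I/O priority therefore set it on the bio before submission. A minimal sketch, assuming the standard bio_set_prio()/IOPRIO_PRIO_VALUE() helpers (the wrapper function itself is hypothetical):

        static void submit_with_prio_sketch(struct bio *bio)
        {
                /* Priority now travels on the bio; the ioc->ioprio
                 * fallback removed above no longer applies. */
                bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4));
                submit_bio(bio);
        }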
1985static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
1986{
1987 struct blk_plug *plug;
1988 int where = ELEVATOR_INSERT_SORT;
1989 struct request *req, *free;
1990 unsigned int request_count = 0;
1991
1992 /*
1993 * low level driver can indicate that it wants pages above a
1994 * certain limit bounced to low memory (ie for highmem, or even
1995 * ISA dma in theory)
1996 */
1997 blk_queue_bounce(q, &bio);
1998
1999 blk_queue_split(q, &bio);
2000
2001 if (!bio_integrity_prep(bio))
2002 return BLK_QC_T_NONE;
2003
2004 if (op_is_flush(bio->bi_opf)) {
2005 spin_lock_irq(q->queue_lock);
2006 where = ELEVATOR_INSERT_FLUSH;
2007 goto get_rq;
2008 }
2009
2010 /*
2011 * Check if we can merge with the plugged list before grabbing
2012 * any locks.
2013 */
2014 if (!blk_queue_nomerges(q)) {
2015 if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
2016 return BLK_QC_T_NONE;
2017 } else
2018 request_count = blk_plug_queued_count(q);
2019
2020 spin_lock_irq(q->queue_lock);
2021
2022 switch (elv_merge(q, &req, bio)) {
2023 case ELEVATOR_BACK_MERGE:
2024 if (!bio_attempt_back_merge(q, req, bio))
2025 break;
2026 elv_bio_merged(q, req, bio);
2027 free = attempt_back_merge(q, req);
2028 if (free)
2029 __blk_put_request(q, free);
2030 else
2031 elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
2032 goto out_unlock;
2033 case ELEVATOR_FRONT_MERGE:
2034 if (!bio_attempt_front_merge(q, req, bio))
2035 break;
2036 elv_bio_merged(q, req, bio);
2037 free = attempt_front_merge(q, req);
2038 if (free)
2039 __blk_put_request(q, free);
2040 else
2041 elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
2042 goto out_unlock;
2043 default:
2044 break;
2045 }
2046
2047get_rq:
2048 rq_qos_throttle(q, bio, q->queue_lock);
2049
2050 /*
2051 * Grab a free request. This might sleep but cannot fail.
2052 * Returns with the queue unlocked.
2053 */
2054 blk_queue_enter_live(q);
2055 req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
2056 if (IS_ERR(req)) {
2057 blk_queue_exit(q);
2058 rq_qos_cleanup(q, bio);
2059 if (PTR_ERR(req) == -ENOMEM)
2060 bio->bi_status = BLK_STS_RESOURCE;
2061 else
2062 bio->bi_status = BLK_STS_IOERR;
2063 bio_endio(bio);
2064 goto out_unlock;
2065 }
2066
2067 rq_qos_track(q, req, bio);
2068
2069 /*
2070 * After dropping the lock and possibly sleeping here, our request
2071 * may now be mergeable after it had proven unmergeable (above).
2072 * We don't worry about that case for efficiency. It won't happen
2073 * often, and the elevators are able to handle it.
2074 */
2075 blk_init_request_from_bio(req, bio);
2076
2077 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
2078 req->cpu = raw_smp_processor_id();
2079
2080 plug = current->plug;
2081 if (plug) {
2082 /*
2083 * If this is the first request added after a plug, fire
2084 * of a plug trace.
2085 *
2086 * @request_count may become stale because of schedule
2087 * out, so check plug list again.
2088 */
2089 if (!request_count || list_empty(&plug->list))
2090 trace_block_plug(q);
2091 else {
2092 struct request *last = list_entry_rq(plug->list.prev);
2093 if (request_count >= BLK_MAX_REQUEST_COUNT ||
2094 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
2095 blk_flush_plug_list(plug, false);
2096 trace_block_plug(q);
2097 }
2098 }
2099 list_add_tail(&req->queuelist, &plug->list);
2100 blk_account_io_start(req, true);
2101 } else {
2102 spin_lock_irq(q->queue_lock);
2103 add_acct_request(q, req, where);
2104 __blk_run_queue(q);
2105out_unlock:
2106 spin_unlock_irq(q->queue_lock);
2107 }
2108
2109 return BLK_QC_T_NONE;
2110}
2111
2112static void handle_bad_sector(struct bio *bio, sector_t maxsector) 743static void handle_bad_sector(struct bio *bio, sector_t maxsector)
2113{ 744{
2114 char b[BDEVNAME_SIZE]; 745 char b[BDEVNAME_SIZE];
@@ -2260,7 +891,7 @@ generic_make_request_checks(struct bio *bio)
2260 * For a REQ_NOWAIT based request, return -EOPNOTSUPP 891 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
2261 * if queue is not a request based queue. 892 * if queue is not a request based queue.
2262 */ 893 */
2263 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) 894 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
2264 goto not_supported; 895 goto not_supported;
2265 896
2266 if (should_fail_bio(bio)) 897 if (should_fail_bio(bio))
@@ -2290,6 +921,9 @@ generic_make_request_checks(struct bio *bio)
2290 } 921 }
2291 } 922 }
2292 923
924 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
925 bio->bi_opf &= ~REQ_HIPRI;
926
2293 switch (bio_op(bio)) { 927 switch (bio_op(bio)) {
2294 case REQ_OP_DISCARD: 928 case REQ_OP_DISCARD:
2295 if (!blk_queue_discard(q)) 929 if (!blk_queue_discard(q))
@@ -2562,17 +1196,6 @@ blk_qc_t submit_bio(struct bio *bio)
2562} 1196}
2563EXPORT_SYMBOL(submit_bio); 1197EXPORT_SYMBOL(submit_bio);
2564 1198
2565bool blk_poll(struct request_queue *q, blk_qc_t cookie)
2566{
2567 if (!q->poll_fn || !blk_qc_t_valid(cookie))
2568 return false;
2569
2570 if (current->plug)
2571 blk_flush_plug_list(current->plug, false);
2572 return q->poll_fn(q, cookie);
2573}
2574EXPORT_SYMBOL_GPL(blk_poll);
2575
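blk_poll() drops out of blk-core.c along with q->poll_fn; polled completion is now serviced by the blk-mq code changed elsewhere in this series. The caller side keeps the same shape: mark the bio REQ_HIPRI (note the new check above that strips REQ_HIPRI when the queue lacks QUEUE_FLAG_POLL), keep the cookie returned by submit_bio(), and poll on it. A rough sketch using the two-argument blk_poll() form shown above; the done flag and the end_io that sets it are assumed, not shown:

        static void polled_read_sketch(struct request_queue *q, struct bio *bio)
        {
                bool done = false;
                blk_qc_t cookie;

                bio->bi_opf |= REQ_HIPRI;       /* ask for polled completion */
                bio->bi_private = &done;        /* bi_end_io sets *(bool *)bi_private */
                cookie = submit_bio(bio);

                while (!READ_ONCE(done))
                        blk_poll(q, cookie);    /* spin instead of sleeping on an IRQ */
        }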
2576/** 1199/**
2577 * blk_cloned_rq_check_limits - Helper function to check a cloned request 1200 * blk_cloned_rq_check_limits - Helper function to check a cloned request
2578 * for new the queue limits 1201 * for new the queue limits
@@ -2620,8 +1243,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
2620 */ 1243 */
2621blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1244blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
2622{ 1245{
2623 unsigned long flags; 1246 blk_qc_t unused;
2624 int where = ELEVATOR_INSERT_BACK;
2625 1247
2626 if (blk_cloned_rq_check_limits(q, rq)) 1248 if (blk_cloned_rq_check_limits(q, rq))
2627 return BLK_STS_IOERR; 1249 return BLK_STS_IOERR;
@@ -2630,38 +1252,15 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
2630 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) 1252 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
2631 return BLK_STS_IOERR; 1253 return BLK_STS_IOERR;
2632 1254
2633 if (q->mq_ops) { 1255 if (blk_queue_io_stat(q))
2634 if (blk_queue_io_stat(q)) 1256 blk_account_io_start(rq, true);
2635 blk_account_io_start(rq, true);
2636 /*
2637 * Since we have a scheduler attached on the top device,
2638 * bypass a potential scheduler on the bottom device for
2639 * insert.
2640 */
2641 return blk_mq_request_issue_directly(rq);
2642 }
2643
2644 spin_lock_irqsave(q->queue_lock, flags);
2645 if (unlikely(blk_queue_dying(q))) {
2646 spin_unlock_irqrestore(q->queue_lock, flags);
2647 return BLK_STS_IOERR;
2648 }
2649 1257
2650 /* 1258 /*
2651 * Submitting request must be dequeued before calling this function 1259 * Since we have a scheduler attached on the top device,
2652 * because it will be linked to another request_queue 1260 * bypass a potential scheduler on the bottom device for
1261 * insert.
2653 */ 1262 */
2654 BUG_ON(blk_queued_rq(rq)); 1263 return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true);
2655
2656 if (op_is_flush(rq->cmd_flags))
2657 where = ELEVATOR_INSERT_FLUSH;
2658
2659 add_acct_request(q, rq, where);
2660 if (where == ELEVATOR_INSERT_FLUSH)
2661 __blk_run_queue(q);
2662 spin_unlock_irqrestore(q->queue_lock, flags);
2663
2664 return BLK_STS_OK;
2665} 1264}
2666EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 1265EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
2667 1266
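blk_insert_cloned_request() now always issues the clone directly through blk-mq, deliberately bypassing any scheduler on the lower queue; the dying-queue and ELEVATOR_INSERT_* handling goes away with the legacy path. In-tree this is essentially the request-based device-mapper path. A caller-side sketch (the requeue/fail handling is abbreviated and assumed):

        static blk_status_t dispatch_clone_sketch(struct request *clone)
        {
                blk_status_t ret = blk_insert_cloned_request(clone->q, clone);

                switch (ret) {
                case BLK_STS_OK:
                        break;
                case BLK_STS_RESOURCE:
                case BLK_STS_DEV_RESOURCE:
                        /* lower queue busy: requeue the clone, retry later */
                        break;
                default:
                        /* hard error: complete the original request with ret */
                        break;
                }
                return ret;
        }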
@@ -2711,11 +1310,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes)
2711 if (blk_do_io_stat(req)) { 1310 if (blk_do_io_stat(req)) {
2712 const int sgrp = op_stat_group(req_op(req)); 1311 const int sgrp = op_stat_group(req_op(req));
2713 struct hd_struct *part; 1312 struct hd_struct *part;
2714 int cpu;
2715 1313
2716 cpu = part_stat_lock(); 1314 part_stat_lock();
2717 part = req->part; 1315 part = req->part;
2718 part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); 1316 part_stat_add(part, sectors[sgrp], bytes >> 9);
2719 part_stat_unlock(); 1317 part_stat_unlock();
2720 } 1318 }
2721} 1319}
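The accounting hunks in this area all follow one conversion: part_stat_lock() no longer returns a CPU id, the per-cpu indirection lives inside the part_stat_* macros, and the io_ticks/time_in_queue bookkeeping moves into update_io_ticks(). Sketch of the new calling convention, using the counters touched above:

        static void account_done_sketch(struct hd_struct *part, int sgrp,
                                        unsigned int bytes)
        {
                part_stat_lock();                       /* no cpu id any more   */
                part_stat_add(part, sectors[sgrp], bytes >> 9);
                part_stat_inc(part, ios[sgrp]);         /* was (cpu, part, ...) */
                part_stat_unlock();
        }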
@@ -2730,14 +1328,14 @@ void blk_account_io_done(struct request *req, u64 now)
2730 if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { 1328 if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
2731 const int sgrp = op_stat_group(req_op(req)); 1329 const int sgrp = op_stat_group(req_op(req));
2732 struct hd_struct *part; 1330 struct hd_struct *part;
2733 int cpu;
2734 1331
2735 cpu = part_stat_lock(); 1332 part_stat_lock();
2736 part = req->part; 1333 part = req->part;
2737 1334
2738 part_stat_inc(cpu, part, ios[sgrp]); 1335 update_io_ticks(part, jiffies);
2739 part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); 1336 part_stat_inc(part, ios[sgrp]);
2740 part_round_stats(req->q, cpu, part); 1337 part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
1338 part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
2741 part_dec_in_flight(req->q, part, rq_data_dir(req)); 1339 part_dec_in_flight(req->q, part, rq_data_dir(req));
2742 1340
2743 hd_struct_put(part); 1341 hd_struct_put(part);
@@ -2749,16 +1347,15 @@ void blk_account_io_start(struct request *rq, bool new_io)
2749{ 1347{
2750 struct hd_struct *part; 1348 struct hd_struct *part;
2751 int rw = rq_data_dir(rq); 1349 int rw = rq_data_dir(rq);
2752 int cpu;
2753 1350
2754 if (!blk_do_io_stat(rq)) 1351 if (!blk_do_io_stat(rq))
2755 return; 1352 return;
2756 1353
2757 cpu = part_stat_lock(); 1354 part_stat_lock();
2758 1355
2759 if (!new_io) { 1356 if (!new_io) {
2760 part = rq->part; 1357 part = rq->part;
2761 part_stat_inc(cpu, part, merges[rw]); 1358 part_stat_inc(part, merges[rw]);
2762 } else { 1359 } else {
2763 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 1360 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
2764 if (!hd_struct_try_get(part)) { 1361 if (!hd_struct_try_get(part)) {
@@ -2773,232 +1370,14 @@ void blk_account_io_start(struct request *rq, bool new_io)
2773 part = &rq->rq_disk->part0; 1370 part = &rq->rq_disk->part0;
2774 hd_struct_get(part); 1371 hd_struct_get(part);
2775 } 1372 }
2776 part_round_stats(rq->q, cpu, part);
2777 part_inc_in_flight(rq->q, part, rw); 1373 part_inc_in_flight(rq->q, part, rw);
2778 rq->part = part; 1374 rq->part = part;
2779 } 1375 }
2780 1376
2781 part_stat_unlock(); 1377 update_io_ticks(part, jiffies);
2782}
2783
2784static struct request *elv_next_request(struct request_queue *q)
2785{
2786 struct request *rq;
2787 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
2788
2789 WARN_ON_ONCE(q->mq_ops);
2790
2791 while (1) {
2792 list_for_each_entry(rq, &q->queue_head, queuelist) {
2793#ifdef CONFIG_PM
2794 /*
2795 * If a request gets queued in state RPM_SUSPENDED
2796 * then that's a kernel bug.
2797 */
2798 WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED);
2799#endif
2800 return rq;
2801 }
2802
2803 /*
2804 * Flush request is running and flush request isn't queueable
2805 * in the drive, we can hold the queue till flush request is
2806 * finished. Even we don't do this, driver can't dispatch next
2807 * requests and will requeue them. And this can improve
2808 * throughput too. For example, we have request flush1, write1,
2809 * flush 2. flush1 is dispatched, then queue is hold, write1
2810 * isn't inserted to queue. After flush1 is finished, flush2
2811 * will be dispatched. Since disk cache is already clean,
2812 * flush2 will be finished very soon, so looks like flush2 is
2813 * folded to flush1.
2814 * Since the queue is hold, a flag is set to indicate the queue
2815 * should be restarted later. Please see flush_end_io() for
2816 * details.
2817 */
2818 if (fq->flush_pending_idx != fq->flush_running_idx &&
2819 !queue_flush_queueable(q)) {
2820 fq->flush_queue_delayed = 1;
2821 return NULL;
2822 }
2823 if (unlikely(blk_queue_bypass(q)) ||
2824 !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
2825 return NULL;
2826 }
2827}
2828
2829/**
2830 * blk_peek_request - peek at the top of a request queue
2831 * @q: request queue to peek at
2832 *
2833 * Description:
2834 * Return the request at the top of @q. The returned request
2835 * should be started using blk_start_request() before LLD starts
2836 * processing it.
2837 *
2838 * Return:
2839 * Pointer to the request at the top of @q if available. Null
2840 * otherwise.
2841 */
2842struct request *blk_peek_request(struct request_queue *q)
2843{
2844 struct request *rq;
2845 int ret;
2846
2847 lockdep_assert_held(q->queue_lock);
2848 WARN_ON_ONCE(q->mq_ops);
2849
2850 while ((rq = elv_next_request(q)) != NULL) {
2851 if (!(rq->rq_flags & RQF_STARTED)) {
2852 /*
2853 * This is the first time the device driver
2854 * sees this request (possibly after
2855 * requeueing). Notify IO scheduler.
2856 */
2857 if (rq->rq_flags & RQF_SORTED)
2858 elv_activate_rq(q, rq);
2859
2860 /*
2861 * just mark as started even if we don't start
2862 * it, a request that has been delayed should
2863 * not be passed by new incoming requests
2864 */
2865 rq->rq_flags |= RQF_STARTED;
2866 trace_block_rq_issue(q, rq);
2867 }
2868
2869 if (!q->boundary_rq || q->boundary_rq == rq) {
2870 q->end_sector = rq_end_sector(rq);
2871 q->boundary_rq = NULL;
2872 }
2873
2874 if (rq->rq_flags & RQF_DONTPREP)
2875 break;
2876
2877 if (q->dma_drain_size && blk_rq_bytes(rq)) {
2878 /*
2879 * make sure space for the drain appears we
2880 * know we can do this because max_hw_segments
2881 * has been adjusted to be one fewer than the
2882 * device can handle
2883 */
2884 rq->nr_phys_segments++;
2885 }
2886
2887 if (!q->prep_rq_fn)
2888 break;
2889
2890 ret = q->prep_rq_fn(q, rq);
2891 if (ret == BLKPREP_OK) {
2892 break;
2893 } else if (ret == BLKPREP_DEFER) {
2894 /*
2895 * the request may have been (partially) prepped.
2896 * we need to keep this request in the front to
2897 * avoid resource deadlock. RQF_STARTED will
2898 * prevent other fs requests from passing this one.
2899 */
2900 if (q->dma_drain_size && blk_rq_bytes(rq) &&
2901 !(rq->rq_flags & RQF_DONTPREP)) {
2902 /*
2903 * remove the space for the drain we added
2904 * so that we don't add it again
2905 */
2906 --rq->nr_phys_segments;
2907 }
2908
2909 rq = NULL;
2910 break;
2911 } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
2912 rq->rq_flags |= RQF_QUIET;
2913 /*
2914 * Mark this request as started so we don't trigger
2915 * any debug logic in the end I/O path.
2916 */
2917 blk_start_request(rq);
2918 __blk_end_request_all(rq, ret == BLKPREP_INVALID ?
2919 BLK_STS_TARGET : BLK_STS_IOERR);
2920 } else {
2921 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
2922 break;
2923 }
2924 }
2925
2926 return rq;
2927}
2928EXPORT_SYMBOL(blk_peek_request);
2929
2930static void blk_dequeue_request(struct request *rq)
2931{
2932 struct request_queue *q = rq->q;
2933 1378
2934 BUG_ON(list_empty(&rq->queuelist)); 1379 part_stat_unlock();
2935 BUG_ON(ELV_ON_HASH(rq));
2936
2937 list_del_init(&rq->queuelist);
2938
2939 /*
2940 * the time frame between a request being removed from the lists
2941 * and to it is freed is accounted as io that is in progress at
2942 * the driver side.
2943 */
2944 if (blk_account_rq(rq))
2945 q->in_flight[rq_is_sync(rq)]++;
2946}
2947
2948/**
2949 * blk_start_request - start request processing on the driver
2950 * @req: request to dequeue
2951 *
2952 * Description:
2953 * Dequeue @req and start timeout timer on it. This hands off the
2954 * request to the driver.
2955 */
2956void blk_start_request(struct request *req)
2957{
2958 lockdep_assert_held(req->q->queue_lock);
2959 WARN_ON_ONCE(req->q->mq_ops);
2960
2961 blk_dequeue_request(req);
2962
2963 if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
2964 req->io_start_time_ns = ktime_get_ns();
2965#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2966 req->throtl_size = blk_rq_sectors(req);
2967#endif
2968 req->rq_flags |= RQF_STATS;
2969 rq_qos_issue(req->q, req);
2970 }
2971
2972 BUG_ON(blk_rq_is_complete(req));
2973 blk_add_timer(req);
2974}
2975EXPORT_SYMBOL(blk_start_request);
2976
2977/**
2978 * blk_fetch_request - fetch a request from a request queue
2979 * @q: request queue to fetch a request from
2980 *
2981 * Description:
2982 * Return the request at the top of @q. The request is started on
2983 * return and LLD can start processing it immediately.
2984 *
2985 * Return:
2986 * Pointer to the request at the top of @q if available. Null
2987 * otherwise.
2988 */
2989struct request *blk_fetch_request(struct request_queue *q)
2990{
2991 struct request *rq;
2992
2993 lockdep_assert_held(q->queue_lock);
2994 WARN_ON_ONCE(q->mq_ops);
2995
2996 rq = blk_peek_request(q);
2997 if (rq)
2998 blk_start_request(rq);
2999 return rq;
3000} 1380}
3001EXPORT_SYMBOL(blk_fetch_request);
3002 1381
3003/* 1382/*
3004 * Steal bios from a request and add them to a bio list. 1383 * Steal bios from a request and add them to a bio list.
@@ -3125,255 +1504,6 @@ bool blk_update_request(struct request *req, blk_status_t error,
3125} 1504}
3126EXPORT_SYMBOL_GPL(blk_update_request); 1505EXPORT_SYMBOL_GPL(blk_update_request);
3127 1506
3128static bool blk_update_bidi_request(struct request *rq, blk_status_t error,
3129 unsigned int nr_bytes,
3130 unsigned int bidi_bytes)
3131{
3132 if (blk_update_request(rq, error, nr_bytes))
3133 return true;
3134
3135 /* Bidi request must be completed as a whole */
3136 if (unlikely(blk_bidi_rq(rq)) &&
3137 blk_update_request(rq->next_rq, error, bidi_bytes))
3138 return true;
3139
3140 if (blk_queue_add_random(rq->q))
3141 add_disk_randomness(rq->rq_disk);
3142
3143 return false;
3144}
3145
3146/**
3147 * blk_unprep_request - unprepare a request
3148 * @req: the request
3149 *
3150 * This function makes a request ready for complete resubmission (or
3151 * completion). It happens only after all error handling is complete,
3152 * so represents the appropriate moment to deallocate any resources
3153 * that were allocated to the request in the prep_rq_fn. The queue
3154 * lock is held when calling this.
3155 */
3156void blk_unprep_request(struct request *req)
3157{
3158 struct request_queue *q = req->q;
3159
3160 req->rq_flags &= ~RQF_DONTPREP;
3161 if (q->unprep_rq_fn)
3162 q->unprep_rq_fn(q, req);
3163}
3164EXPORT_SYMBOL_GPL(blk_unprep_request);
3165
3166void blk_finish_request(struct request *req, blk_status_t error)
3167{
3168 struct request_queue *q = req->q;
3169 u64 now = ktime_get_ns();
3170
3171 lockdep_assert_held(req->q->queue_lock);
3172 WARN_ON_ONCE(q->mq_ops);
3173
3174 if (req->rq_flags & RQF_STATS)
3175 blk_stat_add(req, now);
3176
3177 if (req->rq_flags & RQF_QUEUED)
3178 blk_queue_end_tag(q, req);
3179
3180 BUG_ON(blk_queued_rq(req));
3181
3182 if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
3183 laptop_io_completion(req->q->backing_dev_info);
3184
3185 blk_delete_timer(req);
3186
3187 if (req->rq_flags & RQF_DONTPREP)
3188 blk_unprep_request(req);
3189
3190 blk_account_io_done(req, now);
3191
3192 if (req->end_io) {
3193 rq_qos_done(q, req);
3194 req->end_io(req, error);
3195 } else {
3196 if (blk_bidi_rq(req))
3197 __blk_put_request(req->next_rq->q, req->next_rq);
3198
3199 __blk_put_request(q, req);
3200 }
3201}
3202EXPORT_SYMBOL(blk_finish_request);
3203
3204/**
3205 * blk_end_bidi_request - Complete a bidi request
3206 * @rq: the request to complete
3207 * @error: block status code
3208 * @nr_bytes: number of bytes to complete @rq
3209 * @bidi_bytes: number of bytes to complete @rq->next_rq
3210 *
3211 * Description:
3212 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
3213 * Drivers that supports bidi can safely call this member for any
3214 * type of request, bidi or uni. In the later case @bidi_bytes is
3215 * just ignored.
3216 *
3217 * Return:
3218 * %false - we are done with this request
3219 * %true - still buffers pending for this request
3220 **/
3221static bool blk_end_bidi_request(struct request *rq, blk_status_t error,
3222 unsigned int nr_bytes, unsigned int bidi_bytes)
3223{
3224 struct request_queue *q = rq->q;
3225 unsigned long flags;
3226
3227 WARN_ON_ONCE(q->mq_ops);
3228
3229 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3230 return true;
3231
3232 spin_lock_irqsave(q->queue_lock, flags);
3233 blk_finish_request(rq, error);
3234 spin_unlock_irqrestore(q->queue_lock, flags);
3235
3236 return false;
3237}
3238
3239/**
3240 * __blk_end_bidi_request - Complete a bidi request with queue lock held
3241 * @rq: the request to complete
3242 * @error: block status code
3243 * @nr_bytes: number of bytes to complete @rq
3244 * @bidi_bytes: number of bytes to complete @rq->next_rq
3245 *
3246 * Description:
3247 * Identical to blk_end_bidi_request() except that queue lock is
3248 * assumed to be locked on entry and remains so on return.
3249 *
3250 * Return:
3251 * %false - we are done with this request
3252 * %true - still buffers pending for this request
3253 **/
3254static bool __blk_end_bidi_request(struct request *rq, blk_status_t error,
3255 unsigned int nr_bytes, unsigned int bidi_bytes)
3256{
3257 lockdep_assert_held(rq->q->queue_lock);
3258 WARN_ON_ONCE(rq->q->mq_ops);
3259
3260 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3261 return true;
3262
3263 blk_finish_request(rq, error);
3264
3265 return false;
3266}
3267
3268/**
3269 * blk_end_request - Helper function for drivers to complete the request.
3270 * @rq: the request being processed
3271 * @error: block status code
3272 * @nr_bytes: number of bytes to complete
3273 *
3274 * Description:
3275 * Ends I/O on a number of bytes attached to @rq.
3276 * If @rq has leftover, sets it up for the next range of segments.
3277 *
3278 * Return:
3279 * %false - we are done with this request
3280 * %true - still buffers pending for this request
3281 **/
3282bool blk_end_request(struct request *rq, blk_status_t error,
3283 unsigned int nr_bytes)
3284{
3285 WARN_ON_ONCE(rq->q->mq_ops);
3286 return blk_end_bidi_request(rq, error, nr_bytes, 0);
3287}
3288EXPORT_SYMBOL(blk_end_request);
3289
3290/**
3291 * blk_end_request_all - Helper function for drives to finish the request.
3292 * @rq: the request to finish
3293 * @error: block status code
3294 *
3295 * Description:
3296 * Completely finish @rq.
3297 */
3298void blk_end_request_all(struct request *rq, blk_status_t error)
3299{
3300 bool pending;
3301 unsigned int bidi_bytes = 0;
3302
3303 if (unlikely(blk_bidi_rq(rq)))
3304 bidi_bytes = blk_rq_bytes(rq->next_rq);
3305
3306 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3307 BUG_ON(pending);
3308}
3309EXPORT_SYMBOL(blk_end_request_all);
3310
3311/**
3312 * __blk_end_request - Helper function for drivers to complete the request.
3313 * @rq: the request being processed
3314 * @error: block status code
3315 * @nr_bytes: number of bytes to complete
3316 *
3317 * Description:
3318 * Must be called with queue lock held unlike blk_end_request().
3319 *
3320 * Return:
3321 * %false - we are done with this request
3322 * %true - still buffers pending for this request
3323 **/
3324bool __blk_end_request(struct request *rq, blk_status_t error,
3325 unsigned int nr_bytes)
3326{
3327 lockdep_assert_held(rq->q->queue_lock);
3328 WARN_ON_ONCE(rq->q->mq_ops);
3329
3330 return __blk_end_bidi_request(rq, error, nr_bytes, 0);
3331}
3332EXPORT_SYMBOL(__blk_end_request);
3333
3334/**
3335 * __blk_end_request_all - Helper function for drives to finish the request.
3336 * @rq: the request to finish
3337 * @error: block status code
3338 *
3339 * Description:
3340 * Completely finish @rq. Must be called with queue lock held.
3341 */
3342void __blk_end_request_all(struct request *rq, blk_status_t error)
3343{
3344 bool pending;
3345 unsigned int bidi_bytes = 0;
3346
3347 lockdep_assert_held(rq->q->queue_lock);
3348 WARN_ON_ONCE(rq->q->mq_ops);
3349
3350 if (unlikely(blk_bidi_rq(rq)))
3351 bidi_bytes = blk_rq_bytes(rq->next_rq);
3352
3353 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3354 BUG_ON(pending);
3355}
3356EXPORT_SYMBOL(__blk_end_request_all);
3357
3358/**
3359 * __blk_end_request_cur - Helper function to finish the current request chunk.
3360 * @rq: the request to finish the current chunk for
3361 * @error: block status code
3362 *
3363 * Description:
3364 * Complete the current consecutively mapped chunk from @rq. Must
3365 * be called with queue lock held.
3366 *
3367 * Return:
3368 * %false - we are done with this request
3369 * %true - still buffers pending for this request
3370 */
3371bool __blk_end_request_cur(struct request *rq, blk_status_t error)
3372{
3373 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
3374}
3375EXPORT_SYMBOL(__blk_end_request_cur);
3376
3377void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 1507void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3378 struct bio *bio) 1508 struct bio *bio)
3379{ 1509{
@@ -3429,8 +1559,8 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
3429 */ 1559 */
3430int blk_lld_busy(struct request_queue *q) 1560int blk_lld_busy(struct request_queue *q)
3431{ 1561{
3432 if (q->lld_busy_fn) 1562 if (queue_is_mq(q) && q->mq_ops->busy)
3433 return q->lld_busy_fn(q); 1563 return q->mq_ops->busy(q);
3434 1564
3435 return 0; 1565 return 0;
3436} 1566}
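The lld_busy_fn queue callback is gone; a driver that wants to report "busy" to upper layers (dm-multipath uses this for path selection) now does so through the .busy hook in its blk_mq_ops. Sketch, with hypothetical driver names:

        static bool my_lld_busy_sketch(struct request_queue *q)
        {
                struct my_dev *dev = q->queuedata;      /* hypothetical */

                return my_dev_is_congested(dev);        /* hypothetical */
        }

        static const struct blk_mq_ops my_mq_ops_sketch = {
                .queue_rq = my_queue_rq,                /* hypothetical */
                .busy     = my_lld_busy_sketch,         /* replaces q->lld_busy_fn */
        };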
@@ -3461,7 +1591,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
3461 */ 1591 */
3462static void __blk_rq_prep_clone(struct request *dst, struct request *src) 1592static void __blk_rq_prep_clone(struct request *dst, struct request *src)
3463{ 1593{
3464 dst->cpu = src->cpu;
3465 dst->__sector = blk_rq_pos(src); 1594 dst->__sector = blk_rq_pos(src);
3466 dst->__data_len = blk_rq_bytes(src); 1595 dst->__data_len = blk_rq_bytes(src);
3467 if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { 1596 if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
@@ -3573,9 +1702,11 @@ void blk_start_plug(struct blk_plug *plug)
3573 if (tsk->plug) 1702 if (tsk->plug)
3574 return; 1703 return;
3575 1704
3576 INIT_LIST_HEAD(&plug->list);
3577 INIT_LIST_HEAD(&plug->mq_list); 1705 INIT_LIST_HEAD(&plug->mq_list);
3578 INIT_LIST_HEAD(&plug->cb_list); 1706 INIT_LIST_HEAD(&plug->cb_list);
1707 plug->rq_count = 0;
1708 plug->multiple_queues = false;
1709
3579 /* 1710 /*
3580 * Store ordering should not be needed here, since a potential 1711 * Store ordering should not be needed here, since a potential
3581 * preempt will imply a full memory barrier 1712 * preempt will imply a full memory barrier
@@ -3584,36 +1715,6 @@ void blk_start_plug(struct blk_plug *plug)
3584} 1715}
3585EXPORT_SYMBOL(blk_start_plug); 1716EXPORT_SYMBOL(blk_start_plug);
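blk_start_plug() now initialises only the blk-mq list plus the new rq_count/multiple_queues bookkeeping; plug->list is gone with the legacy path. The caller-side pattern is unchanged. For reference, a typical plugging sequence (the batch loop is illustrative):

        static void submit_batch_sketch(struct bio **bios, int nr)
        {
                struct blk_plug plug;
                int i;

                blk_start_plug(&plug);
                for (i = 0; i < nr; i++)
                        submit_bio(bios[i]);    /* may gather on plug.mq_list */
                blk_finish_plug(&plug);         /* drains via blk_mq_flush_plug_list() */
        }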
3586 1717
3587static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
3588{
3589 struct request *rqa = container_of(a, struct request, queuelist);
3590 struct request *rqb = container_of(b, struct request, queuelist);
3591
3592 return !(rqa->q < rqb->q ||
3593 (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
3594}
3595
3596/*
3597 * If 'from_schedule' is true, then postpone the dispatch of requests
3598 * until a safe kblockd context. We do this to avoid accidental big
3599 * additional stack usage in driver dispatch, in places where the originally
3600 * plugger did not intend it.
3601 */
3602static void queue_unplugged(struct request_queue *q, unsigned int depth,
3603 bool from_schedule)
3604 __releases(q->queue_lock)
3605{
3606 lockdep_assert_held(q->queue_lock);
3607
3608 trace_block_unplug(q, depth, !from_schedule);
3609
3610 if (from_schedule)
3611 blk_run_queue_async(q);
3612 else
3613 __blk_run_queue(q);
3614 spin_unlock_irq(q->queue_lock);
3615}
3616
3617static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) 1718static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3618{ 1719{
3619 LIST_HEAD(callbacks); 1720 LIST_HEAD(callbacks);
@@ -3658,65 +1759,10 @@ EXPORT_SYMBOL(blk_check_plugged);
3658 1759
3659void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1760void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3660{ 1761{
3661 struct request_queue *q;
3662 struct request *rq;
3663 LIST_HEAD(list);
3664 unsigned int depth;
3665
3666 flush_plug_callbacks(plug, from_schedule); 1762 flush_plug_callbacks(plug, from_schedule);
3667 1763
3668 if (!list_empty(&plug->mq_list)) 1764 if (!list_empty(&plug->mq_list))
3669 blk_mq_flush_plug_list(plug, from_schedule); 1765 blk_mq_flush_plug_list(plug, from_schedule);
3670
3671 if (list_empty(&plug->list))
3672 return;
3673
3674 list_splice_init(&plug->list, &list);
3675
3676 list_sort(NULL, &list, plug_rq_cmp);
3677
3678 q = NULL;
3679 depth = 0;
3680
3681 while (!list_empty(&list)) {
3682 rq = list_entry_rq(list.next);
3683 list_del_init(&rq->queuelist);
3684 BUG_ON(!rq->q);
3685 if (rq->q != q) {
3686 /*
3687 * This drops the queue lock
3688 */
3689 if (q)
3690 queue_unplugged(q, depth, from_schedule);
3691 q = rq->q;
3692 depth = 0;
3693 spin_lock_irq(q->queue_lock);
3694 }
3695
3696 /*
3697 * Short-circuit if @q is dead
3698 */
3699 if (unlikely(blk_queue_dying(q))) {
3700 __blk_end_request_all(rq, BLK_STS_IOERR);
3701 continue;
3702 }
3703
3704 /*
3705 * rq is already accounted, so use raw insert
3706 */
3707 if (op_is_flush(rq->cmd_flags))
3708 __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
3709 else
3710 __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
3711
3712 depth++;
3713 }
3714
3715 /*
3716 * This drops the queue lock
3717 */
3718 if (q)
3719 queue_unplugged(q, depth, from_schedule);
3720} 1766}
3721 1767
3722void blk_finish_plug(struct blk_plug *plug) 1768void blk_finish_plug(struct blk_plug *plug)
@@ -3743,9 +1789,6 @@ int __init blk_dev_init(void)
3743 if (!kblockd_workqueue) 1789 if (!kblockd_workqueue)
3744 panic("Failed to create kblockd\n"); 1790 panic("Failed to create kblockd\n");
3745 1791
3746 request_cachep = kmem_cache_create("blkdev_requests",
3747 sizeof(struct request), 0, SLAB_PANIC, NULL);
3748
3749 blk_requestq_cachep = kmem_cache_create("request_queue", 1792 blk_requestq_cachep = kmem_cache_create("request_queue",
3750 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 1793 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3751 1794
diff --git a/block/blk-exec.c b/block/blk-exec.c
index f7b292f12449..a34b7d918742 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -48,8 +48,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
48 struct request *rq, int at_head, 48 struct request *rq, int at_head,
49 rq_end_io_fn *done) 49 rq_end_io_fn *done)
50{ 50{
51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
52
53 WARN_ON(irqs_disabled()); 51 WARN_ON(irqs_disabled());
54 WARN_ON(!blk_rq_is_passthrough(rq)); 52 WARN_ON(!blk_rq_is_passthrough(rq));
55 53
@@ -60,23 +58,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
60 * don't check dying flag for MQ because the request won't 58 * don't check dying flag for MQ because the request won't
61 * be reused after dying flag is set 59 * be reused after dying flag is set
62 */ 60 */
63 if (q->mq_ops) { 61 blk_mq_sched_insert_request(rq, at_head, true, false);
64 blk_mq_sched_insert_request(rq, at_head, true, false);
65 return;
66 }
67
68 spin_lock_irq(q->queue_lock);
69
70 if (unlikely(blk_queue_dying(q))) {
71 rq->rq_flags |= RQF_QUIET;
72 __blk_end_request_all(rq, BLK_STS_IOERR);
73 spin_unlock_irq(q->queue_lock);
74 return;
75 }
76
77 __elv_add_request(q, rq, where);
78 __blk_run_queue(q);
79 spin_unlock_irq(q->queue_lock);
80} 62}
81EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 63EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
82 64
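blk_execute_rq_nowait() loses the dying-queue check and the ELEVATOR_INSERT_FRONT/BACK selection; it simply hands the passthrough request to the blk-mq scheduler. Callers are unaffected. A minimal sketch using the signature shown above (the request and completion callback are assumed to come from blk_get_request() and the driver):

        static void issue_pt_sketch(struct request_queue *q, struct gendisk *disk,
                                    struct request *rq, rq_end_io_fn *done)
        {
                /* completion is reported asynchronously through done() */
                blk_execute_rq_nowait(q, disk, rq, 1 /* at_head */, done);
        }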
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8b44b86779da..a3fc7191c694 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -93,7 +93,7 @@ enum {
93 FLUSH_PENDING_TIMEOUT = 5 * HZ, 93 FLUSH_PENDING_TIMEOUT = 5 * HZ,
94}; 94};
95 95
96static bool blk_kick_flush(struct request_queue *q, 96static void blk_kick_flush(struct request_queue *q,
97 struct blk_flush_queue *fq, unsigned int flags); 97 struct blk_flush_queue *fq, unsigned int flags);
98 98
99static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) 99static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
@@ -132,18 +132,9 @@ static void blk_flush_restore_request(struct request *rq)
132 rq->end_io = rq->flush.saved_end_io; 132 rq->end_io = rq->flush.saved_end_io;
133} 133}
134 134
135static bool blk_flush_queue_rq(struct request *rq, bool add_front) 135static void blk_flush_queue_rq(struct request *rq, bool add_front)
136{ 136{
137 if (rq->q->mq_ops) { 137 blk_mq_add_to_requeue_list(rq, add_front, true);
138 blk_mq_add_to_requeue_list(rq, add_front, true);
139 return false;
140 } else {
141 if (add_front)
142 list_add(&rq->queuelist, &rq->q->queue_head);
143 else
144 list_add_tail(&rq->queuelist, &rq->q->queue_head);
145 return true;
146 }
147} 138}
148 139
149/** 140/**
@@ -157,18 +148,17 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
157 * completion and trigger the next step. 148 * completion and trigger the next step.
158 * 149 *
159 * CONTEXT: 150 * CONTEXT:
160 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) 151 * spin_lock_irq(fq->mq_flush_lock)
161 * 152 *
162 * RETURNS: 153 * RETURNS:
163 * %true if requests were added to the dispatch queue, %false otherwise. 154 * %true if requests were added to the dispatch queue, %false otherwise.
164 */ 155 */
165static bool blk_flush_complete_seq(struct request *rq, 156static void blk_flush_complete_seq(struct request *rq,
166 struct blk_flush_queue *fq, 157 struct blk_flush_queue *fq,
167 unsigned int seq, blk_status_t error) 158 unsigned int seq, blk_status_t error)
168{ 159{
169 struct request_queue *q = rq->q; 160 struct request_queue *q = rq->q;
170 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; 161 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
171 bool queued = false, kicked;
172 unsigned int cmd_flags; 162 unsigned int cmd_flags;
173 163
174 BUG_ON(rq->flush.seq & seq); 164 BUG_ON(rq->flush.seq & seq);
@@ -191,7 +181,7 @@ static bool blk_flush_complete_seq(struct request *rq,
191 181
192 case REQ_FSEQ_DATA: 182 case REQ_FSEQ_DATA:
193 list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); 183 list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
194 queued = blk_flush_queue_rq(rq, true); 184 blk_flush_queue_rq(rq, true);
195 break; 185 break;
196 186
197 case REQ_FSEQ_DONE: 187 case REQ_FSEQ_DONE:
@@ -204,42 +194,34 @@ static bool blk_flush_complete_seq(struct request *rq,
204 BUG_ON(!list_empty(&rq->queuelist)); 194 BUG_ON(!list_empty(&rq->queuelist));
205 list_del_init(&rq->flush.list); 195 list_del_init(&rq->flush.list);
206 blk_flush_restore_request(rq); 196 blk_flush_restore_request(rq);
207 if (q->mq_ops) 197 blk_mq_end_request(rq, error);
208 blk_mq_end_request(rq, error);
209 else
210 __blk_end_request_all(rq, error);
211 break; 198 break;
212 199
213 default: 200 default:
214 BUG(); 201 BUG();
215 } 202 }
216 203
217 kicked = blk_kick_flush(q, fq, cmd_flags); 204 blk_kick_flush(q, fq, cmd_flags);
218 return kicked | queued;
219} 205}
220 206
221static void flush_end_io(struct request *flush_rq, blk_status_t error) 207static void flush_end_io(struct request *flush_rq, blk_status_t error)
222{ 208{
223 struct request_queue *q = flush_rq->q; 209 struct request_queue *q = flush_rq->q;
224 struct list_head *running; 210 struct list_head *running;
225 bool queued = false;
226 struct request *rq, *n; 211 struct request *rq, *n;
227 unsigned long flags = 0; 212 unsigned long flags = 0;
228 struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); 213 struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
214 struct blk_mq_hw_ctx *hctx;
229 215
230 if (q->mq_ops) { 216 /* release the tag's ownership to the req cloned from */
231 struct blk_mq_hw_ctx *hctx; 217 spin_lock_irqsave(&fq->mq_flush_lock, flags);
232 218 hctx = flush_rq->mq_hctx;
233 /* release the tag's ownership to the req cloned from */ 219 if (!q->elevator) {
234 spin_lock_irqsave(&fq->mq_flush_lock, flags); 220 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
235 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); 221 flush_rq->tag = -1;
236 if (!q->elevator) { 222 } else {
237 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); 223 blk_mq_put_driver_tag_hctx(hctx, flush_rq);
238 flush_rq->tag = -1; 224 flush_rq->internal_tag = -1;
239 } else {
240 blk_mq_put_driver_tag_hctx(hctx, flush_rq);
241 flush_rq->internal_tag = -1;
242 }
243 } 225 }
244 226
245 running = &fq->flush_queue[fq->flush_running_idx]; 227 running = &fq->flush_queue[fq->flush_running_idx];
@@ -248,35 +230,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
248 /* account completion of the flush request */ 230 /* account completion of the flush request */
249 fq->flush_running_idx ^= 1; 231 fq->flush_running_idx ^= 1;
250 232
251 if (!q->mq_ops)
252 elv_completed_request(q, flush_rq);
253
254 /* and push the waiting requests to the next stage */ 233 /* and push the waiting requests to the next stage */
255 list_for_each_entry_safe(rq, n, running, flush.list) { 234 list_for_each_entry_safe(rq, n, running, flush.list) {
256 unsigned int seq = blk_flush_cur_seq(rq); 235 unsigned int seq = blk_flush_cur_seq(rq);
257 236
258 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); 237 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
259 queued |= blk_flush_complete_seq(rq, fq, seq, error); 238 blk_flush_complete_seq(rq, fq, seq, error);
260 } 239 }
261 240
262 /*
263 * Kick the queue to avoid stall for two cases:
264 * 1. Moving a request silently to empty queue_head may stall the
265 * queue.
266 * 2. When flush request is running in non-queueable queue, the
267 * queue is hold. Restart the queue after flush request is finished
268 * to avoid stall.
269 * This function is called from request completion path and calling
270 * directly into request_fn may confuse the driver. Always use
271 * kblockd.
272 */
273 if (queued || fq->flush_queue_delayed) {
274 WARN_ON(q->mq_ops);
275 blk_run_queue_async(q);
276 }
277 fq->flush_queue_delayed = 0; 241 fq->flush_queue_delayed = 0;
278 if (q->mq_ops) 242 spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
279 spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
280} 243}
281 244
282/** 245/**
@@ -289,12 +252,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
289 * Please read the comment at the top of this file for more info. 252 * Please read the comment at the top of this file for more info.
290 * 253 *
291 * CONTEXT: 254 * CONTEXT:
292 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) 255 * spin_lock_irq(fq->mq_flush_lock)
293 * 256 *
294 * RETURNS:
295 * %true if flush was issued, %false otherwise.
296 */ 257 */
297static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, 258static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
298 unsigned int flags) 259 unsigned int flags)
299{ 260{
300 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; 261 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
@@ -304,7 +265,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
304 265
305 /* C1 described at the top of this file */ 266 /* C1 described at the top of this file */
306 if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) 267 if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
307 return false; 268 return;
308 269
309 /* C2 and C3 270 /* C2 and C3
310 * 271 *
@@ -312,11 +273,10 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
312 * assigned to empty flushes, and we deadlock if we are expecting 273 * assigned to empty flushes, and we deadlock if we are expecting
313 * other requests to make progress. Don't defer for that case. 274 * other requests to make progress. Don't defer for that case.
314 */ 275 */
315 if (!list_empty(&fq->flush_data_in_flight) && 276 if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
316 !(q->mq_ops && q->elevator) &&
317 time_before(jiffies, 277 time_before(jiffies,
318 fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) 278 fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
319 return false; 279 return;
320 280
321 /* 281 /*
322 * Issue flush and toggle pending_idx. This makes pending_idx 282 * Issue flush and toggle pending_idx. This makes pending_idx
@@ -334,19 +294,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
334 * In case of IO scheduler, flush rq need to borrow scheduler tag 294 * In case of IO scheduler, flush rq need to borrow scheduler tag
335 * just for cheating put/get driver tag. 295 * just for cheating put/get driver tag.
336 */ 296 */
337 if (q->mq_ops) { 297 flush_rq->mq_ctx = first_rq->mq_ctx;
338 struct blk_mq_hw_ctx *hctx; 298 flush_rq->mq_hctx = first_rq->mq_hctx;
339 299
340 flush_rq->mq_ctx = first_rq->mq_ctx; 300 if (!q->elevator) {
341 301 fq->orig_rq = first_rq;
342 if (!q->elevator) { 302 flush_rq->tag = first_rq->tag;
343 fq->orig_rq = first_rq; 303 blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
344 flush_rq->tag = first_rq->tag; 304 } else {
345 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); 305 flush_rq->internal_tag = first_rq->internal_tag;
346 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
347 } else {
348 flush_rq->internal_tag = first_rq->internal_tag;
349 }
350 } 306 }
351 307
352 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; 308 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -355,62 +311,17 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
355 flush_rq->rq_disk = first_rq->rq_disk; 311 flush_rq->rq_disk = first_rq->rq_disk;
356 flush_rq->end_io = flush_end_io; 312 flush_rq->end_io = flush_end_io;
357 313
358 return blk_flush_queue_rq(flush_rq, false); 314 blk_flush_queue_rq(flush_rq, false);
359}
360
361static void flush_data_end_io(struct request *rq, blk_status_t error)
362{
363 struct request_queue *q = rq->q;
364 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
365
366 lockdep_assert_held(q->queue_lock);
367
368 /*
369 * Updating q->in_flight[] here for making this tag usable
370 * early. Because in blk_queue_start_tag(),
371 * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and
372 * reserve tags for sync I/O.
373 *
374 * More importantly this way can avoid the following I/O
375 * deadlock:
376 *
377 * - suppose there are 40 fua requests comming to flush queue
378 * and queue depth is 31
379 * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc
380 * tag for async I/O any more
381 * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT
382 * and flush_data_end_io() is called
383 * - the other rqs still can't go ahead if not updating
384 * q->in_flight[BLK_RW_ASYNC] here, meantime these rqs
385 * are held in flush data queue and make no progress of
386 * handling post flush rq
387 * - only after the post flush rq is handled, all these rqs
388 * can be completed
389 */
390
391 elv_completed_request(q, rq);
392
393 /* for avoiding double accounting */
394 rq->rq_flags &= ~RQF_STARTED;
395
396 /*
397 * After populating an empty queue, kick it to avoid stall. Read
398 * the comment in flush_end_io().
399 */
400 if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
401 blk_run_queue_async(q);
402} 315}
403 316
404static void mq_flush_data_end_io(struct request *rq, blk_status_t error) 317static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
405{ 318{
406 struct request_queue *q = rq->q; 319 struct request_queue *q = rq->q;
407 struct blk_mq_hw_ctx *hctx; 320 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
408 struct blk_mq_ctx *ctx = rq->mq_ctx; 321 struct blk_mq_ctx *ctx = rq->mq_ctx;
409 unsigned long flags; 322 unsigned long flags;
410 struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); 323 struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
411 324
412 hctx = blk_mq_map_queue(q, ctx->cpu);
413
414 if (q->elevator) { 325 if (q->elevator) {
415 WARN_ON(rq->tag < 0); 326 WARN_ON(rq->tag < 0);
416 blk_mq_put_driver_tag_hctx(hctx, rq); 327 blk_mq_put_driver_tag_hctx(hctx, rq);
@@ -443,9 +354,6 @@ void blk_insert_flush(struct request *rq)
443 unsigned int policy = blk_flush_policy(fflags, rq); 354 unsigned int policy = blk_flush_policy(fflags, rq);
444 struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); 355 struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
445 356
446 if (!q->mq_ops)
447 lockdep_assert_held(q->queue_lock);
448
449 /* 357 /*
450 * @policy now records what operations need to be done. Adjust 358 * @policy now records what operations need to be done. Adjust
451 * REQ_PREFLUSH and FUA for the driver. 359 * REQ_PREFLUSH and FUA for the driver.
@@ -468,10 +376,7 @@ void blk_insert_flush(struct request *rq)
468 * complete the request. 376 * complete the request.
469 */ 377 */
470 if (!policy) { 378 if (!policy) {
471 if (q->mq_ops) 379 blk_mq_end_request(rq, 0);
472 blk_mq_end_request(rq, 0);
473 else
474 __blk_end_request(rq, 0, 0);
475 return; 380 return;
476 } 381 }
477 382
@@ -484,10 +389,7 @@ void blk_insert_flush(struct request *rq)
484 */ 389 */
485 if ((policy & REQ_FSEQ_DATA) && 390 if ((policy & REQ_FSEQ_DATA) &&
486 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 391 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
487 if (q->mq_ops) 392 blk_mq_request_bypass_insert(rq, false);
488 blk_mq_request_bypass_insert(rq, false);
489 else
490 list_add_tail(&rq->queuelist, &q->queue_head);
491 return; 393 return;
492 } 394 }
493 395
@@ -499,17 +401,12 @@ void blk_insert_flush(struct request *rq)
499 INIT_LIST_HEAD(&rq->flush.list); 401 INIT_LIST_HEAD(&rq->flush.list);
500 rq->rq_flags |= RQF_FLUSH_SEQ; 402 rq->rq_flags |= RQF_FLUSH_SEQ;
501 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 403 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
502 if (q->mq_ops) {
503 rq->end_io = mq_flush_data_end_io;
504 404
505 spin_lock_irq(&fq->mq_flush_lock); 405 rq->end_io = mq_flush_data_end_io;
506 blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
507 spin_unlock_irq(&fq->mq_flush_lock);
508 return;
509 }
510 rq->end_io = flush_data_end_io;
511 406
407 spin_lock_irq(&fq->mq_flush_lock);
512 blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); 408 blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
409 spin_unlock_irq(&fq->mq_flush_lock);
513} 410}
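With the legacy branches removed, blk_insert_flush() has three outcomes and all of them are blk-mq operations. A decision sketch that mirrors the code above (it adds no new logic):

        static void insert_flush_sketch(struct request *rq, unsigned int policy)
        {
                if (!policy) {          /* FLUSH/FUA not needed by the hardware */
                        blk_mq_end_request(rq, 0);
                        return;
                }
                if ((policy & REQ_FSEQ_DATA) &&
                    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
                        /* plain data, no sequencing: insert and forget */
                        blk_mq_request_bypass_insert(rq, false);
                        return;
                }
                /* otherwise drive PREFLUSH -> DATA -> POSTFLUSH through
                 * blk_flush_complete_seq() under fq->mq_flush_lock */
        }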
514 411
515/** 412/**
@@ -575,8 +472,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
575 if (!fq) 472 if (!fq)
576 goto fail; 473 goto fail;
577 474
578 if (q->mq_ops) 475 spin_lock_init(&fq->mq_flush_lock);
579 spin_lock_init(&fq->mq_flush_lock);
580 476
581 rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); 477 rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
582 fq->flush_rq = kzalloc_node(rq_sz, flags, node); 478 fq->flush_rq = kzalloc_node(rq_sz, flags, node);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 01580f88fcb3..5ed59ac6ae58 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -28,7 +28,6 @@ void get_io_context(struct io_context *ioc)
28 BUG_ON(atomic_long_read(&ioc->refcount) <= 0); 28 BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
29 atomic_long_inc(&ioc->refcount); 29 atomic_long_inc(&ioc->refcount);
30} 30}
31EXPORT_SYMBOL(get_io_context);
32 31
33static void icq_free_icq_rcu(struct rcu_head *head) 32static void icq_free_icq_rcu(struct rcu_head *head)
34{ 33{
@@ -48,10 +47,8 @@ static void ioc_exit_icq(struct io_cq *icq)
48 if (icq->flags & ICQ_EXITED) 47 if (icq->flags & ICQ_EXITED)
49 return; 48 return;
50 49
51 if (et->uses_mq && et->ops.mq.exit_icq) 50 if (et->ops.exit_icq)
52 et->ops.mq.exit_icq(icq); 51 et->ops.exit_icq(icq);
53 else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
54 et->ops.sq.elevator_exit_icq_fn(icq);
55 52
56 icq->flags |= ICQ_EXITED; 53 icq->flags |= ICQ_EXITED;
57} 54}
@@ -113,9 +110,9 @@ static void ioc_release_fn(struct work_struct *work)
113 struct io_cq, ioc_node); 110 struct io_cq, ioc_node);
114 struct request_queue *q = icq->q; 111 struct request_queue *q = icq->q;
115 112
116 if (spin_trylock(q->queue_lock)) { 113 if (spin_trylock(&q->queue_lock)) {
117 ioc_destroy_icq(icq); 114 ioc_destroy_icq(icq);
118 spin_unlock(q->queue_lock); 115 spin_unlock(&q->queue_lock);
119 } else { 116 } else {
120 spin_unlock_irqrestore(&ioc->lock, flags); 117 spin_unlock_irqrestore(&ioc->lock, flags);
121 cpu_relax(); 118 cpu_relax();
@@ -162,7 +159,6 @@ void put_io_context(struct io_context *ioc)
162 if (free_ioc) 159 if (free_ioc)
163 kmem_cache_free(iocontext_cachep, ioc); 160 kmem_cache_free(iocontext_cachep, ioc);
164} 161}
165EXPORT_SYMBOL(put_io_context);
166 162
167/** 163/**
168 * put_io_context_active - put active reference on ioc 164 * put_io_context_active - put active reference on ioc
@@ -173,7 +169,6 @@ EXPORT_SYMBOL(put_io_context);
173 */ 169 */
174void put_io_context_active(struct io_context *ioc) 170void put_io_context_active(struct io_context *ioc)
175{ 171{
176 struct elevator_type *et;
177 unsigned long flags; 172 unsigned long flags;
178 struct io_cq *icq; 173 struct io_cq *icq;
179 174
@@ -187,25 +182,12 @@ void put_io_context_active(struct io_context *ioc)
187 * reverse double locking. Read comment in ioc_release_fn() for 182 * reverse double locking. Read comment in ioc_release_fn() for
188 * explanation on the nested locking annotation. 183 * explanation on the nested locking annotation.
189 */ 184 */
190retry:
191 spin_lock_irqsave_nested(&ioc->lock, flags, 1); 185 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
192 hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { 186 hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
193 if (icq->flags & ICQ_EXITED) 187 if (icq->flags & ICQ_EXITED)
194 continue; 188 continue;
195 189
196 et = icq->q->elevator->type; 190 ioc_exit_icq(icq);
197 if (et->uses_mq) {
198 ioc_exit_icq(icq);
199 } else {
200 if (spin_trylock(icq->q->queue_lock)) {
201 ioc_exit_icq(icq);
202 spin_unlock(icq->q->queue_lock);
203 } else {
204 spin_unlock_irqrestore(&ioc->lock, flags);
205 cpu_relax();
206 goto retry;
207 }
208 }
209 } 191 }
210 spin_unlock_irqrestore(&ioc->lock, flags); 192 spin_unlock_irqrestore(&ioc->lock, flags);
211 193
@@ -232,7 +214,7 @@ static void __ioc_clear_queue(struct list_head *icq_list)
232 214
233 while (!list_empty(icq_list)) { 215 while (!list_empty(icq_list)) {
234 struct io_cq *icq = list_entry(icq_list->next, 216 struct io_cq *icq = list_entry(icq_list->next,
235 struct io_cq, q_node); 217 struct io_cq, q_node);
236 struct io_context *ioc = icq->ioc; 218 struct io_context *ioc = icq->ioc;
237 219
238 spin_lock_irqsave(&ioc->lock, flags); 220 spin_lock_irqsave(&ioc->lock, flags);
@@ -251,16 +233,11 @@ void ioc_clear_queue(struct request_queue *q)
251{ 233{
252 LIST_HEAD(icq_list); 234 LIST_HEAD(icq_list);
253 235
254 spin_lock_irq(q->queue_lock); 236 spin_lock_irq(&q->queue_lock);
255 list_splice_init(&q->icq_list, &icq_list); 237 list_splice_init(&q->icq_list, &icq_list);
238 spin_unlock_irq(&q->queue_lock);
256 239
257 if (q->mq_ops) { 240 __ioc_clear_queue(&icq_list);
258 spin_unlock_irq(q->queue_lock);
259 __ioc_clear_queue(&icq_list);
260 } else {
261 __ioc_clear_queue(&icq_list);
262 spin_unlock_irq(q->queue_lock);
263 }
264} 241}
265 242
266int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) 243int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
@@ -336,7 +313,6 @@ struct io_context *get_task_io_context(struct task_struct *task,
336 313
337 return NULL; 314 return NULL;
338} 315}
339EXPORT_SYMBOL(get_task_io_context);
340 316
341/** 317/**
342 * ioc_lookup_icq - lookup io_cq from ioc 318 * ioc_lookup_icq - lookup io_cq from ioc
@@ -350,7 +326,7 @@ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
350{ 326{
351 struct io_cq *icq; 327 struct io_cq *icq;
352 328
353 lockdep_assert_held(q->queue_lock); 329 lockdep_assert_held(&q->queue_lock);
354 330
355 /* 331 /*
356 * icq's are indexed from @ioc using radix tree and hint pointer, 332 * icq's are indexed from @ioc using radix tree and hint pointer,
@@ -409,16 +385,14 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
409 INIT_HLIST_NODE(&icq->ioc_node); 385 INIT_HLIST_NODE(&icq->ioc_node);
410 386
411 /* lock both q and ioc and try to link @icq */ 387 /* lock both q and ioc and try to link @icq */
412 spin_lock_irq(q->queue_lock); 388 spin_lock_irq(&q->queue_lock);
413 spin_lock(&ioc->lock); 389 spin_lock(&ioc->lock);
414 390
415 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { 391 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
416 hlist_add_head(&icq->ioc_node, &ioc->icq_list); 392 hlist_add_head(&icq->ioc_node, &ioc->icq_list);
417 list_add(&icq->q_node, &q->icq_list); 393 list_add(&icq->q_node, &q->icq_list);
418 if (et->uses_mq && et->ops.mq.init_icq) 394 if (et->ops.init_icq)
419 et->ops.mq.init_icq(icq); 395 et->ops.init_icq(icq);
420 else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
421 et->ops.sq.elevator_init_icq_fn(icq);
422 } else { 396 } else {
423 kmem_cache_free(et->icq_cache, icq); 397 kmem_cache_free(et->icq_cache, icq);
424 icq = ioc_lookup_icq(ioc, q); 398 icq = ioc_lookup_icq(ioc, q);
@@ -427,7 +401,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
427 } 401 }
428 402
429 spin_unlock(&ioc->lock); 403 spin_unlock(&ioc->lock);
430 spin_unlock_irq(q->queue_lock); 404 spin_unlock_irq(&q->queue_lock);
431 radix_tree_preload_end(); 405 radix_tree_preload_end();
432 return icq; 406 return icq;
433} 407}
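The uses_mq/ops.sq/ops.mq split disappears from the icq paths: elevator_type carries a single ops table and only blk-mq schedulers remain (BFQ is the in-tree user of init_icq/exit_icq). A registration sketch with hypothetical names:

        static struct elevator_type my_iosched_sketch = {
                .ops = {
                        .init_icq       = my_init_icq,  /* hypothetical hooks */
                        .exit_icq       = my_exit_icq,
                },
                .icq_size       = sizeof(struct my_icq),        /* hypothetical */
                .icq_align      = __alignof__(struct my_icq),
                .elevator_name  = "my-sched",
                .elevator_owner = THIS_MODULE,
        };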
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 38c35c32aff2..fc714ef402a6 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -262,29 +262,25 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
262 stat->rqs.mean); 262 stat->rqs.mean);
263} 263}
264 264
265static inline bool iolatency_may_queue(struct iolatency_grp *iolat, 265static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
266 wait_queue_entry_t *wait,
267 bool first_block)
268{ 266{
269 struct rq_wait *rqw = &iolat->rq_wait; 267 atomic_dec(&rqw->inflight);
268 wake_up(&rqw->wait);
269}
270 270
271 if (first_block && waitqueue_active(&rqw->wait) && 271static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
272 rqw->wait.head.next != &wait->entry) 272{
273 return false; 273 struct iolatency_grp *iolat = private_data;
274 return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); 274 return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
275} 275}
276 276
277static void __blkcg_iolatency_throttle(struct rq_qos *rqos, 277static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
278 struct iolatency_grp *iolat, 278 struct iolatency_grp *iolat,
279 spinlock_t *lock, bool issue_as_root, 279 bool issue_as_root,
280 bool use_memdelay) 280 bool use_memdelay)
281 __releases(lock)
282 __acquires(lock)
283{ 281{
284 struct rq_wait *rqw = &iolat->rq_wait; 282 struct rq_wait *rqw = &iolat->rq_wait;
285 unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); 283 unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
286 DEFINE_WAIT(wait);
287 bool first_block = true;
288 284
289 if (use_delay) 285 if (use_delay)
290 blkcg_schedule_throttle(rqos->q, use_memdelay); 286 blkcg_schedule_throttle(rqos->q, use_memdelay);
@@ -301,27 +297,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
301 return; 297 return;
302 } 298 }
303 299
304 if (iolatency_may_queue(iolat, &wait, first_block)) 300 rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
305 return;
306
307 do {
308 prepare_to_wait_exclusive(&rqw->wait, &wait,
309 TASK_UNINTERRUPTIBLE);
310
311 if (iolatency_may_queue(iolat, &wait, first_block))
312 break;
313 first_block = false;
314
315 if (lock) {
316 spin_unlock_irq(lock);
317 io_schedule();
318 spin_lock_irq(lock);
319 } else {
320 io_schedule();
321 }
322 } while (1);
323
324 finish_wait(&rqw->wait, &wait);
325} 301}
326 302
327#define SCALE_DOWN_FACTOR 2 303#define SCALE_DOWN_FACTOR 2
@@ -478,38 +454,15 @@ static void check_scale_change(struct iolatency_grp *iolat)
478 scale_change(iolat, direction > 0); 454 scale_change(iolat, direction > 0);
479} 455}
480 456
481static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, 457static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
482 spinlock_t *lock)
483{ 458{
484 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); 459 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
485 struct blkcg *blkcg; 460 struct blkcg_gq *blkg = bio->bi_blkg;
486 struct blkcg_gq *blkg;
487 struct request_queue *q = rqos->q;
488 bool issue_as_root = bio_issue_as_root_blkg(bio); 461 bool issue_as_root = bio_issue_as_root_blkg(bio);
489 462
490 if (!blk_iolatency_enabled(blkiolat)) 463 if (!blk_iolatency_enabled(blkiolat))
491 return; 464 return;
492 465
493 rcu_read_lock();
494 blkcg = bio_blkcg(bio);
495 bio_associate_blkcg(bio, &blkcg->css);
496 blkg = blkg_lookup(blkcg, q);
497 if (unlikely(!blkg)) {
498 if (!lock)
499 spin_lock_irq(q->queue_lock);
500 blkg = blkg_lookup_create(blkcg, q);
501 if (IS_ERR(blkg))
502 blkg = NULL;
503 if (!lock)
504 spin_unlock_irq(q->queue_lock);
505 }
506 if (!blkg)
507 goto out;
508
509 bio_issue_init(&bio->bi_issue, bio_sectors(bio));
510 bio_associate_blkg(bio, blkg);
511out:
512 rcu_read_unlock();
513 while (blkg && blkg->parent) { 466 while (blkg && blkg->parent) {
514 struct iolatency_grp *iolat = blkg_to_lat(blkg); 467 struct iolatency_grp *iolat = blkg_to_lat(blkg);
515 if (!iolat) { 468 if (!iolat) {
@@ -518,7 +471,7 @@ out:
518 } 471 }
519 472
520 check_scale_change(iolat); 473 check_scale_change(iolat);
521 __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, 474 __blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
522 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); 475 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
523 blkg = blkg->parent; 476 blkg = blkg->parent;
524 } 477 }
@@ -640,7 +593,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
640 bool enabled = false; 593 bool enabled = false;
641 594
642 blkg = bio->bi_blkg; 595 blkg = bio->bi_blkg;
643 if (!blkg) 596 if (!blkg || !bio_flagged(bio, BIO_TRACKED))
644 return; 597 return;
645 598
646 iolat = blkg_to_lat(bio->bi_blkg); 599 iolat = blkg_to_lat(bio->bi_blkg);
@@ -730,7 +683,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
730 * We could be exiting, don't access the pd unless we have a 683 * We could be exiting, don't access the pd unless we have a
731 * ref on the blkg. 684 * ref on the blkg.
732 */ 685 */
733 if (!blkg_try_get(blkg)) 686 if (!blkg_tryget(blkg))
734 continue; 687 continue;
735 688
736 iolat = blkg_to_lat(blkg); 689 iolat = blkg_to_lat(blkg);
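
Editor's note: the blk-iolatency hunks above drop the open-coded prepare_to_wait loop (and the conditional spinlock juggling) in favour of rq_qos_wait() plus two callbacks: one that tries to take an inflight slot and one that gives it back. A small userspace sketch of that callback shape follows; all names (struct waiter, throttle, acquire_below_max) are illustrative, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

struct waiter {
	int inflight;
	int max_depth;
};

typedef bool (*acquire_fn)(struct waiter *w, void *data);
typedef void (*cleanup_fn)(struct waiter *w, void *data);

static bool acquire_below_max(struct waiter *w, void *data)
{
	if (w->inflight < w->max_depth) {
		w->inflight++;
		return true;
	}
	return false;
}

static void release_one(struct waiter *w, void *data)
{
	if (w->inflight > 0)
		w->inflight--;
}

/*
 * Single-threaded stand-in for the sleep/wake loop: where the kernel
 * would sleep on the waitqueue, the sketch simply "completes" one
 * request so the next acquire attempt can succeed.
 */
static void throttle(struct waiter *w, void *data,
		     acquire_fn acquire, cleanup_fn cleanup)
{
	while (!acquire(w, data))
		cleanup(w, data);
}

int main(void)
{
	struct waiter w = { .inflight = 0, .max_depth = 2 };

	for (int i = 0; i < 5; i++) {
		throttle(&w, NULL, acquire_below_max, release_one);
		printf("request %d admitted, inflight=%d\n", i, w.inflight);
	}
	return 0;
}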
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 76f867ea9a9b..5f2c429d4378 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -51,16 +51,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
51 if ((sector | nr_sects) & bs_mask) 51 if ((sector | nr_sects) & bs_mask)
52 return -EINVAL; 52 return -EINVAL;
53 53
54 while (nr_sects) { 54 if (!nr_sects)
55 unsigned int req_sects = nr_sects; 55 return -EINVAL;
56 sector_t end_sect;
57 56
58 if (!req_sects) 57 while (nr_sects) {
59 goto fail; 58 sector_t req_sects = min_t(sector_t, nr_sects,
60 if (req_sects > UINT_MAX >> 9) 59 bio_allowed_max_sectors(q));
61 req_sects = UINT_MAX >> 9;
62 60
63 end_sect = sector + req_sects; 61 WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
64 62
65 bio = blk_next_bio(bio, 0, gfp_mask); 63 bio = blk_next_bio(bio, 0, gfp_mask);
66 bio->bi_iter.bi_sector = sector; 64 bio->bi_iter.bi_sector = sector;
@@ -68,8 +66,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
68 bio_set_op_attrs(bio, op, 0); 66 bio_set_op_attrs(bio, op, 0);
69 67
70 bio->bi_iter.bi_size = req_sects << 9; 68 bio->bi_iter.bi_size = req_sects << 9;
69 sector += req_sects;
71 nr_sects -= req_sects; 70 nr_sects -= req_sects;
72 sector = end_sect;
73 71
74 /* 72 /*
75 * We can loop for a long time in here, if someone does 73 * We can loop for a long time in here, if someone does
@@ -82,14 +80,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
82 80
83 *biop = bio; 81 *biop = bio;
84 return 0; 82 return 0;
85
86fail:
87 if (bio) {
88 submit_bio_wait(bio);
89 bio_put(bio);
90 }
91 *biop = NULL;
92 return -EOPNOTSUPP;
93} 83}
94EXPORT_SYMBOL(__blkdev_issue_discard); 84EXPORT_SYMBOL(__blkdev_issue_discard);
95 85
@@ -161,7 +151,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
161 return -EOPNOTSUPP; 151 return -EOPNOTSUPP;
162 152
163 /* Ensure that max_write_same_sectors doesn't overflow bi_size */ 153 /* Ensure that max_write_same_sectors doesn't overflow bi_size */
164 max_write_same_sectors = UINT_MAX >> 9; 154 max_write_same_sectors = bio_allowed_max_sectors(q);
165 155
166 while (nr_sects) { 156 while (nr_sects) {
167 bio = blk_next_bio(bio, 1, gfp_mask); 157 bio = blk_next_bio(bio, 1, gfp_mask);
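
Editor's note: the __blkdev_issue_discard() rewrite above replaces the end_sect bookkeeping and the fail path with an up-front -EINVAL check and a chunking loop capped by bio_allowed_max_sectors(). The sketch below mirrors that loop in plain C; MAX_SECTORS is an illustrative stand-in for the queue limit.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for bio_allowed_max_sectors(q). */
#define MAX_SECTORS ((uint64_t)(UINT32_MAX >> 9))

static void issue_chunk(uint64_t sector, uint64_t nr)
{
	printf("chunk: sector=%llu nr_sects=%llu\n",
	       (unsigned long long)sector, (unsigned long long)nr);
}

static int issue_discard(uint64_t sector, uint64_t nr_sects)
{
	if (!nr_sects)
		return -1;		/* mirrors the new early -EINVAL */

	while (nr_sects) {
		uint64_t req = nr_sects < MAX_SECTORS ? nr_sects : MAX_SECTORS;

		issue_chunk(sector, req);
		sector += req;		/* advance instead of tracking end_sect */
		nr_sects -= req;
	}
	return 0;
}

int main(void)
{
	return issue_discard(0, 3 * MAX_SECTORS + 123) ? 1 : 0;
}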
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4478d53cc6ee..71e9ac03f621 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -46,7 +46,7 @@ static inline bool bio_will_gap(struct request_queue *q,
46 bio_get_first_bvec(prev_rq->bio, &pb); 46 bio_get_first_bvec(prev_rq->bio, &pb);
47 else 47 else
48 bio_get_first_bvec(prev, &pb); 48 bio_get_first_bvec(prev, &pb);
49 if (pb.bv_offset) 49 if (pb.bv_offset & queue_virt_boundary(q))
50 return true; 50 return true;
51 51
52 /* 52 /*
@@ -90,7 +90,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
90 /* Zero-sector (unknown) and one-sector granularities are the same. */ 90 /* Zero-sector (unknown) and one-sector granularities are the same. */
91 granularity = max(q->limits.discard_granularity >> 9, 1U); 91 granularity = max(q->limits.discard_granularity >> 9, 1U);
92 92
93 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); 93 max_discard_sectors = min(q->limits.max_discard_sectors,
94 bio_allowed_max_sectors(q));
94 max_discard_sectors -= max_discard_sectors % granularity; 95 max_discard_sectors -= max_discard_sectors % granularity;
95 96
96 if (unlikely(!max_discard_sectors)) { 97 if (unlikely(!max_discard_sectors)) {
@@ -387,7 +388,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
387 388
388 bio_set_flag(bio, BIO_SEG_VALID); 389 bio_set_flag(bio, BIO_SEG_VALID);
389} 390}
390EXPORT_SYMBOL(blk_recount_segments);
391 391
392static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 392static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
393 struct bio *nxt) 393 struct bio *nxt)
@@ -591,17 +591,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
591 return ll_new_hw_segment(q, req, bio); 591 return ll_new_hw_segment(q, req, bio);
592} 592}
593 593
594/*
595 * blk-mq uses req->special to carry normal driver per-request payload, it
596 * does not indicate a prepared command that we cannot merge with.
597 */
598static bool req_no_special_merge(struct request *req)
599{
600 struct request_queue *q = req->q;
601
602 return !q->mq_ops && req->special;
603}
604
605static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, 594static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
606 struct request *next) 595 struct request *next)
607{ 596{
@@ -627,13 +616,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
627 unsigned int seg_size = 616 unsigned int seg_size =
628 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; 617 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
629 618
630 /*
631 * First check if the either of the requests are re-queued
632 * requests. Can't merge them if they are.
633 */
634 if (req_no_special_merge(req) || req_no_special_merge(next))
635 return 0;
636
637 if (req_gap_back_merge(req, next->bio)) 619 if (req_gap_back_merge(req, next->bio))
638 return 0; 620 return 0;
639 621
@@ -698,12 +680,10 @@ static void blk_account_io_merge(struct request *req)
698{ 680{
699 if (blk_do_io_stat(req)) { 681 if (blk_do_io_stat(req)) {
700 struct hd_struct *part; 682 struct hd_struct *part;
701 int cpu;
702 683
703 cpu = part_stat_lock(); 684 part_stat_lock();
704 part = req->part; 685 part = req->part;
705 686
706 part_round_stats(req->q, cpu, part);
707 part_dec_in_flight(req->q, part, rq_data_dir(req)); 687 part_dec_in_flight(req->q, part, rq_data_dir(req));
708 688
709 hd_struct_put(part); 689 hd_struct_put(part);
@@ -726,7 +706,8 @@ static inline bool blk_discard_mergable(struct request *req)
726 return false; 706 return false;
727} 707}
728 708
729enum elv_merge blk_try_req_merge(struct request *req, struct request *next) 709static enum elv_merge blk_try_req_merge(struct request *req,
710 struct request *next)
730{ 711{
731 if (blk_discard_mergable(req)) 712 if (blk_discard_mergable(req))
732 return ELEVATOR_DISCARD_MERGE; 713 return ELEVATOR_DISCARD_MERGE;
@@ -743,9 +724,6 @@ enum elv_merge blk_try_req_merge(struct request *req, struct request *next)
743static struct request *attempt_merge(struct request_queue *q, 724static struct request *attempt_merge(struct request_queue *q,
744 struct request *req, struct request *next) 725 struct request *req, struct request *next)
745{ 726{
746 if (!q->mq_ops)
747 lockdep_assert_held(q->queue_lock);
748
749 if (!rq_mergeable(req) || !rq_mergeable(next)) 727 if (!rq_mergeable(req) || !rq_mergeable(next))
750 return NULL; 728 return NULL;
751 729
@@ -753,8 +731,7 @@ static struct request *attempt_merge(struct request_queue *q,
753 return NULL; 731 return NULL;
754 732
755 if (rq_data_dir(req) != rq_data_dir(next) 733 if (rq_data_dir(req) != rq_data_dir(next)
756 || req->rq_disk != next->rq_disk 734 || req->rq_disk != next->rq_disk)
757 || req_no_special_merge(next))
758 return NULL; 735 return NULL;
759 736
760 if (req_op(req) == REQ_OP_WRITE_SAME && 737 if (req_op(req) == REQ_OP_WRITE_SAME &&
@@ -768,6 +745,9 @@ static struct request *attempt_merge(struct request_queue *q,
768 if (req->write_hint != next->write_hint) 745 if (req->write_hint != next->write_hint)
769 return NULL; 746 return NULL;
770 747
748 if (req->ioprio != next->ioprio)
749 return NULL;
750
771 /* 751 /*
772 * If we are allowed to merge, then append bio list 752 * If we are allowed to merge, then append bio list
773 * from next to rq and release next. merge_requests_fn 753 * from next to rq and release next. merge_requests_fn
@@ -815,7 +795,7 @@ static struct request *attempt_merge(struct request_queue *q,
815 795
816 req->__data_len += blk_rq_bytes(next); 796 req->__data_len += blk_rq_bytes(next);
817 797
818 if (req_op(req) != REQ_OP_DISCARD) 798 if (!blk_discard_mergable(req))
819 elv_merge_requests(q, req, next); 799 elv_merge_requests(q, req, next);
820 800
821 /* 801 /*
@@ -823,10 +803,6 @@ static struct request *attempt_merge(struct request_queue *q,
823 */ 803 */
824 blk_account_io_merge(next); 804 blk_account_io_merge(next);
825 805
826 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
827 if (blk_rq_cpu_valid(next))
828 req->cpu = next->cpu;
829
830 /* 806 /*
831 * ownership of bio passed from next to req, return 'next' for 807 * ownership of bio passed from next to req, return 'next' for
832 * the caller to free 808 * the caller to free
@@ -858,16 +834,11 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq)
858int blk_attempt_req_merge(struct request_queue *q, struct request *rq, 834int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
859 struct request *next) 835 struct request *next)
860{ 836{
861 struct elevator_queue *e = q->elevator;
862 struct request *free; 837 struct request *free;
863 838
864 if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
865 if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
866 return 0;
867
868 free = attempt_merge(q, rq, next); 839 free = attempt_merge(q, rq, next);
869 if (free) { 840 if (free) {
870 __blk_put_request(q, free); 841 blk_put_request(free);
871 return 1; 842 return 1;
872 } 843 }
873 844
@@ -886,8 +857,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
886 if (bio_data_dir(bio) != rq_data_dir(rq)) 857 if (bio_data_dir(bio) != rq_data_dir(rq))
887 return false; 858 return false;
888 859
889 /* must be same device and not a special request */ 860 /* must be same device */
890 if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) 861 if (rq->rq_disk != bio->bi_disk)
891 return false; 862 return false;
892 863
893 /* only merge integrity protected bio into ditto rq */ 864 /* only merge integrity protected bio into ditto rq */
@@ -906,6 +877,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
906 if (rq->write_hint != bio->bi_write_hint) 877 if (rq->write_hint != bio->bi_write_hint)
907 return false; 878 return false;
908 879
880 if (rq->ioprio != bio_prio(bio))
881 return false;
882
909 return true; 883 return true;
910} 884}
911 885
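
Editor's note: two of the merge-path hunks above add the same rule in both directions: a bio only merges into a request, and two requests only merge with each other, when their I/O priorities match (replacing the old ioprio_best() blending at merge time). A compact sketch of such a predicate, with illustrative struct fields rather than the real request/bio layout:

#include <stdbool.h>
#include <stdio.h>

struct fake_req {
	int dir;		/* 0 = read, 1 = write */
	int disk_id;
	int write_hint;
	int ioprio;
};

static bool merge_ok(const struct fake_req *rq, const struct fake_req *bio)
{
	if (rq->dir != bio->dir)
		return false;
	if (rq->disk_id != bio->disk_id)	/* must be same device */
		return false;
	if (rq->write_hint != bio->write_hint)
		return false;
	if (rq->ioprio != bio->ioprio)		/* new: priorities must match */
		return false;
	return true;
}

int main(void)
{
	struct fake_req rq  = { 1, 0, 0, 4 };
	struct fake_req bio = { 1, 0, 0, 0 };

	printf("merge allowed: %s\n", merge_ok(&rq, &bio) ? "yes" : "no");
	return 0;
}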
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 3eb169f15842..03a534820271 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,9 +14,10 @@
14#include "blk.h" 14#include "blk.h"
15#include "blk-mq.h" 15#include "blk-mq.h"
16 16
17static int cpu_to_queue_index(unsigned int nr_queues, const int cpu) 17static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
18 unsigned int nr_queues, const int cpu)
18{ 19{
19 return cpu % nr_queues; 20 return qmap->queue_offset + (cpu % nr_queues);
20} 21}
21 22
22static int get_first_sibling(unsigned int cpu) 23static int get_first_sibling(unsigned int cpu)
@@ -30,10 +31,10 @@ static int get_first_sibling(unsigned int cpu)
30 return cpu; 31 return cpu;
31} 32}
32 33
33int blk_mq_map_queues(struct blk_mq_tag_set *set) 34int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
34{ 35{
35 unsigned int *map = set->mq_map; 36 unsigned int *map = qmap->mq_map;
36 unsigned int nr_queues = set->nr_hw_queues; 37 unsigned int nr_queues = qmap->nr_queues;
37 unsigned int cpu, first_sibling; 38 unsigned int cpu, first_sibling;
38 39
39 for_each_possible_cpu(cpu) { 40 for_each_possible_cpu(cpu) {
@@ -44,11 +45,11 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
44 * performace optimizations. 45 * performace optimizations.
45 */ 46 */
45 */ 46 */
46 if (cpu < nr_queues) { 47 if (cpu < nr_queues) {
47 map[cpu] = cpu_to_queue_index(nr_queues, cpu); 48 map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
48 } else { 49 } else {
49 first_sibling = get_first_sibling(cpu); 50 first_sibling = get_first_sibling(cpu);
50 if (first_sibling == cpu) 51 if (first_sibling == cpu)
51 map[cpu] = cpu_to_queue_index(nr_queues, cpu); 52 map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
52 else 53 else
53 map[cpu] = map[first_sibling]; 54 map[cpu] = map[first_sibling];
54 } 55 }
@@ -62,12 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_map_queues);
62 * We have no quick way of doing reverse lookups. This is only used at 63 * We have no quick way of doing reverse lookups. This is only used at
63 * queue init time, so runtime isn't important. 64 * queue init time, so runtime isn't important.
64 */ 65 */
65int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) 66int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
66{ 67{
67 int i; 68 int i;
68 69
69 for_each_possible_cpu(i) { 70 for_each_possible_cpu(i) {
70 if (index == mq_map[i]) 71 if (index == qmap->mq_map[i])
71 return local_memory_node(cpu_to_node(i)); 72 return local_memory_node(cpu_to_node(i));
72 } 73 }
73 74
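
Editor's note: blk_mq_map_queues() now operates on a struct blk_mq_queue_map instead of the whole tag set, and every mapping is biased by qmap->queue_offset so that several maps (default, read, poll) can share one hardware-queue index space. A userspace sketch of that offset mapping follows; NR_CPUS and the omitted sibling handling are simplifications for the example.

#include <stdio.h>

#define NR_CPUS 8			/* illustrative; not the kernel macro */

struct queue_map {			/* stands in for blk_mq_queue_map */
	unsigned int nr_queues;
	unsigned int queue_offset;
	unsigned int mq_map[NR_CPUS];
};

static unsigned int cpu_to_queue_index(const struct queue_map *qmap,
				       unsigned int cpu)
{
	return qmap->queue_offset + (cpu % qmap->nr_queues);
}

static void map_queues(struct queue_map *qmap)
{
	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
		qmap->mq_map[cpu] = cpu_to_queue_index(qmap, cpu);
}

int main(void)
{
	/* e.g. a "read" map of 3 queues placed after 4 default queues */
	struct queue_map qmap = { .nr_queues = 3, .queue_offset = 4 };

	map_queues(&qmap);
	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%u -> hwq %u\n", cpu, qmap.mq_map[cpu]);
	return 0;
}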
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 10b284a1f18d..90d68760af08 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -23,6 +23,7 @@
23#include "blk-mq.h" 23#include "blk-mq.h"
24#include "blk-mq-debugfs.h" 24#include "blk-mq-debugfs.h"
25#include "blk-mq-tag.h" 25#include "blk-mq-tag.h"
26#include "blk-rq-qos.h"
26 27
27static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) 28static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
28{ 29{
@@ -112,10 +113,8 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
112 113
113#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name 114#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
114static const char *const blk_queue_flag_name[] = { 115static const char *const blk_queue_flag_name[] = {
115 QUEUE_FLAG_NAME(QUEUED),
116 QUEUE_FLAG_NAME(STOPPED), 116 QUEUE_FLAG_NAME(STOPPED),
117 QUEUE_FLAG_NAME(DYING), 117 QUEUE_FLAG_NAME(DYING),
118 QUEUE_FLAG_NAME(BYPASS),
119 QUEUE_FLAG_NAME(BIDI), 118 QUEUE_FLAG_NAME(BIDI),
120 QUEUE_FLAG_NAME(NOMERGES), 119 QUEUE_FLAG_NAME(NOMERGES),
121 QUEUE_FLAG_NAME(SAME_COMP), 120 QUEUE_FLAG_NAME(SAME_COMP),
@@ -318,7 +317,6 @@ static const char *const cmd_flag_name[] = {
318static const char *const rqf_name[] = { 317static const char *const rqf_name[] = {
319 RQF_NAME(SORTED), 318 RQF_NAME(SORTED),
320 RQF_NAME(STARTED), 319 RQF_NAME(STARTED),
321 RQF_NAME(QUEUED),
322 RQF_NAME(SOFTBARRIER), 320 RQF_NAME(SOFTBARRIER),
323 RQF_NAME(FLUSH_SEQ), 321 RQF_NAME(FLUSH_SEQ),
324 RQF_NAME(MIXED_MERGE), 322 RQF_NAME(MIXED_MERGE),
@@ -424,15 +422,18 @@ struct show_busy_params {
424 422
425/* 423/*
426 * Note: the state of a request may change while this function is in progress, 424 * Note: the state of a request may change while this function is in progress,
427 * e.g. due to a concurrent blk_mq_finish_request() call. 425 * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to
426 * keep iterating requests.
428 */ 427 */
429static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) 428static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
430{ 429{
431 const struct show_busy_params *params = data; 430 const struct show_busy_params *params = data;
432 431
433 if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx) 432 if (rq->mq_hctx == params->hctx)
434 __blk_mq_debugfs_rq_show(params->m, 433 __blk_mq_debugfs_rq_show(params->m,
435 list_entry_rq(&rq->queuelist)); 434 list_entry_rq(&rq->queuelist));
435
436 return true;
436} 437}
437 438
438static int hctx_busy_show(void *data, struct seq_file *m) 439static int hctx_busy_show(void *data, struct seq_file *m)
@@ -446,6 +447,21 @@ static int hctx_busy_show(void *data, struct seq_file *m)
446 return 0; 447 return 0;
447} 448}
448 449
450static const char *const hctx_types[] = {
451 [HCTX_TYPE_DEFAULT] = "default",
452 [HCTX_TYPE_READ] = "read",
453 [HCTX_TYPE_POLL] = "poll",
454};
455
456static int hctx_type_show(void *data, struct seq_file *m)
457{
458 struct blk_mq_hw_ctx *hctx = data;
459
460 BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES);
461 seq_printf(m, "%s\n", hctx_types[hctx->type]);
462 return 0;
463}
464
449static int hctx_ctx_map_show(void *data, struct seq_file *m) 465static int hctx_ctx_map_show(void *data, struct seq_file *m)
450{ 466{
451 struct blk_mq_hw_ctx *hctx = data; 467 struct blk_mq_hw_ctx *hctx = data;
@@ -636,36 +652,43 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
636 return 0; 652 return 0;
637} 653}
638 654
639static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) 655#define CTX_RQ_SEQ_OPS(name, type) \
640 __acquires(&ctx->lock) 656static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
641{ 657 __acquires(&ctx->lock) \
642 struct blk_mq_ctx *ctx = m->private; 658{ \
643 659 struct blk_mq_ctx *ctx = m->private; \
644 spin_lock(&ctx->lock); 660 \
645 return seq_list_start(&ctx->rq_list, *pos); 661 spin_lock(&ctx->lock); \
646} 662 return seq_list_start(&ctx->rq_lists[type], *pos); \
647 663} \
648static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) 664 \
649{ 665static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \
650 struct blk_mq_ctx *ctx = m->private; 666 loff_t *pos) \
651 667{ \
652 return seq_list_next(v, &ctx->rq_list, pos); 668 struct blk_mq_ctx *ctx = m->private; \
669 \
670 return seq_list_next(v, &ctx->rq_lists[type], pos); \
671} \
672 \
673static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \
674 __releases(&ctx->lock) \
675{ \
676 struct blk_mq_ctx *ctx = m->private; \
677 \
678 spin_unlock(&ctx->lock); \
679} \
680 \
681static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \
682 .start = ctx_##name##_rq_list_start, \
683 .next = ctx_##name##_rq_list_next, \
684 .stop = ctx_##name##_rq_list_stop, \
685 .show = blk_mq_debugfs_rq_show, \
653} 686}
654 687
655static void ctx_rq_list_stop(struct seq_file *m, void *v) 688CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
656 __releases(&ctx->lock) 689CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
657{ 690CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
658 struct blk_mq_ctx *ctx = m->private;
659
660 spin_unlock(&ctx->lock);
661}
662 691
663static const struct seq_operations ctx_rq_list_seq_ops = {
664 .start = ctx_rq_list_start,
665 .next = ctx_rq_list_next,
666 .stop = ctx_rq_list_stop,
667 .show = blk_mq_debugfs_rq_show,
668};
669static int ctx_dispatched_show(void *data, struct seq_file *m) 692static int ctx_dispatched_show(void *data, struct seq_file *m)
670{ 693{
671 struct blk_mq_ctx *ctx = data; 694 struct blk_mq_ctx *ctx = data;
@@ -798,11 +821,14 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
798 {"run", 0600, hctx_run_show, hctx_run_write}, 821 {"run", 0600, hctx_run_show, hctx_run_write},
799 {"active", 0400, hctx_active_show}, 822 {"active", 0400, hctx_active_show},
800 {"dispatch_busy", 0400, hctx_dispatch_busy_show}, 823 {"dispatch_busy", 0400, hctx_dispatch_busy_show},
824 {"type", 0400, hctx_type_show},
801 {}, 825 {},
802}; 826};
803 827
804static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { 828static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
805 {"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops}, 829 {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
830 {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
831 {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
806 {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, 832 {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
807 {"merged", 0600, ctx_merged_show, ctx_merged_write}, 833 {"merged", 0600, ctx_merged_show, ctx_merged_write},
808 {"completed", 0600, ctx_completed_show, ctx_completed_write}, 834 {"completed", 0600, ctx_completed_show, ctx_completed_write},
@@ -856,6 +882,15 @@ int blk_mq_debugfs_register(struct request_queue *q)
856 goto err; 882 goto err;
857 } 883 }
858 884
885 if (q->rq_qos) {
886 struct rq_qos *rqos = q->rq_qos;
887
888 while (rqos) {
889 blk_mq_debugfs_register_rqos(rqos);
890 rqos = rqos->next;
891 }
892 }
893
859 return 0; 894 return 0;
860 895
861err: 896err:
@@ -978,6 +1013,50 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
978 q->sched_debugfs_dir = NULL; 1013 q->sched_debugfs_dir = NULL;
979} 1014}
980 1015
1016void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
1017{
1018 debugfs_remove_recursive(rqos->debugfs_dir);
1019 rqos->debugfs_dir = NULL;
1020}
1021
1022int blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
1023{
1024 struct request_queue *q = rqos->q;
1025 const char *dir_name = rq_qos_id_to_name(rqos->id);
1026
1027 if (!q->debugfs_dir)
1028 return -ENOENT;
1029
1030 if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
1031 return 0;
1032
1033 if (!q->rqos_debugfs_dir) {
1034 q->rqos_debugfs_dir = debugfs_create_dir("rqos",
1035 q->debugfs_dir);
1036 if (!q->rqos_debugfs_dir)
1037 return -ENOMEM;
1038 }
1039
1040 rqos->debugfs_dir = debugfs_create_dir(dir_name,
1041 rqos->q->rqos_debugfs_dir);
1042 if (!rqos->debugfs_dir)
1043 return -ENOMEM;
1044
1045 if (!debugfs_create_files(rqos->debugfs_dir, rqos,
1046 rqos->ops->debugfs_attrs))
1047 goto err;
1048 return 0;
1049 err:
1050 blk_mq_debugfs_unregister_rqos(rqos);
1051 return -ENOMEM;
1052}
1053
1054void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
1055{
1056 debugfs_remove_recursive(q->rqos_debugfs_dir);
1057 q->rqos_debugfs_dir = NULL;
1058}
1059
981int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, 1060int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
982 struct blk_mq_hw_ctx *hctx) 1061 struct blk_mq_hw_ctx *hctx)
983{ 1062{
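
Editor's note: with the software queues split into per-type request lists, the debugfs code above stamps out three nearly identical seq_file implementations with a single CTX_RQ_SEQ_OPS() macro instead of repeating the body. The sketch below shows the same token-pasting trick on an invented example (per-type counters); the enum and function names are illustrative only.

#include <stdio.h>

enum list_type { TYPE_DEFAULT, TYPE_READ, TYPE_POLL, TYPE_MAX };

static int counts[TYPE_MAX];

#define DEFINE_COUNT_OPS(name, type)		\
static void name##_inc(void)			\
{						\
	counts[type]++;				\
}						\
						\
static int name##_get(void)			\
{						\
	return counts[type];			\
}

DEFINE_COUNT_OPS(deflt, TYPE_DEFAULT)
DEFINE_COUNT_OPS(rd, TYPE_READ)
DEFINE_COUNT_OPS(poll, TYPE_POLL)

int main(void)
{
	deflt_inc();
	rd_inc();
	rd_inc();
	printf("default=%d read=%d poll=%d\n",
	       deflt_get(), rd_get(), poll_get());
	return 0;
}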
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a9160be12be0..8c9012a578c1 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -31,6 +31,10 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q);
31int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, 31int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
32 struct blk_mq_hw_ctx *hctx); 32 struct blk_mq_hw_ctx *hctx);
33void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); 33void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
34
35int blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
36void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
37void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
34#else 38#else
35static inline int blk_mq_debugfs_register(struct request_queue *q) 39static inline int blk_mq_debugfs_register(struct request_queue *q)
36{ 40{
@@ -78,6 +82,19 @@ static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
78static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) 82static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
79{ 83{
80} 84}
85
86static inline int blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
87{
88 return 0;
89}
90
91static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
92{
93}
94
95static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
96{
97}
81#endif 98#endif
82 99
83#ifdef CONFIG_BLK_DEBUG_FS_ZONED 100#ifdef CONFIG_BLK_DEBUG_FS_ZONED
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index db644ec624f5..1dce18553984 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -31,26 +31,26 @@
31 * that maps a queue to the CPUs that have irq affinity for the corresponding 31 * that maps a queue to the CPUs that have irq affinity for the corresponding
32 * vector. 32 * vector.
33 */ 33 */
34int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, 34int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
35 int offset) 35 int offset)
36{ 36{
37 const struct cpumask *mask; 37 const struct cpumask *mask;
38 unsigned int queue, cpu; 38 unsigned int queue, cpu;
39 39
40 for (queue = 0; queue < set->nr_hw_queues; queue++) { 40 for (queue = 0; queue < qmap->nr_queues; queue++) {
41 mask = pci_irq_get_affinity(pdev, queue + offset); 41 mask = pci_irq_get_affinity(pdev, queue + offset);
42 if (!mask) 42 if (!mask)
43 goto fallback; 43 goto fallback;
44 44
45 for_each_cpu(cpu, mask) 45 for_each_cpu(cpu, mask)
46 set->mq_map[cpu] = queue; 46 qmap->mq_map[cpu] = qmap->queue_offset + queue;
47 } 47 }
48 48
49 return 0; 49 return 0;
50 50
51fallback: 51fallback:
52 WARN_ON_ONCE(set->nr_hw_queues > 1); 52 WARN_ON_ONCE(qmap->nr_queues > 1);
53 blk_mq_clear_mq_map(set); 53 blk_mq_clear_mq_map(qmap);
54 return 0; 54 return 0;
55} 55}
56EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); 56EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index 996167f1de18..45030a81a1ed 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -29,24 +29,24 @@
29 * @set->nr_hw_queues, or @dev does not provide an affinity mask for a 29 * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
30 * vector, we fallback to the naive mapping. 30 * vector, we fallback to the naive mapping.
31 */ 31 */
32int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, 32int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
33 struct ib_device *dev, int first_vec) 33 struct ib_device *dev, int first_vec)
34{ 34{
35 const struct cpumask *mask; 35 const struct cpumask *mask;
36 unsigned int queue, cpu; 36 unsigned int queue, cpu;
37 37
38 for (queue = 0; queue < set->nr_hw_queues; queue++) { 38 for (queue = 0; queue < map->nr_queues; queue++) {
39 mask = ib_get_vector_affinity(dev, first_vec + queue); 39 mask = ib_get_vector_affinity(dev, first_vec + queue);
40 if (!mask) 40 if (!mask)
41 goto fallback; 41 goto fallback;
42 42
43 for_each_cpu(cpu, mask) 43 for_each_cpu(cpu, mask)
44 set->mq_map[cpu] = queue; 44 map->mq_map[cpu] = map->queue_offset + queue;
45 } 45 }
46 46
47 return 0; 47 return 0;
48 48
49fallback: 49fallback:
50 return blk_mq_map_queues(set); 50 return blk_mq_map_queues(map);
51} 51}
52EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); 52EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 29bfe8017a2d..140933e4a7d1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -31,15 +31,22 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
31} 31}
32EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); 32EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
33 33
34void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) 34void blk_mq_sched_assign_ioc(struct request *rq)
35{ 35{
36 struct request_queue *q = rq->q; 36 struct request_queue *q = rq->q;
37 struct io_context *ioc = rq_ioc(bio); 37 struct io_context *ioc;
38 struct io_cq *icq; 38 struct io_cq *icq;
39 39
40 spin_lock_irq(q->queue_lock); 40 /*
41 * May not have an IO context if it's a passthrough request
42 */
43 ioc = current->io_context;
44 if (!ioc)
45 return;
46
47 spin_lock_irq(&q->queue_lock);
41 icq = ioc_lookup_icq(ioc, q); 48 icq = ioc_lookup_icq(ioc, q);
42 spin_unlock_irq(q->queue_lock); 49 spin_unlock_irq(&q->queue_lock);
43 50
44 if (!icq) { 51 if (!icq) {
45 icq = ioc_create_icq(ioc, q, GFP_ATOMIC); 52 icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
@@ -54,13 +61,14 @@ void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
54 * Mark a hardware queue as needing a restart. For shared queues, maintain 61 * Mark a hardware queue as needing a restart. For shared queues, maintain
55 * a count of how many hardware queues are marked for restart. 62 * a count of how many hardware queues are marked for restart.
56 */ 63 */
57static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) 64void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
58{ 65{
59 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 66 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
60 return; 67 return;
61 68
62 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 69 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
63} 70}
71EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
64 72
65void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) 73void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
66{ 74{
@@ -85,14 +93,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
85 do { 93 do {
86 struct request *rq; 94 struct request *rq;
87 95
88 if (e->type->ops.mq.has_work && 96 if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
89 !e->type->ops.mq.has_work(hctx))
90 break; 97 break;
91 98
92 if (!blk_mq_get_dispatch_budget(hctx)) 99 if (!blk_mq_get_dispatch_budget(hctx))
93 break; 100 break;
94 101
95 rq = e->type->ops.mq.dispatch_request(hctx); 102 rq = e->type->ops.dispatch_request(hctx);
96 if (!rq) { 103 if (!rq) {
97 blk_mq_put_dispatch_budget(hctx); 104 blk_mq_put_dispatch_budget(hctx);
98 break; 105 break;
@@ -110,7 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
110static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, 117static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
111 struct blk_mq_ctx *ctx) 118 struct blk_mq_ctx *ctx)
112{ 119{
113 unsigned idx = ctx->index_hw; 120 unsigned short idx = ctx->index_hw[hctx->type];
114 121
115 if (++idx == hctx->nr_ctx) 122 if (++idx == hctx->nr_ctx)
116 idx = 0; 123 idx = 0;
@@ -163,7 +170,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
163{ 170{
164 struct request_queue *q = hctx->queue; 171 struct request_queue *q = hctx->queue;
165 struct elevator_queue *e = q->elevator; 172 struct elevator_queue *e = q->elevator;
166 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; 173 const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
167 LIST_HEAD(rq_list); 174 LIST_HEAD(rq_list);
168 175
169 /* RCU or SRCU read lock is needed before checking quiesced flag */ 176 /* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -295,11 +302,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
295 * too much time checking for merges. 302 * too much time checking for merges.
296 */ 303 */
297static bool blk_mq_attempt_merge(struct request_queue *q, 304static bool blk_mq_attempt_merge(struct request_queue *q,
305 struct blk_mq_hw_ctx *hctx,
298 struct blk_mq_ctx *ctx, struct bio *bio) 306 struct blk_mq_ctx *ctx, struct bio *bio)
299{ 307{
308 enum hctx_type type = hctx->type;
309
300 lockdep_assert_held(&ctx->lock); 310 lockdep_assert_held(&ctx->lock);
301 311
302 if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { 312 if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
303 ctx->rq_merged++; 313 ctx->rq_merged++;
304 return true; 314 return true;
305 } 315 }
@@ -311,19 +321,21 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
311{ 321{
312 struct elevator_queue *e = q->elevator; 322 struct elevator_queue *e = q->elevator;
313 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 323 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
314 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 324 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
315 bool ret = false; 325 bool ret = false;
326 enum hctx_type type;
316 327
317 if (e && e->type->ops.mq.bio_merge) { 328 if (e && e->type->ops.bio_merge) {
318 blk_mq_put_ctx(ctx); 329 blk_mq_put_ctx(ctx);
319 return e->type->ops.mq.bio_merge(hctx, bio); 330 return e->type->ops.bio_merge(hctx, bio);
320 } 331 }
321 332
333 type = hctx->type;
322 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 334 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
323 !list_empty_careful(&ctx->rq_list)) { 335 !list_empty_careful(&ctx->rq_lists[type])) {
324 /* default per sw-queue merge */ 336 /* default per sw-queue merge */
325 spin_lock(&ctx->lock); 337 spin_lock(&ctx->lock);
326 ret = blk_mq_attempt_merge(q, ctx, bio); 338 ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
327 spin_unlock(&ctx->lock); 339 spin_unlock(&ctx->lock);
328 } 340 }
329 341
@@ -367,7 +379,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
367 struct request_queue *q = rq->q; 379 struct request_queue *q = rq->q;
368 struct elevator_queue *e = q->elevator; 380 struct elevator_queue *e = q->elevator;
369 struct blk_mq_ctx *ctx = rq->mq_ctx; 381 struct blk_mq_ctx *ctx = rq->mq_ctx;
370 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 382 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
371 383
372 /* flush rq in flush machinery need to be dispatched directly */ 384 /* flush rq in flush machinery need to be dispatched directly */
373 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { 385 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
@@ -380,11 +392,11 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
380 if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) 392 if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
381 goto run; 393 goto run;
382 394
383 if (e && e->type->ops.mq.insert_requests) { 395 if (e && e->type->ops.insert_requests) {
384 LIST_HEAD(list); 396 LIST_HEAD(list);
385 397
386 list_add(&rq->queuelist, &list); 398 list_add(&rq->queuelist, &list);
387 e->type->ops.mq.insert_requests(hctx, &list, at_head); 399 e->type->ops.insert_requests(hctx, &list, at_head);
388 } else { 400 } else {
389 spin_lock(&ctx->lock); 401 spin_lock(&ctx->lock);
390 __blk_mq_insert_request(hctx, rq, at_head); 402 __blk_mq_insert_request(hctx, rq, at_head);
@@ -396,27 +408,25 @@ run:
396 blk_mq_run_hw_queue(hctx, async); 408 blk_mq_run_hw_queue(hctx, async);
397} 409}
398 410
399void blk_mq_sched_insert_requests(struct request_queue *q, 411void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
400 struct blk_mq_ctx *ctx, 412 struct blk_mq_ctx *ctx,
401 struct list_head *list, bool run_queue_async) 413 struct list_head *list, bool run_queue_async)
402{ 414{
403 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 415 struct elevator_queue *e;
404 struct elevator_queue *e = hctx->queue->elevator;
405 416
406 if (e && e->type->ops.mq.insert_requests) 417 e = hctx->queue->elevator;
407 e->type->ops.mq.insert_requests(hctx, list, false); 418 if (e && e->type->ops.insert_requests)
419 e->type->ops.insert_requests(hctx, list, false);
408 else { 420 else {
409 /* 421 /*
410 * try to issue requests directly if the hw queue isn't 422 * try to issue requests directly if the hw queue isn't
411 * busy in case of 'none' scheduler, and this way may save 423 * busy in case of 'none' scheduler, and this way may save
412 * us one extra enqueue & dequeue to sw queue. 424 * us one extra enqueue & dequeue to sw queue.
413 */ 425 */
414 if (!hctx->dispatch_busy && !e && !run_queue_async) { 426 if (!hctx->dispatch_busy && !e && !run_queue_async)
415 blk_mq_try_issue_list_directly(hctx, list); 427 blk_mq_try_issue_list_directly(hctx, list);
416 if (list_empty(list)) 428 else
417 return; 429 blk_mq_insert_requests(hctx, ctx, list);
418 }
419 blk_mq_insert_requests(hctx, ctx, list);
420 } 430 }
421 431
422 blk_mq_run_hw_queue(hctx, run_queue_async); 432 blk_mq_run_hw_queue(hctx, run_queue_async);
@@ -489,15 +499,15 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
489 goto err; 499 goto err;
490 } 500 }
491 501
492 ret = e->ops.mq.init_sched(q, e); 502 ret = e->ops.init_sched(q, e);
493 if (ret) 503 if (ret)
494 goto err; 504 goto err;
495 505
496 blk_mq_debugfs_register_sched(q); 506 blk_mq_debugfs_register_sched(q);
497 507
498 queue_for_each_hw_ctx(q, hctx, i) { 508 queue_for_each_hw_ctx(q, hctx, i) {
499 if (e->ops.mq.init_hctx) { 509 if (e->ops.init_hctx) {
500 ret = e->ops.mq.init_hctx(hctx, i); 510 ret = e->ops.init_hctx(hctx, i);
501 if (ret) { 511 if (ret) {
502 eq = q->elevator; 512 eq = q->elevator;
503 blk_mq_exit_sched(q, eq); 513 blk_mq_exit_sched(q, eq);
@@ -523,14 +533,14 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
523 533
524 queue_for_each_hw_ctx(q, hctx, i) { 534 queue_for_each_hw_ctx(q, hctx, i) {
525 blk_mq_debugfs_unregister_sched_hctx(hctx); 535 blk_mq_debugfs_unregister_sched_hctx(hctx);
526 if (e->type->ops.mq.exit_hctx && hctx->sched_data) { 536 if (e->type->ops.exit_hctx && hctx->sched_data) {
527 e->type->ops.mq.exit_hctx(hctx, i); 537 e->type->ops.exit_hctx(hctx, i);
528 hctx->sched_data = NULL; 538 hctx->sched_data = NULL;
529 } 539 }
530 } 540 }
531 blk_mq_debugfs_unregister_sched(q); 541 blk_mq_debugfs_unregister_sched(q);
532 if (e->type->ops.mq.exit_sched) 542 if (e->type->ops.exit_sched)
533 e->type->ops.mq.exit_sched(e); 543 e->type->ops.exit_sched(e);
534 blk_mq_sched_tags_teardown(q); 544 blk_mq_sched_tags_teardown(q);
535 q->elevator = NULL; 545 q->elevator = NULL;
536} 546}
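
Editor's note: blk_mq_do_dispatch_sched() above now goes through the flattened elevator ops (ops.has_work, ops.dispatch_request), but the loop shape is unchanged: stop when the scheduler reports no work, when no dispatch budget can be taken, or when it hands back no request (in which case the budget is returned). A userspace sketch of that loop with invented helpers:

#include <stdbool.h>
#include <stdio.h>

struct sched {
	int queued;	/* requests the scheduler is holding */
	int budget;	/* dispatch slots the driver will accept */
};

static bool has_work(struct sched *s)
{
	return s->queued > 0;
}

static bool get_budget(struct sched *s)
{
	if (s->budget <= 0)
		return false;
	s->budget--;
	return true;
}

static void put_budget(struct sched *s)
{
	s->budget++;
}

/* Returns a fake request id, or -1 when the scheduler hands back nothing. */
static int dispatch_request(struct sched *s)
{
	if (!s->queued)
		return -1;
	return s->queued--;
}

static void do_dispatch(struct sched *s)
{
	do {
		int rq;

		if (!has_work(s))
			break;
		if (!get_budget(s))
			break;
		rq = dispatch_request(s);
		if (rq < 0) {
			put_budget(s);	/* nothing dispatched: return budget */
			break;
		}
		printf("dispatched request %d\n", rq);
	} while (1);
}

int main(void)
{
	struct sched s = { .queued = 5, .budget = 3 };

	do_dispatch(&s);	/* dispatches three requests, then budget runs out */
	return 0;
}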
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 8a9544203173..c7bdb52367ac 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -8,18 +8,19 @@
8void blk_mq_sched_free_hctx_data(struct request_queue *q, 8void blk_mq_sched_free_hctx_data(struct request_queue *q,
9 void (*exit)(struct blk_mq_hw_ctx *)); 9 void (*exit)(struct blk_mq_hw_ctx *));
10 10
11void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio); 11void blk_mq_sched_assign_ioc(struct request *rq);
12 12
13void blk_mq_sched_request_inserted(struct request *rq); 13void blk_mq_sched_request_inserted(struct request *rq);
14bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, 14bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
15 struct request **merged_request); 15 struct request **merged_request);
16bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); 16bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
17bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); 17bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
18void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
18void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); 19void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
19 20
20void blk_mq_sched_insert_request(struct request *rq, bool at_head, 21void blk_mq_sched_insert_request(struct request *rq, bool at_head,
21 bool run_queue, bool async); 22 bool run_queue, bool async);
22void blk_mq_sched_insert_requests(struct request_queue *q, 23void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
23 struct blk_mq_ctx *ctx, 24 struct blk_mq_ctx *ctx,
24 struct list_head *list, bool run_queue_async); 25 struct list_head *list, bool run_queue_async);
25 26
@@ -43,8 +44,8 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
43{ 44{
44 struct elevator_queue *e = q->elevator; 45 struct elevator_queue *e = q->elevator;
45 46
46 if (e && e->type->ops.mq.allow_merge) 47 if (e && e->type->ops.allow_merge)
47 return e->type->ops.mq.allow_merge(q, rq, bio); 48 return e->type->ops.allow_merge(q, rq, bio);
48 49
49 return true; 50 return true;
50} 51}
@@ -53,8 +54,8 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
53{ 54{
54 struct elevator_queue *e = rq->q->elevator; 55 struct elevator_queue *e = rq->q->elevator;
55 56
56 if (e && e->type->ops.mq.completed_request) 57 if (e && e->type->ops.completed_request)
57 e->type->ops.mq.completed_request(rq, now); 58 e->type->ops.completed_request(rq, now);
58} 59}
59 60
60static inline void blk_mq_sched_started_request(struct request *rq) 61static inline void blk_mq_sched_started_request(struct request *rq)
@@ -62,8 +63,8 @@ static inline void blk_mq_sched_started_request(struct request *rq)
62 struct request_queue *q = rq->q; 63 struct request_queue *q = rq->q;
63 struct elevator_queue *e = q->elevator; 64 struct elevator_queue *e = q->elevator;
64 65
65 if (e && e->type->ops.mq.started_request) 66 if (e && e->type->ops.started_request)
66 e->type->ops.mq.started_request(rq); 67 e->type->ops.started_request(rq);
67} 68}
68 69
69static inline void blk_mq_sched_requeue_request(struct request *rq) 70static inline void blk_mq_sched_requeue_request(struct request *rq)
@@ -71,16 +72,16 @@ static inline void blk_mq_sched_requeue_request(struct request *rq)
71 struct request_queue *q = rq->q; 72 struct request_queue *q = rq->q;
72 struct elevator_queue *e = q->elevator; 73 struct elevator_queue *e = q->elevator;
73 74
74 if (e && e->type->ops.mq.requeue_request) 75 if (e && e->type->ops.requeue_request)
75 e->type->ops.mq.requeue_request(rq); 76 e->type->ops.requeue_request(rq);
76} 77}
77 78
78static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) 79static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
79{ 80{
80 struct elevator_queue *e = hctx->queue->elevator; 81 struct elevator_queue *e = hctx->queue->elevator;
81 82
82 if (e && e->type->ops.mq.has_work) 83 if (e && e->type->ops.has_work)
83 return e->type->ops.mq.has_work(hctx); 84 return e->type->ops.has_work(hctx);
84 85
85 return false; 86 return false;
86} 87}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index aafb44224c89..3f9c3f4ac44c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -15,6 +15,18 @@
15 15
16static void blk_mq_sysfs_release(struct kobject *kobj) 16static void blk_mq_sysfs_release(struct kobject *kobj)
17{ 17{
18 struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj);
19
20 free_percpu(ctxs->queue_ctx);
21 kfree(ctxs);
22}
23
24static void blk_mq_ctx_sysfs_release(struct kobject *kobj)
25{
26 struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
27
28 /* ctx->ctxs won't be released until all ctx are freed */
29 kobject_put(&ctx->ctxs->kobj);
18} 30}
19 31
20static void blk_mq_hw_sysfs_release(struct kobject *kobj) 32static void blk_mq_hw_sysfs_release(struct kobject *kobj)
@@ -203,7 +215,7 @@ static struct kobj_type blk_mq_ktype = {
203static struct kobj_type blk_mq_ctx_ktype = { 215static struct kobj_type blk_mq_ctx_ktype = {
204 .sysfs_ops = &blk_mq_sysfs_ops, 216 .sysfs_ops = &blk_mq_sysfs_ops,
205 .default_attrs = default_ctx_attrs, 217 .default_attrs = default_ctx_attrs,
206 .release = blk_mq_sysfs_release, 218 .release = blk_mq_ctx_sysfs_release,
207}; 219};
208 220
209static struct kobj_type blk_mq_hw_ktype = { 221static struct kobj_type blk_mq_hw_ktype = {
@@ -235,7 +247,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
235 if (!hctx->nr_ctx) 247 if (!hctx->nr_ctx)
236 return 0; 248 return 0;
237 249
238 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); 250 ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num);
239 if (ret) 251 if (ret)
240 return ret; 252 return ret;
241 253
@@ -258,8 +270,8 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
258 queue_for_each_hw_ctx(q, hctx, i) 270 queue_for_each_hw_ctx(q, hctx, i)
259 blk_mq_unregister_hctx(hctx); 271 blk_mq_unregister_hctx(hctx);
260 272
261 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 273 kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
262 kobject_del(&q->mq_kobj); 274 kobject_del(q->mq_kobj);
263 kobject_put(&dev->kobj); 275 kobject_put(&dev->kobj);
264 276
265 q->mq_sysfs_init_done = false; 277 q->mq_sysfs_init_done = false;
@@ -279,7 +291,7 @@ void blk_mq_sysfs_deinit(struct request_queue *q)
279 ctx = per_cpu_ptr(q->queue_ctx, cpu); 291 ctx = per_cpu_ptr(q->queue_ctx, cpu);
280 kobject_put(&ctx->kobj); 292 kobject_put(&ctx->kobj);
281 } 293 }
282 kobject_put(&q->mq_kobj); 294 kobject_put(q->mq_kobj);
283} 295}
284 296
285void blk_mq_sysfs_init(struct request_queue *q) 297void blk_mq_sysfs_init(struct request_queue *q)
@@ -287,10 +299,12 @@ void blk_mq_sysfs_init(struct request_queue *q)
287 struct blk_mq_ctx *ctx; 299 struct blk_mq_ctx *ctx;
288 int cpu; 300 int cpu;
289 301
290 kobject_init(&q->mq_kobj, &blk_mq_ktype); 302 kobject_init(q->mq_kobj, &blk_mq_ktype);
291 303
292 for_each_possible_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
293 ctx = per_cpu_ptr(q->queue_ctx, cpu); 305 ctx = per_cpu_ptr(q->queue_ctx, cpu);
306
307 kobject_get(q->mq_kobj);
294 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); 308 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
295 } 309 }
296} 310}
@@ -303,11 +317,11 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
303 WARN_ON_ONCE(!q->kobj.parent); 317 WARN_ON_ONCE(!q->kobj.parent);
304 lockdep_assert_held(&q->sysfs_lock); 318 lockdep_assert_held(&q->sysfs_lock);
305 319
306 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 320 ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
307 if (ret < 0) 321 if (ret < 0)
308 goto out; 322 goto out;
309 323
310 kobject_uevent(&q->mq_kobj, KOBJ_ADD); 324 kobject_uevent(q->mq_kobj, KOBJ_ADD);
311 325
312 queue_for_each_hw_ctx(q, hctx, i) { 326 queue_for_each_hw_ctx(q, hctx, i) {
313 ret = blk_mq_register_hctx(hctx); 327 ret = blk_mq_register_hctx(hctx);
@@ -324,8 +338,8 @@ unreg:
324 while (--i >= 0) 338 while (--i >= 0)
325 blk_mq_unregister_hctx(q->queue_hw_ctx[i]); 339 blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
326 340
327 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 341 kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
328 kobject_del(&q->mq_kobj); 342 kobject_del(q->mq_kobj);
329 kobject_put(&dev->kobj); 343 kobject_put(&dev->kobj);
330 return ret; 344 return ret;
331} 345}
@@ -340,7 +354,6 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
340 354
341 return ret; 355 return ret;
342} 356}
343EXPORT_SYMBOL_GPL(blk_mq_register_dev);
344 357
345void blk_mq_sysfs_unregister(struct request_queue *q) 358void blk_mq_sysfs_unregister(struct request_queue *q)
346{ 359{
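
Editor's note: the sysfs hunks above appear to tie object lifetimes together: each per-CPU ctx kobject now pins a shared blk_mq_ctxs parent, and the percpu queue_ctx storage is freed from the parent's release only after the last ctx reference is dropped. A sketch of that reference chain with plain counters standing in for kobjects; all types and names below are illustrative.

#include <stdio.h>
#include <stdlib.h>

struct ctxs {				/* stands in for blk_mq_ctxs */
	int refcount;
	int *percpu_data;		/* stands in for the percpu queue_ctx */
};

struct ctx {				/* stands in for blk_mq_ctx */
	struct ctxs *parent;
};

static void ctxs_get(struct ctxs *c)
{
	c->refcount++;
}

static void ctxs_put(struct ctxs *c)
{
	if (--c->refcount == 0) {
		free(c->percpu_data);	/* freed only after the last user */
		free(c);
		printf("shared ctxs released\n");
	}
}

static void ctx_release(struct ctx *ctx)
{
	ctxs_put(ctx->parent);		/* each ctx drops its parent ref */
}

int main(void)
{
	struct ctxs *parent = calloc(1, sizeof(*parent));
	struct ctx cpus[4];

	parent->refcount = 1;		/* the queue's own reference */
	parent->percpu_data = calloc(4, sizeof(int));

	for (int i = 0; i < 4; i++) {
		cpus[i].parent = parent;
		ctxs_get(parent);	/* each per-CPU ctx pins the parent */
	}

	for (int i = 0; i < 4; i++)
		ctx_release(&cpus[i]);
	ctxs_put(parent);		/* final put frees the shared state */
	return 0;
}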
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index cfda95b85d34..2089c6c62f44 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
110 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 110 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
111 struct sbitmap_queue *bt; 111 struct sbitmap_queue *bt;
112 struct sbq_wait_state *ws; 112 struct sbq_wait_state *ws;
113 DEFINE_WAIT(wait); 113 DEFINE_SBQ_WAIT(wait);
114 unsigned int tag_offset; 114 unsigned int tag_offset;
115 bool drop_ctx; 115 bool drop_ctx;
116 int tag; 116 int tag;
@@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
154 if (tag != -1) 154 if (tag != -1)
155 break; 155 break;
156 156
157 prepare_to_wait_exclusive(&ws->wait, &wait, 157 sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
158 TASK_UNINTERRUPTIBLE);
159 158
160 tag = __blk_mq_get_tag(data, bt); 159 tag = __blk_mq_get_tag(data, bt);
161 if (tag != -1) 160 if (tag != -1)
@@ -167,16 +166,17 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
167 bt_prev = bt; 166 bt_prev = bt;
168 io_schedule(); 167 io_schedule();
169 168
169 sbitmap_finish_wait(bt, ws, &wait);
170
170 data->ctx = blk_mq_get_ctx(data->q); 171 data->ctx = blk_mq_get_ctx(data->q);
171 data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); 172 data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
173 data->ctx->cpu);
172 tags = blk_mq_tags_from_data(data); 174 tags = blk_mq_tags_from_data(data);
173 if (data->flags & BLK_MQ_REQ_RESERVED) 175 if (data->flags & BLK_MQ_REQ_RESERVED)
174 bt = &tags->breserved_tags; 176 bt = &tags->breserved_tags;
175 else 177 else
176 bt = &tags->bitmap_tags; 178 bt = &tags->bitmap_tags;
177 179
178 finish_wait(&ws->wait, &wait);
179
180 /* 180 /*
181 * If destination hw queue is changed, fake wake up on 181 * If destination hw queue is changed, fake wake up on
182 * previous queue for compensating the wake up miss, so 182 * previous queue for compensating the wake up miss, so
@@ -191,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
191 if (drop_ctx && data->ctx) 191 if (drop_ctx && data->ctx)
192 blk_mq_put_ctx(data->ctx); 192 blk_mq_put_ctx(data->ctx);
193 193
194 finish_wait(&ws->wait, &wait); 194 sbitmap_finish_wait(bt, ws, &wait);
195 195
196found_tag: 196found_tag:
197 return tag + tag_offset; 197 return tag + tag_offset;
@@ -235,7 +235,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
235 * test and set the bit before assigning ->rqs[]. 235 * test and set the bit before assigning ->rqs[].
236 */ 236 */
237 if (rq && rq->q == hctx->queue) 237 if (rq && rq->q == hctx->queue)
238 iter_data->fn(hctx, rq, iter_data->data, reserved); 238 return iter_data->fn(hctx, rq, iter_data->data, reserved);
239 return true; 239 return true;
240} 240}
241 241
@@ -247,7 +247,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
247 * @fn: Pointer to the function that will be called for each request 247 * @fn: Pointer to the function that will be called for each request
248 * associated with @hctx that has been assigned a driver tag. 248 * associated with @hctx that has been assigned a driver tag.
249 * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) 249 * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
250 * where rq is a pointer to a request. 250 * where rq is a pointer to a request. Return true to continue
251 * iterating tags, false to stop.
251 * @data: Will be passed as third argument to @fn. 252 * @data: Will be passed as third argument to @fn.
252 * @reserved: Indicates whether @bt is the breserved_tags member or the 253 * @reserved: Indicates whether @bt is the breserved_tags member or the
253 * bitmap_tags member of struct blk_mq_tags. 254 * bitmap_tags member of struct blk_mq_tags.
@@ -288,7 +289,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
288 */ 289 */
289 rq = tags->rqs[bitnr]; 290 rq = tags->rqs[bitnr];
290 if (rq && blk_mq_request_started(rq)) 291 if (rq && blk_mq_request_started(rq))
291 iter_data->fn(rq, iter_data->data, reserved); 292 return iter_data->fn(rq, iter_data->data, reserved);
292 293
293 return true; 294 return true;
294} 295}
@@ -300,7 +301,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
300 * or the bitmap_tags member of struct blk_mq_tags. 301 * or the bitmap_tags member of struct blk_mq_tags.
301 * @fn: Pointer to the function that will be called for each started 302 * @fn: Pointer to the function that will be called for each started
302 * request. @fn will be called as follows: @fn(rq, @data, 303 * request. @fn will be called as follows: @fn(rq, @data,
303 * @reserved) where rq is a pointer to a request. 304 * @reserved) where rq is a pointer to a request. Return true
305 * to continue iterating tags, false to stop.
304 * @data: Will be passed as second argument to @fn. 306 * @data: Will be passed as second argument to @fn.
305 * @reserved: Indicates whether @bt is the breserved_tags member or the 307 * @reserved: Indicates whether @bt is the breserved_tags member or the
306 * bitmap_tags member of struct blk_mq_tags. 308 * bitmap_tags member of struct blk_mq_tags.
@@ -325,7 +327,8 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
325 * @fn: Pointer to the function that will be called for each started 327 * @fn: Pointer to the function that will be called for each started
326 * request. @fn will be called as follows: @fn(rq, @priv, 328 * request. @fn will be called as follows: @fn(rq, @priv,
327 * reserved) where rq is a pointer to a request. 'reserved' 329 * reserved) where rq is a pointer to a request. 'reserved'
328 * indicates whether or not @rq is a reserved request. 330 * indicates whether or not @rq is a reserved request. Return
331 * true to continue iterating tags, false to stop.
329 * @priv: Will be passed as second argument to @fn. 332 * @priv: Will be passed as second argument to @fn.
330 */ 333 */
331static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, 334static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
@@ -342,7 +345,8 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
342 * @fn: Pointer to the function that will be called for each started 345 * @fn: Pointer to the function that will be called for each started
343 * request. @fn will be called as follows: @fn(rq, @priv, 346 * request. @fn will be called as follows: @fn(rq, @priv,
344 * reserved) where rq is a pointer to a request. 'reserved' 347 * reserved) where rq is a pointer to a request. 'reserved'
345 * indicates whether or not @rq is a reserved request. 348 * indicates whether or not @rq is a reserved request. Return
349 * true to continue iterating tags, false to stop.
346 * @priv: Will be passed as second argument to @fn. 350 * @priv: Will be passed as second argument to @fn.
347 */ 351 */
348void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, 352void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
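
With the iterator callbacks now returning bool, a caller-supplied function acts as a continue/stop predicate for the tag walk. A minimal sketch of such a callback, assuming kernel context; the counter structure and the "max" limit are illustrative, not part of this interface:

struct count_data {
	unsigned int seen;
	unsigned int max;
};

/* Called once per started request; returning false ends the tag walk early. */
static bool count_started(struct request *rq, void *priv, bool reserved)
{
	struct count_data *cd = priv;

	cd->seen++;
	return cd->seen < cd->max;
}

/* usage: blk_mq_tagset_busy_iter(tagset, count_started, &cd); */
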
@@ -526,16 +530,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
526 */ 530 */
527u32 blk_mq_unique_tag(struct request *rq) 531u32 blk_mq_unique_tag(struct request *rq)
528{ 532{
529 struct request_queue *q = rq->q; 533 return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
530 struct blk_mq_hw_ctx *hctx;
531 int hwq = 0;
532
533 if (q->mq_ops) {
534 hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
535 hwq = hctx->queue_num;
536 }
537
538 return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
539 (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); 534 (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
540} 535}
541EXPORT_SYMBOL(blk_mq_unique_tag); 536EXPORT_SYMBOL(blk_mq_unique_tag);
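
For reference, the packed value can be split back into its parts with the existing helpers in blk-mq.h; a sketch of the decomposition:

u32 unique = blk_mq_unique_tag(rq);
u16 hwq = blk_mq_unique_tag_to_hwq(unique);	/* == rq->mq_hctx->queue_num */
u16 tag = blk_mq_unique_tag_to_tag(unique);	/* == rq->tag & BLK_MQ_UNIQUE_TAG_MASK */
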
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index c3afbca11299..370827163835 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -29,7 +29,7 @@
29 * that maps a queue to the CPUs that have irq affinity for the corresponding 29 * that maps a queue to the CPUs that have irq affinity for the corresponding
30 * vector. 30 * vector.
31 */ 31 */
32int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, 32int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
33 struct virtio_device *vdev, int first_vec) 33 struct virtio_device *vdev, int first_vec)
34{ 34{
35 const struct cpumask *mask; 35 const struct cpumask *mask;
@@ -38,17 +38,17 @@ int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
38 if (!vdev->config->get_vq_affinity) 38 if (!vdev->config->get_vq_affinity)
39 goto fallback; 39 goto fallback;
40 40
41 for (queue = 0; queue < set->nr_hw_queues; queue++) { 41 for (queue = 0; queue < qmap->nr_queues; queue++) {
42 mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); 42 mask = vdev->config->get_vq_affinity(vdev, first_vec + queue);
43 if (!mask) 43 if (!mask)
44 goto fallback; 44 goto fallback;
45 45
46 for_each_cpu(cpu, mask) 46 for_each_cpu(cpu, mask)
47 set->mq_map[cpu] = queue; 47 qmap->mq_map[cpu] = qmap->queue_offset + queue;
48 } 48 }
49 49
50 return 0; 50 return 0;
51fallback: 51fallback:
52 return blk_mq_map_queues(set); 52 return blk_mq_map_queues(qmap);
53} 53}
54EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); 54EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
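
On the driver side this means a ->map_queues() callback now hands in one struct blk_mq_queue_map rather than the whole tag_set. A sketch of what a virtio-blk-style caller might look like after the conversion; a single (default) map is assumed and the vblk/vdev naming is illustrative:

static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
	struct virtio_blk *vblk = set->driver_data;

	/* map the first queue map using the device's irq affinity hints */
	return blk_mq_virtio_map_queues(&set->map[0], vblk->vdev, 0);
}
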
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f91c6e5b17a..3ba37b9e15e9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,6 @@
38#include "blk-mq-sched.h" 38#include "blk-mq-sched.h"
39#include "blk-rq-qos.h" 39#include "blk-rq-qos.h"
40 40
41static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
42static void blk_mq_poll_stats_start(struct request_queue *q); 41static void blk_mq_poll_stats_start(struct request_queue *q);
43static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 42static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
44 43
@@ -75,14 +74,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
75static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 74static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
76 struct blk_mq_ctx *ctx) 75 struct blk_mq_ctx *ctx)
77{ 76{
78 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) 77 const int bit = ctx->index_hw[hctx->type];
79 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); 78
79 if (!sbitmap_test_bit(&hctx->ctx_map, bit))
80 sbitmap_set_bit(&hctx->ctx_map, bit);
80} 81}
81 82
82static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 83static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
83 struct blk_mq_ctx *ctx) 84 struct blk_mq_ctx *ctx)
84{ 85{
85 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); 86 const int bit = ctx->index_hw[hctx->type];
87
88 sbitmap_clear_bit(&hctx->ctx_map, bit);
86} 89}
87 90
88struct mq_inflight { 91struct mq_inflight {
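
The indexing by hctx->type in blk_mq_hctx_mark_pending()/blk_mq_hctx_clear_pending() assumes the per-CPU software context now carries one entry per hardware-queue type. Roughly, the relevant blk_mq_ctx fields in this series look like the sketch below; other members are omitted and the exact layout may differ:

struct blk_mq_ctx {
	struct {
		spinlock_t		lock;
		struct list_head	rq_lists[HCTX_MAX_TYPES];
	} ____cacheline_aligned_in_smp;

	unsigned int			cpu;
	unsigned short			index_hw[HCTX_MAX_TYPES];
	/* remaining members unchanged */
};
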
@@ -90,33 +93,33 @@ struct mq_inflight {
90 unsigned int *inflight; 93 unsigned int *inflight;
91}; 94};
92 95
93static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, 96static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
94 struct request *rq, void *priv, 97 struct request *rq, void *priv,
95 bool reserved) 98 bool reserved)
96{ 99{
97 struct mq_inflight *mi = priv; 100 struct mq_inflight *mi = priv;
98 101
99 /* 102 /*
100 * index[0] counts the specific partition that was asked for. index[1] 103 * index[0] counts the specific partition that was asked for.
101 * counts the ones that are active on the whole device, so increment
102 * that if mi->part is indeed a partition, and not a whole device.
103 */ 104 */
104 if (rq->part == mi->part) 105 if (rq->part == mi->part)
105 mi->inflight[0]++; 106 mi->inflight[0]++;
106 if (mi->part->partno) 107
107 mi->inflight[1]++; 108 return true;
108} 109}
109 110
110void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, 111unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
111 unsigned int inflight[2])
112{ 112{
113 unsigned inflight[2];
113 struct mq_inflight mi = { .part = part, .inflight = inflight, }; 114 struct mq_inflight mi = { .part = part, .inflight = inflight, };
114 115
115 inflight[0] = inflight[1] = 0; 116 inflight[0] = inflight[1] = 0;
116 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 117 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
118
119 return inflight[0];
117} 120}
118 121
119static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, 122static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
120 struct request *rq, void *priv, 123 struct request *rq, void *priv,
121 bool reserved) 124 bool reserved)
122{ 125{
@@ -124,6 +127,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
124 127
125 if (rq->part == mi->part) 128 if (rq->part == mi->part)
126 mi->inflight[rq_data_dir(rq)]++; 129 mi->inflight[rq_data_dir(rq)]++;
130
131 return true;
127} 132}
128 133
129void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, 134void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
@@ -142,7 +147,7 @@ void blk_freeze_queue_start(struct request_queue *q)
142 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 147 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
143 if (freeze_depth == 1) { 148 if (freeze_depth == 1) {
144 percpu_ref_kill(&q->q_usage_counter); 149 percpu_ref_kill(&q->q_usage_counter);
145 if (q->mq_ops) 150 if (queue_is_mq(q))
146 blk_mq_run_hw_queues(q, false); 151 blk_mq_run_hw_queues(q, false);
147 } 152 }
148} 153}
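
queue_is_mq() is the new spelling of the open-coded q->mq_ops checks; its definition in blkdev.h from this series is essentially:

static inline bool queue_is_mq(struct request_queue *q)
{
	return q->mq_ops;
}
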
@@ -177,8 +182,6 @@ void blk_freeze_queue(struct request_queue *q)
177 * exported to drivers as the only user for unfreeze is blk_mq. 182 * exported to drivers as the only user for unfreeze is blk_mq.
178 */ 183 */
179 blk_freeze_queue_start(q); 184 blk_freeze_queue_start(q);
180 if (!q->mq_ops)
181 blk_drain_queue(q);
182 blk_mq_freeze_queue_wait(q); 185 blk_mq_freeze_queue_wait(q);
183} 186}
184 187
@@ -275,6 +278,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
275} 278}
276EXPORT_SYMBOL(blk_mq_can_queue); 279EXPORT_SYMBOL(blk_mq_can_queue);
277 280
281/*
282 * Only need start/end time stamping if we have stats enabled, or using
283 * an IO scheduler.
284 */
285static inline bool blk_mq_need_time_stamp(struct request *rq)
286{
287 return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
288}
289
278static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 290static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
279 unsigned int tag, unsigned int op) 291 unsigned int tag, unsigned int op)
280{ 292{
@@ -298,8 +310,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
298 /* csd/requeue_work/fifo_time is initialized before use */ 310 /* csd/requeue_work/fifo_time is initialized before use */
299 rq->q = data->q; 311 rq->q = data->q;
300 rq->mq_ctx = data->ctx; 312 rq->mq_ctx = data->ctx;
313 rq->mq_hctx = data->hctx;
301 rq->rq_flags = rq_flags; 314 rq->rq_flags = rq_flags;
302 rq->cpu = -1;
303 rq->cmd_flags = op; 315 rq->cmd_flags = op;
304 if (data->flags & BLK_MQ_REQ_PREEMPT) 316 if (data->flags & BLK_MQ_REQ_PREEMPT)
305 rq->rq_flags |= RQF_PREEMPT; 317 rq->rq_flags |= RQF_PREEMPT;
@@ -310,7 +322,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
310 RB_CLEAR_NODE(&rq->rb_node); 322 RB_CLEAR_NODE(&rq->rb_node);
311 rq->rq_disk = NULL; 323 rq->rq_disk = NULL;
312 rq->part = NULL; 324 rq->part = NULL;
313 rq->start_time_ns = ktime_get_ns(); 325 if (blk_mq_need_time_stamp(rq))
326 rq->start_time_ns = ktime_get_ns();
327 else
328 rq->start_time_ns = 0;
314 rq->io_start_time_ns = 0; 329 rq->io_start_time_ns = 0;
315 rq->nr_phys_segments = 0; 330 rq->nr_phys_segments = 0;
316#if defined(CONFIG_BLK_DEV_INTEGRITY) 331#if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -319,27 +334,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
319 rq->special = NULL; 334 rq->special = NULL;
320 /* tag was already set */ 335 /* tag was already set */
321 rq->extra_len = 0; 336 rq->extra_len = 0;
322 rq->__deadline = 0; 337 WRITE_ONCE(rq->deadline, 0);
323 338
324 INIT_LIST_HEAD(&rq->timeout_list);
325 rq->timeout = 0; 339 rq->timeout = 0;
326 340
327 rq->end_io = NULL; 341 rq->end_io = NULL;
328 rq->end_io_data = NULL; 342 rq->end_io_data = NULL;
329 rq->next_rq = NULL; 343 rq->next_rq = NULL;
330 344
331#ifdef CONFIG_BLK_CGROUP
332 rq->rl = NULL;
333#endif
334
335 data->ctx->rq_dispatched[op_is_sync(op)]++; 345 data->ctx->rq_dispatched[op_is_sync(op)]++;
336 refcount_set(&rq->ref, 1); 346 refcount_set(&rq->ref, 1);
337 return rq; 347 return rq;
338} 348}
339 349
340static struct request *blk_mq_get_request(struct request_queue *q, 350static struct request *blk_mq_get_request(struct request_queue *q,
341 struct bio *bio, unsigned int op, 351 struct bio *bio,
342 struct blk_mq_alloc_data *data) 352 struct blk_mq_alloc_data *data)
343{ 353{
344 struct elevator_queue *e = q->elevator; 354 struct elevator_queue *e = q->elevator;
345 struct request *rq; 355 struct request *rq;
@@ -353,8 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q,
353 put_ctx_on_error = true; 363 put_ctx_on_error = true;
354 } 364 }
355 if (likely(!data->hctx)) 365 if (likely(!data->hctx))
356 data->hctx = blk_mq_map_queue(q, data->ctx->cpu); 366 data->hctx = blk_mq_map_queue(q, data->cmd_flags,
357 if (op & REQ_NOWAIT) 367 data->ctx->cpu);
368 if (data->cmd_flags & REQ_NOWAIT)
358 data->flags |= BLK_MQ_REQ_NOWAIT; 369 data->flags |= BLK_MQ_REQ_NOWAIT;
359 370
360 if (e) { 371 if (e) {
@@ -365,9 +376,10 @@ static struct request *blk_mq_get_request(struct request_queue *q,
365 * dispatch list. Don't include reserved tags in the 376 * dispatch list. Don't include reserved tags in the
366 * limiting, as it isn't useful. 377 * limiting, as it isn't useful.
367 */ 378 */
368 if (!op_is_flush(op) && e->type->ops.mq.limit_depth && 379 if (!op_is_flush(data->cmd_flags) &&
380 e->type->ops.limit_depth &&
369 !(data->flags & BLK_MQ_REQ_RESERVED)) 381 !(data->flags & BLK_MQ_REQ_RESERVED))
370 e->type->ops.mq.limit_depth(op, data); 382 e->type->ops.limit_depth(data->cmd_flags, data);
371 } else { 383 } else {
372 blk_mq_tag_busy(data->hctx); 384 blk_mq_tag_busy(data->hctx);
373 } 385 }
@@ -382,14 +394,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
382 return NULL; 394 return NULL;
383 } 395 }
384 396
385 rq = blk_mq_rq_ctx_init(data, tag, op); 397 rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
386 if (!op_is_flush(op)) { 398 if (!op_is_flush(data->cmd_flags)) {
387 rq->elv.icq = NULL; 399 rq->elv.icq = NULL;
388 if (e && e->type->ops.mq.prepare_request) { 400 if (e && e->type->ops.prepare_request) {
389 if (e->type->icq_cache && rq_ioc(bio)) 401 if (e->type->icq_cache)
390 blk_mq_sched_assign_ioc(rq, bio); 402 blk_mq_sched_assign_ioc(rq);
391 403
392 e->type->ops.mq.prepare_request(rq, bio); 404 e->type->ops.prepare_request(rq, bio);
393 rq->rq_flags |= RQF_ELVPRIV; 405 rq->rq_flags |= RQF_ELVPRIV;
394 } 406 }
395 } 407 }
@@ -400,7 +412,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
400struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 412struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
401 blk_mq_req_flags_t flags) 413 blk_mq_req_flags_t flags)
402{ 414{
403 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 415 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
404 struct request *rq; 416 struct request *rq;
405 int ret; 417 int ret;
406 418
@@ -408,7 +420,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
408 if (ret) 420 if (ret)
409 return ERR_PTR(ret); 421 return ERR_PTR(ret);
410 422
411 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 423 rq = blk_mq_get_request(q, NULL, &alloc_data);
412 blk_queue_exit(q); 424 blk_queue_exit(q);
413 425
414 if (!rq) 426 if (!rq)
@@ -426,7 +438,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
426struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 438struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
427 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) 439 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
428{ 440{
429 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 441 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
430 struct request *rq; 442 struct request *rq;
431 unsigned int cpu; 443 unsigned int cpu;
432 int ret; 444 int ret;
@@ -459,7 +471,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
459 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); 471 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
460 alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 472 alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
461 473
462 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 474 rq = blk_mq_get_request(q, NULL, &alloc_data);
463 blk_queue_exit(q); 475 blk_queue_exit(q);
464 476
465 if (!rq) 477 if (!rq)
@@ -473,10 +485,11 @@ static void __blk_mq_free_request(struct request *rq)
473{ 485{
474 struct request_queue *q = rq->q; 486 struct request_queue *q = rq->q;
475 struct blk_mq_ctx *ctx = rq->mq_ctx; 487 struct blk_mq_ctx *ctx = rq->mq_ctx;
476 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 488 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
477 const int sched_tag = rq->internal_tag; 489 const int sched_tag = rq->internal_tag;
478 490
479 blk_pm_mark_last_busy(rq); 491 blk_pm_mark_last_busy(rq);
492 rq->mq_hctx = NULL;
480 if (rq->tag != -1) 493 if (rq->tag != -1)
481 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 494 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
482 if (sched_tag != -1) 495 if (sched_tag != -1)
@@ -490,11 +503,11 @@ void blk_mq_free_request(struct request *rq)
490 struct request_queue *q = rq->q; 503 struct request_queue *q = rq->q;
491 struct elevator_queue *e = q->elevator; 504 struct elevator_queue *e = q->elevator;
492 struct blk_mq_ctx *ctx = rq->mq_ctx; 505 struct blk_mq_ctx *ctx = rq->mq_ctx;
493 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 506 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
494 507
495 if (rq->rq_flags & RQF_ELVPRIV) { 508 if (rq->rq_flags & RQF_ELVPRIV) {
496 if (e && e->type->ops.mq.finish_request) 509 if (e && e->type->ops.finish_request)
497 e->type->ops.mq.finish_request(rq); 510 e->type->ops.finish_request(rq);
498 if (rq->elv.icq) { 511 if (rq->elv.icq) {
499 put_io_context(rq->elv.icq->ioc); 512 put_io_context(rq->elv.icq->ioc);
500 rq->elv.icq = NULL; 513 rq->elv.icq = NULL;
@@ -510,9 +523,6 @@ void blk_mq_free_request(struct request *rq)
510 523
511 rq_qos_done(q, rq); 524 rq_qos_done(q, rq);
512 525
513 if (blk_rq_rl(rq))
514 blk_put_rl(blk_rq_rl(rq));
515
516 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 526 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
517 if (refcount_dec_and_test(&rq->ref)) 527 if (refcount_dec_and_test(&rq->ref))
518 __blk_mq_free_request(rq); 528 __blk_mq_free_request(rq);
@@ -521,7 +531,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
521 531
522inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 532inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
523{ 533{
524 u64 now = ktime_get_ns(); 534 u64 now = 0;
535
536 if (blk_mq_need_time_stamp(rq))
537 now = ktime_get_ns();
525 538
526 if (rq->rq_flags & RQF_STATS) { 539 if (rq->rq_flags & RQF_STATS) {
527 blk_mq_poll_stats_start(rq->q); 540 blk_mq_poll_stats_start(rq->q);
@@ -555,19 +568,19 @@ EXPORT_SYMBOL(blk_mq_end_request);
555static void __blk_mq_complete_request_remote(void *data) 568static void __blk_mq_complete_request_remote(void *data)
556{ 569{
557 struct request *rq = data; 570 struct request *rq = data;
571 struct request_queue *q = rq->q;
558 572
559 rq->q->softirq_done_fn(rq); 573 q->mq_ops->complete(rq);
560} 574}
561 575
562static void __blk_mq_complete_request(struct request *rq) 576static void __blk_mq_complete_request(struct request *rq)
563{ 577{
564 struct blk_mq_ctx *ctx = rq->mq_ctx; 578 struct blk_mq_ctx *ctx = rq->mq_ctx;
579 struct request_queue *q = rq->q;
565 bool shared = false; 580 bool shared = false;
566 int cpu; 581 int cpu;
567 582
568 if (!blk_mq_mark_complete(rq)) 583 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
569 return;
570
571 /* 584 /*
572 * Most of single queue controllers, there is only one irq vector 585 * Most of single queue controllers, there is only one irq vector
573 * for handling IO completion, and the only irq's affinity is set 586 * for handling IO completion, and the only irq's affinity is set
@@ -577,18 +590,23 @@ static void __blk_mq_complete_request(struct request *rq)
 577 * So complete IO request in softirq context in case of single queue 590
578 * for not degrading IO performance by irqsoff latency. 591 * for not degrading IO performance by irqsoff latency.
579 */ 592 */
580 if (rq->q->nr_hw_queues == 1) { 593 if (q->nr_hw_queues == 1) {
581 __blk_complete_request(rq); 594 __blk_complete_request(rq);
582 return; 595 return;
583 } 596 }
584 597
585 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 598 /*
 586 rq->q->softirq_done_fn(rq); 599 * For a polled request, always complete locally; it's pointless
600 * to redirect the completion.
601 */
602 if ((rq->cmd_flags & REQ_HIPRI) ||
603 !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
604 q->mq_ops->complete(rq);
587 return; 605 return;
588 } 606 }
589 607
590 cpu = get_cpu(); 608 cpu = get_cpu();
591 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 609 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
592 shared = cpus_share_cache(cpu, ctx->cpu); 610 shared = cpus_share_cache(cpu, ctx->cpu);
593 611
594 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 612 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
@@ -597,7 +615,7 @@ static void __blk_mq_complete_request(struct request *rq)
597 rq->csd.flags = 0; 615 rq->csd.flags = 0;
598 smp_call_function_single_async(ctx->cpu, &rq->csd); 616 smp_call_function_single_async(ctx->cpu, &rq->csd);
599 } else { 617 } else {
600 rq->q->softirq_done_fn(rq); 618 q->mq_ops->complete(rq);
601 } 619 }
602 put_cpu(); 620 put_cpu();
603} 621}
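
With softirq_done_fn gone, every completion path above ends in the driver's ->complete() hook from struct blk_mq_ops. A minimal sketch of such a hook; the mydrv_* names (and the queue_rq hook it is paired with) are illustrative assumptions:

static void mydrv_complete_rq(struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	/* translate the driver's own status and finish the request */
	blk_mq_end_request(rq, cmd->status);
}

static const struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,	/* assumed to exist elsewhere */
	.complete	= mydrv_complete_rq,
};
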
@@ -630,11 +648,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
630 * Ends all I/O on a request. It does not handle partial completions. 648 * Ends all I/O on a request. It does not handle partial completions.
631 * The actual completion happens out-of-order, through a IPI handler. 649 * The actual completion happens out-of-order, through a IPI handler.
632 **/ 650 **/
633void blk_mq_complete_request(struct request *rq) 651bool blk_mq_complete_request(struct request *rq)
634{ 652{
635 if (unlikely(blk_should_fake_timeout(rq->q))) 653 if (unlikely(blk_should_fake_timeout(rq->q)))
636 return; 654 return false;
637 __blk_mq_complete_request(rq); 655 __blk_mq_complete_request(rq);
656 return true;
638} 657}
639EXPORT_SYMBOL(blk_mq_complete_request); 658EXPORT_SYMBOL(blk_mq_complete_request);
640 659
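
The bool return lets a caller know whether the completion actually went through (it is suppressed while fake-timeout fault injection is active). A sketch of the kind of accounting this enables in a polling driver; mydrv_next_completed() and the queue type are hypothetical:

static int mydrv_poll_once(struct mydrv_queue *mq)
{
	struct request *rq;
	int found = 0;

	while ((rq = mydrv_next_completed(mq)) != NULL) {
		if (blk_mq_complete_request(rq))
			found++;	/* only count real completions */
	}

	return found;
}
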
@@ -701,7 +720,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
701 /* this request will be re-inserted to io scheduler queue */ 720 /* this request will be re-inserted to io scheduler queue */
702 blk_mq_sched_requeue_request(rq); 721 blk_mq_sched_requeue_request(rq);
703 722
704 BUG_ON(blk_queued_rq(rq)); 723 BUG_ON(!list_empty(&rq->queuelist));
705 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); 724 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
706} 725}
707EXPORT_SYMBOL(blk_mq_requeue_request); 726EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -786,6 +805,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
786} 805}
787EXPORT_SYMBOL(blk_mq_tag_to_rq); 806EXPORT_SYMBOL(blk_mq_tag_to_rq);
788 807
808static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
809 void *priv, bool reserved)
810{
811 /*
812 * If we find a request that is inflight and the queue matches,
813 * we know the queue is busy. Return false to stop the iteration.
814 */
815 if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
816 bool *busy = priv;
817
818 *busy = true;
819 return false;
820 }
821
822 return true;
823}
824
825bool blk_mq_queue_inflight(struct request_queue *q)
826{
827 bool busy = false;
828
829 blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
830 return busy;
831}
832EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
833
789static void blk_mq_rq_timed_out(struct request *req, bool reserved) 834static void blk_mq_rq_timed_out(struct request *req, bool reserved)
790{ 835{
791 req->rq_flags |= RQF_TIMED_OUT; 836 req->rq_flags |= RQF_TIMED_OUT;
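
blk_mq_queue_inflight() above gives outside users a cheap busy check built on the bool-returning tag iteration. An illustrative use with a hypothetical caller:

/* e.g. skip an idle-time transition while any request is still in flight */
static bool mydrv_may_power_down(struct request_queue *q)
{
	return !blk_mq_queue_inflight(q);
}
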
@@ -810,7 +855,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
810 if (rq->rq_flags & RQF_TIMED_OUT) 855 if (rq->rq_flags & RQF_TIMED_OUT)
811 return false; 856 return false;
812 857
813 deadline = blk_rq_deadline(rq); 858 deadline = READ_ONCE(rq->deadline);
814 if (time_after_eq(jiffies, deadline)) 859 if (time_after_eq(jiffies, deadline))
815 return true; 860 return true;
816 861
@@ -821,7 +866,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
821 return false; 866 return false;
822} 867}
823 868
824static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 869static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
825 struct request *rq, void *priv, bool reserved) 870 struct request *rq, void *priv, bool reserved)
826{ 871{
827 unsigned long *next = priv; 872 unsigned long *next = priv;
@@ -831,7 +876,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 831 * so we're not unnecessarily synchronizing across CPUs. 876
832 */ 877 */
833 if (!blk_mq_req_expired(rq, next)) 878 if (!blk_mq_req_expired(rq, next))
834 return; 879 return true;
835 880
836 /* 881 /*
837 * We have reason to believe the request may be expired. Take a 882 * We have reason to believe the request may be expired. Take a
@@ -843,7 +888,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
843 * timeout handler to posting a natural completion. 888 * timeout handler to posting a natural completion.
844 */ 889 */
845 if (!refcount_inc_not_zero(&rq->ref)) 890 if (!refcount_inc_not_zero(&rq->ref))
846 return; 891 return true;
847 892
848 /* 893 /*
849 * The request is now locked and cannot be reallocated underneath the 894 * The request is now locked and cannot be reallocated underneath the
@@ -855,6 +900,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
855 blk_mq_rq_timed_out(rq, reserved); 900 blk_mq_rq_timed_out(rq, reserved);
856 if (refcount_dec_and_test(&rq->ref)) 901 if (refcount_dec_and_test(&rq->ref))
857 __blk_mq_free_request(rq); 902 __blk_mq_free_request(rq);
903
904 return true;
858} 905}
859 906
860static void blk_mq_timeout_work(struct work_struct *work) 907static void blk_mq_timeout_work(struct work_struct *work)
@@ -911,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
911 struct flush_busy_ctx_data *flush_data = data; 958 struct flush_busy_ctx_data *flush_data = data;
912 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 959 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
913 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 960 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
961 enum hctx_type type = hctx->type;
914 962
915 spin_lock(&ctx->lock); 963 spin_lock(&ctx->lock);
916 list_splice_tail_init(&ctx->rq_list, flush_data->list); 964 list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
917 sbitmap_clear_bit(sb, bitnr); 965 sbitmap_clear_bit(sb, bitnr);
918 spin_unlock(&ctx->lock); 966 spin_unlock(&ctx->lock);
919 return true; 967 return true;
@@ -945,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
945 struct dispatch_rq_data *dispatch_data = data; 993 struct dispatch_rq_data *dispatch_data = data;
946 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; 994 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
947 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 995 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
996 enum hctx_type type = hctx->type;
948 997
949 spin_lock(&ctx->lock); 998 spin_lock(&ctx->lock);
950 if (!list_empty(&ctx->rq_list)) { 999 if (!list_empty(&ctx->rq_lists[type])) {
951 dispatch_data->rq = list_entry_rq(ctx->rq_list.next); 1000 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
952 list_del_init(&dispatch_data->rq->queuelist); 1001 list_del_init(&dispatch_data->rq->queuelist);
953 if (list_empty(&ctx->rq_list)) 1002 if (list_empty(&ctx->rq_lists[type]))
954 sbitmap_clear_bit(sb, bitnr); 1003 sbitmap_clear_bit(sb, bitnr);
955 } 1004 }
956 spin_unlock(&ctx->lock); 1005 spin_unlock(&ctx->lock);
@@ -961,7 +1010,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
961struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 1010struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
962 struct blk_mq_ctx *start) 1011 struct blk_mq_ctx *start)
963{ 1012{
964 unsigned off = start ? start->index_hw : 0; 1013 unsigned off = start ? start->index_hw[hctx->type] : 0;
965 struct dispatch_rq_data data = { 1014 struct dispatch_rq_data data = {
966 .hctx = hctx, 1015 .hctx = hctx,
967 .rq = NULL, 1016 .rq = NULL,
@@ -985,8 +1034,9 @@ bool blk_mq_get_driver_tag(struct request *rq)
985{ 1034{
986 struct blk_mq_alloc_data data = { 1035 struct blk_mq_alloc_data data = {
987 .q = rq->q, 1036 .q = rq->q,
988 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), 1037 .hctx = rq->mq_hctx,
989 .flags = BLK_MQ_REQ_NOWAIT, 1038 .flags = BLK_MQ_REQ_NOWAIT,
1039 .cmd_flags = rq->cmd_flags,
990 }; 1040 };
991 bool shared; 1041 bool shared;
992 1042
@@ -1150,7 +1200,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1150 1200
1151 rq = list_first_entry(list, struct request, queuelist); 1201 rq = list_first_entry(list, struct request, queuelist);
1152 1202
1153 hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); 1203 hctx = rq->mq_hctx;
1154 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) 1204 if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1155 break; 1205 break;
1156 1206
@@ -1223,6 +1273,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1223 if (!list_empty(list)) { 1273 if (!list_empty(list)) {
1224 bool needs_restart; 1274 bool needs_restart;
1225 1275
1276 /*
1277 * If we didn't flush the entire list, we could have told
1278 * the driver there was more coming, but that turned out to
1279 * be a lie.
1280 */
1281 if (q->mq_ops->commit_rqs)
1282 q->mq_ops->commit_rqs(hctx);
1283
1226 spin_lock(&hctx->lock); 1284 spin_lock(&hctx->lock);
1227 list_splice_init(list, &hctx->dispatch); 1285 list_splice_init(list, &hctx->dispatch);
1228 spin_unlock(&hctx->lock); 1286 spin_unlock(&hctx->lock);
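
The ->commit_rqs() hook used here is meant for drivers that defer their doorbell or kick until they see bd->last; the core calls it when a batch ends earlier than announced. A sketch of a driver-side implementation, with illustrative mydrv_* names:

static void mydrv_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct mydrv_queue *mq = hctx->driver_data;

	/* make every command queued so far visible to the hardware */
	mydrv_ring_doorbell(mq);
}
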
@@ -1552,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1552 bool at_head) 1610 bool at_head)
1553{ 1611{
1554 struct blk_mq_ctx *ctx = rq->mq_ctx; 1612 struct blk_mq_ctx *ctx = rq->mq_ctx;
1613 enum hctx_type type = hctx->type;
1555 1614
1556 lockdep_assert_held(&ctx->lock); 1615 lockdep_assert_held(&ctx->lock);
1557 1616
1558 trace_block_rq_insert(hctx->queue, rq); 1617 trace_block_rq_insert(hctx->queue, rq);
1559 1618
1560 if (at_head) 1619 if (at_head)
1561 list_add(&rq->queuelist, &ctx->rq_list); 1620 list_add(&rq->queuelist, &ctx->rq_lists[type]);
1562 else 1621 else
1563 list_add_tail(&rq->queuelist, &ctx->rq_list); 1622 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
1564} 1623}
1565 1624
1566void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 1625void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1580,8 +1639,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1580 */ 1639 */
1581void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) 1640void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1582{ 1641{
1583 struct blk_mq_ctx *ctx = rq->mq_ctx; 1642 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1584 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1585 1643
1586 spin_lock(&hctx->lock); 1644 spin_lock(&hctx->lock);
1587 list_add_tail(&rq->queuelist, &hctx->dispatch); 1645 list_add_tail(&rq->queuelist, &hctx->dispatch);
@@ -1596,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1596 1654
1597{ 1655{
1598 struct request *rq; 1656 struct request *rq;
1657 enum hctx_type type = hctx->type;
1599 1658
1600 /* 1659 /*
1601 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1660 * preemption doesn't flush plug list, so it's possible ctx->cpu is
@@ -1607,35 +1666,46 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1607 } 1666 }
1608 1667
1609 spin_lock(&ctx->lock); 1668 spin_lock(&ctx->lock);
1610 list_splice_tail_init(list, &ctx->rq_list); 1669 list_splice_tail_init(list, &ctx->rq_lists[type]);
1611 blk_mq_hctx_mark_pending(hctx, ctx); 1670 blk_mq_hctx_mark_pending(hctx, ctx);
1612 spin_unlock(&ctx->lock); 1671 spin_unlock(&ctx->lock);
1613} 1672}
1614 1673
1615static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1674static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
1616{ 1675{
1617 struct request *rqa = container_of(a, struct request, queuelist); 1676 struct request *rqa = container_of(a, struct request, queuelist);
1618 struct request *rqb = container_of(b, struct request, queuelist); 1677 struct request *rqb = container_of(b, struct request, queuelist);
1619 1678
1620 return !(rqa->mq_ctx < rqb->mq_ctx || 1679 if (rqa->mq_ctx < rqb->mq_ctx)
1621 (rqa->mq_ctx == rqb->mq_ctx && 1680 return -1;
1622 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 1681 else if (rqa->mq_ctx > rqb->mq_ctx)
1682 return 1;
1683 else if (rqa->mq_hctx < rqb->mq_hctx)
1684 return -1;
1685 else if (rqa->mq_hctx > rqb->mq_hctx)
1686 return 1;
1687
1688 return blk_rq_pos(rqa) > blk_rq_pos(rqb);
1623} 1689}
1624 1690
1625void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1691void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1626{ 1692{
1693 struct blk_mq_hw_ctx *this_hctx;
1627 struct blk_mq_ctx *this_ctx; 1694 struct blk_mq_ctx *this_ctx;
1628 struct request_queue *this_q; 1695 struct request_queue *this_q;
1629 struct request *rq; 1696 struct request *rq;
1630 LIST_HEAD(list); 1697 LIST_HEAD(list);
1631 LIST_HEAD(ctx_list); 1698 LIST_HEAD(rq_list);
1632 unsigned int depth; 1699 unsigned int depth;
1633 1700
1634 list_splice_init(&plug->mq_list, &list); 1701 list_splice_init(&plug->mq_list, &list);
1702 plug->rq_count = 0;
1635 1703
1636 list_sort(NULL, &list, plug_ctx_cmp); 1704 if (plug->rq_count > 2 && plug->multiple_queues)
1705 list_sort(NULL, &list, plug_rq_cmp);
1637 1706
1638 this_q = NULL; 1707 this_q = NULL;
1708 this_hctx = NULL;
1639 this_ctx = NULL; 1709 this_ctx = NULL;
1640 depth = 0; 1710 depth = 0;
1641 1711
@@ -1643,30 +1713,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1643 rq = list_entry_rq(list.next); 1713 rq = list_entry_rq(list.next);
1644 list_del_init(&rq->queuelist); 1714 list_del_init(&rq->queuelist);
1645 BUG_ON(!rq->q); 1715 BUG_ON(!rq->q);
1646 if (rq->mq_ctx != this_ctx) { 1716 if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
1647 if (this_ctx) { 1717 if (this_hctx) {
1648 trace_block_unplug(this_q, depth, !from_schedule); 1718 trace_block_unplug(this_q, depth, !from_schedule);
1649 blk_mq_sched_insert_requests(this_q, this_ctx, 1719 blk_mq_sched_insert_requests(this_hctx, this_ctx,
1650 &ctx_list, 1720 &rq_list,
1651 from_schedule); 1721 from_schedule);
1652 } 1722 }
1653 1723
1654 this_ctx = rq->mq_ctx;
1655 this_q = rq->q; 1724 this_q = rq->q;
1725 this_ctx = rq->mq_ctx;
1726 this_hctx = rq->mq_hctx;
1656 depth = 0; 1727 depth = 0;
1657 } 1728 }
1658 1729
1659 depth++; 1730 depth++;
1660 list_add_tail(&rq->queuelist, &ctx_list); 1731 list_add_tail(&rq->queuelist, &rq_list);
1661 } 1732 }
1662 1733
1663 /* 1734 /*
1664 * If 'this_ctx' is set, we know we have entries to complete 1735 * If 'this_hctx' is set, we know we have entries to complete
1665 * on 'ctx_list'. Do those. 1736 * on 'rq_list'. Do those.
1666 */ 1737 */
1667 if (this_ctx) { 1738 if (this_hctx) {
1668 trace_block_unplug(this_q, depth, !from_schedule); 1739 trace_block_unplug(this_q, depth, !from_schedule);
1669 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, 1740 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
1670 from_schedule); 1741 from_schedule);
1671 } 1742 }
1672} 1743}
@@ -1675,27 +1746,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1675{ 1746{
1676 blk_init_request_from_bio(rq, bio); 1747 blk_init_request_from_bio(rq, bio);
1677 1748
1678 blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1679
1680 blk_account_io_start(rq, true); 1749 blk_account_io_start(rq, true);
1681} 1750}
1682 1751
1683static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1684{
1685 if (rq->tag != -1)
1686 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1687
1688 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1689}
1690
1691static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 1752static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1692 struct request *rq, 1753 struct request *rq,
1693 blk_qc_t *cookie) 1754 blk_qc_t *cookie, bool last)
1694{ 1755{
1695 struct request_queue *q = rq->q; 1756 struct request_queue *q = rq->q;
1696 struct blk_mq_queue_data bd = { 1757 struct blk_mq_queue_data bd = {
1697 .rq = rq, 1758 .rq = rq,
1698 .last = true, 1759 .last = last,
1699 }; 1760 };
1700 blk_qc_t new_cookie; 1761 blk_qc_t new_cookie;
1701 blk_status_t ret; 1762 blk_status_t ret;
@@ -1727,77 +1788,74 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1727 return ret; 1788 return ret;
1728} 1789}
1729 1790
1730static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1791blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1731 struct request *rq, 1792 struct request *rq,
1732 blk_qc_t *cookie, 1793 blk_qc_t *cookie,
1733 bool bypass_insert) 1794 bool bypass, bool last)
1734{ 1795{
1735 struct request_queue *q = rq->q; 1796 struct request_queue *q = rq->q;
1736 bool run_queue = true; 1797 bool run_queue = true;
1798 blk_status_t ret = BLK_STS_RESOURCE;
1799 int srcu_idx;
1800 bool force = false;
1737 1801
1802 hctx_lock(hctx, &srcu_idx);
1738 /* 1803 /*
1739 * RCU or SRCU read lock is needed before checking quiesced flag. 1804 * hctx_lock is needed before checking quiesced flag.
1740 * 1805 *
1741 * When queue is stopped or quiesced, ignore 'bypass_insert' from 1806 * When queue is stopped or quiesced, ignore 'bypass', insert
 1742 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, 1807 * and return BLK_STS_OK to the caller so the driver does not try to
1743 * and avoid driver to try to dispatch again. 1808 * dispatch again.
1744 */ 1809 */
1745 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { 1810 if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) {
1746 run_queue = false; 1811 run_queue = false;
1747 bypass_insert = false; 1812 bypass = false;
1748 goto insert; 1813 goto out_unlock;
1749 } 1814 }
1750 1815
1751 if (q->elevator && !bypass_insert) 1816 if (unlikely(q->elevator && !bypass))
1752 goto insert; 1817 goto out_unlock;
1753 1818
1754 if (!blk_mq_get_dispatch_budget(hctx)) 1819 if (!blk_mq_get_dispatch_budget(hctx))
1755 goto insert; 1820 goto out_unlock;
1756 1821
1757 if (!blk_mq_get_driver_tag(rq)) { 1822 if (!blk_mq_get_driver_tag(rq)) {
1758 blk_mq_put_dispatch_budget(hctx); 1823 blk_mq_put_dispatch_budget(hctx);
1759 goto insert; 1824 goto out_unlock;
1760 } 1825 }
1761 1826
1762 return __blk_mq_issue_directly(hctx, rq, cookie); 1827 /*
1763insert: 1828 * Always add a request that has been through
 1764 if (bypass_insert) 1829 * .queue_rq() to the hardware dispatch list.
1765 return BLK_STS_RESOURCE; 1830 */
1766 1831 force = true;
1767 blk_mq_sched_insert_request(rq, false, run_queue, false); 1832 ret = __blk_mq_issue_directly(hctx, rq, cookie, last);
1768 return BLK_STS_OK; 1833out_unlock:
1769}
1770
1771static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1772 struct request *rq, blk_qc_t *cookie)
1773{
1774 blk_status_t ret;
1775 int srcu_idx;
1776
1777 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1778
1779 hctx_lock(hctx, &srcu_idx);
1780
1781 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
1782 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1783 blk_mq_sched_insert_request(rq, false, true, false);
1784 else if (ret != BLK_STS_OK)
1785 blk_mq_end_request(rq, ret);
1786
1787 hctx_unlock(hctx, srcu_idx);
1788}
1789
1790blk_status_t blk_mq_request_issue_directly(struct request *rq)
1791{
1792 blk_status_t ret;
1793 int srcu_idx;
1794 blk_qc_t unused_cookie;
1795 struct blk_mq_ctx *ctx = rq->mq_ctx;
1796 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1797
1798 hctx_lock(hctx, &srcu_idx);
1799 ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
1800 hctx_unlock(hctx, srcu_idx); 1834 hctx_unlock(hctx, srcu_idx);
1835 switch (ret) {
1836 case BLK_STS_OK:
1837 break;
1838 case BLK_STS_DEV_RESOURCE:
1839 case BLK_STS_RESOURCE:
1840 if (force) {
1841 blk_mq_request_bypass_insert(rq, run_queue);
1842 /*
1843 * We have to return BLK_STS_OK for the DM
1844 * to avoid livelock. Otherwise, we return
1845 * the real result to indicate whether the
1846 * request is direct-issued successfully.
1847 */
1848 ret = bypass ? BLK_STS_OK : ret;
1849 } else if (!bypass) {
1850 blk_mq_sched_insert_request(rq, false,
1851 run_queue, false);
1852 }
1853 break;
1854 default:
1855 if (!bypass)
1856 blk_mq_end_request(rq, ret);
1857 break;
1858 }
1801 1859
1802 return ret; 1860 return ret;
1803} 1861}
@@ -1805,21 +1863,42 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
1805void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 1863void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
1806 struct list_head *list) 1864 struct list_head *list)
1807{ 1865{
1866 blk_qc_t unused;
1867 blk_status_t ret = BLK_STS_OK;
1868
1808 while (!list_empty(list)) { 1869 while (!list_empty(list)) {
1809 blk_status_t ret;
1810 struct request *rq = list_first_entry(list, struct request, 1870 struct request *rq = list_first_entry(list, struct request,
1811 queuelist); 1871 queuelist);
1812 1872
1813 list_del_init(&rq->queuelist); 1873 list_del_init(&rq->queuelist);
1814 ret = blk_mq_request_issue_directly(rq); 1874 if (ret == BLK_STS_OK)
1815 if (ret != BLK_STS_OK) { 1875 ret = blk_mq_try_issue_directly(hctx, rq, &unused,
1816 if (ret == BLK_STS_RESOURCE || 1876 false,
1817 ret == BLK_STS_DEV_RESOURCE) { 1877 list_empty(list));
1818 list_add(&rq->queuelist, list); 1878 else
1819 break; 1879 blk_mq_sched_insert_request(rq, false, true, false);
1820 } 1880 }
1821 blk_mq_end_request(rq, ret); 1881
1822 } 1882 /*
1883 * If we didn't flush the entire list, we could have told
1884 * the driver there was more coming, but that turned out to
1885 * be a lie.
1886 */
1887 if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs)
1888 hctx->queue->mq_ops->commit_rqs(hctx);
1889}
1890
1891static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
1892{
1893 list_add_tail(&rq->queuelist, &plug->mq_list);
1894 plug->rq_count++;
1895 if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
1896 struct request *tmp;
1897
1898 tmp = list_first_entry(&plug->mq_list, struct request,
1899 queuelist);
1900 if (tmp->q != rq->q)
1901 plug->multiple_queues = true;
1823 } 1902 }
1824} 1903}
1825 1904
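
blk_add_rq_to_plug() leans on two pieces of plug state added in this series; their layout in struct blk_plug is roughly the sketch below (other members and exact ordering may differ):

struct blk_plug {
	struct list_head mq_list;	/* blk-mq requests batched by this plug */
	struct list_head cb_list;	/* unplug callbacks */
	unsigned short rq_count;	/* number of plugged requests */
	bool multiple_queues;		/* plugged requests span more than one queue */
};
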
@@ -1827,9 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1827{ 1906{
1828 const int is_sync = op_is_sync(bio->bi_opf); 1907 const int is_sync = op_is_sync(bio->bi_opf);
1829 const int is_flush_fua = op_is_flush(bio->bi_opf); 1908 const int is_flush_fua = op_is_flush(bio->bi_opf);
1830 struct blk_mq_alloc_data data = { .flags = 0 }; 1909 struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf };
1831 struct request *rq; 1910 struct request *rq;
1832 unsigned int request_count = 0;
1833 struct blk_plug *plug; 1911 struct blk_plug *plug;
1834 struct request *same_queue_rq = NULL; 1912 struct request *same_queue_rq = NULL;
1835 blk_qc_t cookie; 1913 blk_qc_t cookie;
@@ -1842,15 +1920,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1842 return BLK_QC_T_NONE; 1920 return BLK_QC_T_NONE;
1843 1921
1844 if (!is_flush_fua && !blk_queue_nomerges(q) && 1922 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1845 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) 1923 blk_attempt_plug_merge(q, bio, &same_queue_rq))
1846 return BLK_QC_T_NONE; 1924 return BLK_QC_T_NONE;
1847 1925
1848 if (blk_mq_sched_bio_merge(q, bio)) 1926 if (blk_mq_sched_bio_merge(q, bio))
1849 return BLK_QC_T_NONE; 1927 return BLK_QC_T_NONE;
1850 1928
1851 rq_qos_throttle(q, bio, NULL); 1929 rq_qos_throttle(q, bio);
1852 1930
1853 rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); 1931 rq = blk_mq_get_request(q, bio, &data);
1854 if (unlikely(!rq)) { 1932 if (unlikely(!rq)) {
1855 rq_qos_cleanup(q, bio); 1933 rq_qos_cleanup(q, bio);
1856 if (bio->bi_opf & REQ_NOWAIT) 1934 if (bio->bi_opf & REQ_NOWAIT)
@@ -1872,21 +1950,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1872 /* bypass scheduler for flush rq */ 1950 /* bypass scheduler for flush rq */
1873 blk_insert_flush(rq); 1951 blk_insert_flush(rq);
1874 blk_mq_run_hw_queue(data.hctx, true); 1952 blk_mq_run_hw_queue(data.hctx, true);
1875 } else if (plug && q->nr_hw_queues == 1) { 1953 } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
1954 /*
1955 * Use plugging if we have a ->commit_rqs() hook as well, as
1956 * we know the driver uses bd->last in a smart fashion.
1957 */
1958 unsigned int request_count = plug->rq_count;
1876 struct request *last = NULL; 1959 struct request *last = NULL;
1877 1960
1878 blk_mq_put_ctx(data.ctx); 1961 blk_mq_put_ctx(data.ctx);
1879 blk_mq_bio_to_request(rq, bio); 1962 blk_mq_bio_to_request(rq, bio);
1880 1963
1881 /*
1882 * @request_count may become stale because of schedule
1883 * out, so check the list again.
1884 */
1885 if (list_empty(&plug->mq_list))
1886 request_count = 0;
1887 else if (blk_queue_nomerges(q))
1888 request_count = blk_plug_queued_count(q);
1889
1890 if (!request_count) 1964 if (!request_count)
1891 trace_block_plug(q); 1965 trace_block_plug(q);
1892 else 1966 else
@@ -1898,7 +1972,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1898 trace_block_plug(q); 1972 trace_block_plug(q);
1899 } 1973 }
1900 1974
1901 list_add_tail(&rq->queuelist, &plug->mq_list); 1975 blk_add_rq_to_plug(plug, rq);
1902 } else if (plug && !blk_queue_nomerges(q)) { 1976 } else if (plug && !blk_queue_nomerges(q)) {
1903 blk_mq_bio_to_request(rq, bio); 1977 blk_mq_bio_to_request(rq, bio);
1904 1978
@@ -1911,23 +1985,24 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1911 */ 1985 */
1912 if (list_empty(&plug->mq_list)) 1986 if (list_empty(&plug->mq_list))
1913 same_queue_rq = NULL; 1987 same_queue_rq = NULL;
1914 if (same_queue_rq) 1988 if (same_queue_rq) {
1915 list_del_init(&same_queue_rq->queuelist); 1989 list_del_init(&same_queue_rq->queuelist);
1916 list_add_tail(&rq->queuelist, &plug->mq_list); 1990 plug->rq_count--;
1991 }
1992 blk_add_rq_to_plug(plug, rq);
1917 1993
1918 blk_mq_put_ctx(data.ctx); 1994 blk_mq_put_ctx(data.ctx);
1919 1995
1920 if (same_queue_rq) { 1996 if (same_queue_rq) {
1921 data.hctx = blk_mq_map_queue(q, 1997 data.hctx = same_queue_rq->mq_hctx;
1922 same_queue_rq->mq_ctx->cpu);
1923 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 1998 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
1924 &cookie); 1999 &cookie, false, true);
1925 } 2000 }
1926 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && 2001 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1927 !data.hctx->dispatch_busy)) { 2002 !data.hctx->dispatch_busy)) {
1928 blk_mq_put_ctx(data.ctx); 2003 blk_mq_put_ctx(data.ctx);
1929 blk_mq_bio_to_request(rq, bio); 2004 blk_mq_bio_to_request(rq, bio);
1930 blk_mq_try_issue_directly(data.hctx, rq, &cookie); 2005 blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true);
1931 } else { 2006 } else {
1932 blk_mq_put_ctx(data.ctx); 2007 blk_mq_put_ctx(data.ctx);
1933 blk_mq_bio_to_request(rq, bio); 2008 blk_mq_bio_to_request(rq, bio);
@@ -1985,7 +2060,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1985 struct blk_mq_tags *tags; 2060 struct blk_mq_tags *tags;
1986 int node; 2061 int node;
1987 2062
1988 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 2063 node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
1989 if (node == NUMA_NO_NODE) 2064 if (node == NUMA_NO_NODE)
1990 node = set->numa_node; 2065 node = set->numa_node;
1991 2066
@@ -2041,7 +2116,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2041 size_t rq_size, left; 2116 size_t rq_size, left;
2042 int node; 2117 int node;
2043 2118
2044 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 2119 node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
2045 if (node == NUMA_NO_NODE) 2120 if (node == NUMA_NO_NODE)
2046 node = set->numa_node; 2121 node = set->numa_node;
2047 2122
@@ -2121,13 +2196,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
2121 struct blk_mq_hw_ctx *hctx; 2196 struct blk_mq_hw_ctx *hctx;
2122 struct blk_mq_ctx *ctx; 2197 struct blk_mq_ctx *ctx;
2123 LIST_HEAD(tmp); 2198 LIST_HEAD(tmp);
2199 enum hctx_type type;
2124 2200
2125 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 2201 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2126 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 2202 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2203 type = hctx->type;
2127 2204
2128 spin_lock(&ctx->lock); 2205 spin_lock(&ctx->lock);
2129 if (!list_empty(&ctx->rq_list)) { 2206 if (!list_empty(&ctx->rq_lists[type])) {
2130 list_splice_init(&ctx->rq_list, &tmp); 2207 list_splice_init(&ctx->rq_lists[type], &tmp);
2131 blk_mq_hctx_clear_pending(hctx, ctx); 2208 blk_mq_hctx_clear_pending(hctx, ctx);
2132 } 2209 }
2133 spin_unlock(&ctx->lock); 2210 spin_unlock(&ctx->lock);
@@ -2258,24 +2335,30 @@ static int blk_mq_init_hctx(struct request_queue *q,
2258static void blk_mq_init_cpu_queues(struct request_queue *q, 2335static void blk_mq_init_cpu_queues(struct request_queue *q,
2259 unsigned int nr_hw_queues) 2336 unsigned int nr_hw_queues)
2260{ 2337{
2261 unsigned int i; 2338 struct blk_mq_tag_set *set = q->tag_set;
2339 unsigned int i, j;
2262 2340
2263 for_each_possible_cpu(i) { 2341 for_each_possible_cpu(i) {
2264 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 2342 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2265 struct blk_mq_hw_ctx *hctx; 2343 struct blk_mq_hw_ctx *hctx;
2344 int k;
2266 2345
2267 __ctx->cpu = i; 2346 __ctx->cpu = i;
2268 spin_lock_init(&__ctx->lock); 2347 spin_lock_init(&__ctx->lock);
2269 INIT_LIST_HEAD(&__ctx->rq_list); 2348 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2349 INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2350
2270 __ctx->queue = q; 2351 __ctx->queue = q;
2271 2352
2272 /* 2353 /*
2273 * Set local node, IFF we have more than one hw queue. If 2354 * Set local node, IFF we have more than one hw queue. If
2274 * not, we remain on the home node of the device 2355 * not, we remain on the home node of the device
2275 */ 2356 */
2276 hctx = blk_mq_map_queue(q, i); 2357 for (j = 0; j < set->nr_maps; j++) {
2277 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 2358 hctx = blk_mq_map_queue_type(q, j, i);
2278 hctx->numa_node = local_memory_node(cpu_to_node(i)); 2359 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2360 hctx->numa_node = local_memory_node(cpu_to_node(i));
2361 }
2279 } 2362 }
2280} 2363}
2281 2364
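
blk_mq_map_queue_type() used above resolves a (map type, CPU) pair to a hardware context through the per-type mq_map; conceptually it boils down to the sketch below (the real helper lives in block/blk-mq.h):

static inline struct blk_mq_hw_ctx *sketch_map_queue_type(struct request_queue *q,
							  unsigned int type,
							  unsigned int cpu)
{
	unsigned int hw_idx = q->tag_set->map[type].mq_map[cpu];

	return q->queue_hw_ctx[hw_idx];
}
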
@@ -2301,7 +2384,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2301static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, 2384static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2302 unsigned int hctx_idx) 2385 unsigned int hctx_idx)
2303{ 2386{
2304 if (set->tags[hctx_idx]) { 2387 if (set->tags && set->tags[hctx_idx]) {
2305 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); 2388 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2306 blk_mq_free_rq_map(set->tags[hctx_idx]); 2389 blk_mq_free_rq_map(set->tags[hctx_idx]);
2307 set->tags[hctx_idx] = NULL; 2390 set->tags[hctx_idx] = NULL;
@@ -2310,7 +2393,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2310 2393
2311static void blk_mq_map_swqueue(struct request_queue *q) 2394static void blk_mq_map_swqueue(struct request_queue *q)
2312{ 2395{
2313 unsigned int i, hctx_idx; 2396 unsigned int i, j, hctx_idx;
2314 struct blk_mq_hw_ctx *hctx; 2397 struct blk_mq_hw_ctx *hctx;
2315 struct blk_mq_ctx *ctx; 2398 struct blk_mq_ctx *ctx;
2316 struct blk_mq_tag_set *set = q->tag_set; 2399 struct blk_mq_tag_set *set = q->tag_set;
@@ -2332,7 +2415,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2332 * If the cpu isn't present, the cpu is mapped to first hctx. 2415 * If the cpu isn't present, the cpu is mapped to first hctx.
2333 */ 2416 */
2334 for_each_possible_cpu(i) { 2417 for_each_possible_cpu(i) {
2335 hctx_idx = q->mq_map[i]; 2418 hctx_idx = set->map[0].mq_map[i];
2336 /* unmapped hw queue can be remapped after CPU topo changed */ 2419 /* unmapped hw queue can be remapped after CPU topo changed */
2337 if (!set->tags[hctx_idx] && 2420 if (!set->tags[hctx_idx] &&
2338 !__blk_mq_alloc_rq_map(set, hctx_idx)) { 2421 !__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2342,15 +2425,35 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2342 * case, remap the current ctx to hctx[0] which 2425 * case, remap the current ctx to hctx[0] which
2343 * is guaranteed to always have tags allocated 2426 * is guaranteed to always have tags allocated
2344 */ 2427 */
2345 q->mq_map[i] = 0; 2428 set->map[0].mq_map[i] = 0;
2346 } 2429 }
2347 2430
2348 ctx = per_cpu_ptr(q->queue_ctx, i); 2431 ctx = per_cpu_ptr(q->queue_ctx, i);
2349 hctx = blk_mq_map_queue(q, i); 2432 for (j = 0; j < set->nr_maps; j++) {
2433 if (!set->map[j].nr_queues)
2434 continue;
2435
2436 hctx = blk_mq_map_queue_type(q, j, i);
2437
2438 /*
2439 * If the CPU is already set in the mask, then we've
2440 * mapped this one already. This can happen if
2441 * devices share queues across queue maps.
2442 */
2443 if (cpumask_test_cpu(i, hctx->cpumask))
2444 continue;
2445
2446 cpumask_set_cpu(i, hctx->cpumask);
2447 hctx->type = j;
2448 ctx->index_hw[hctx->type] = hctx->nr_ctx;
2449 hctx->ctxs[hctx->nr_ctx++] = ctx;
2350 2450
2351 cpumask_set_cpu(i, hctx->cpumask); 2451 /*
2352 ctx->index_hw = hctx->nr_ctx; 2452 * If the nr_ctx type overflows, we have exceeded the
 2353 hctx->ctxs[hctx->nr_ctx++] = ctx; 2453 * number of sw queues we can support.
2454 */
2455 BUG_ON(!hctx->nr_ctx);
2456 }
2354 } 2457 }
2355 2458
2356 mutex_unlock(&q->sysfs_lock); 2459 mutex_unlock(&q->sysfs_lock);
@@ -2440,8 +2543,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
2440static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 2543static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2441 struct request_queue *q) 2544 struct request_queue *q)
2442{ 2545{
2443 q->tag_set = set;
2444
2445 mutex_lock(&set->tag_list_lock); 2546 mutex_lock(&set->tag_list_lock);
2446 2547
2447 /* 2548 /*
@@ -2460,6 +2561,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2460 mutex_unlock(&set->tag_list_lock); 2561 mutex_unlock(&set->tag_list_lock);
2461} 2562}
2462 2563
2564/* All allocations will be freed in release handler of q->mq_kobj */
2565static int blk_mq_alloc_ctxs(struct request_queue *q)
2566{
2567 struct blk_mq_ctxs *ctxs;
2568 int cpu;
2569
2570 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
2571 if (!ctxs)
2572 return -ENOMEM;
2573
2574 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2575 if (!ctxs->queue_ctx)
2576 goto fail;
2577
2578 for_each_possible_cpu(cpu) {
2579 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
2580 ctx->ctxs = ctxs;
2581 }
2582
2583 q->mq_kobj = &ctxs->kobj;
2584 q->queue_ctx = ctxs->queue_ctx;
2585
2586 return 0;
2587 fail:
2588 kfree(ctxs);
2589 return -ENOMEM;
2590}
2591
2463/* 2592/*
2464 * It is the actual release handler for mq, but we do it from 2593 * It is the actual release handler for mq, but we do it from
2465 * request queue's release handler for avoiding use-after-free 2594 * request queue's release handler for avoiding use-after-free
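
blk_mq_alloc_ctxs() above pairs the per-CPU software contexts with the mq kobject in one container so both are released together from the mq_kobj release handler; its companion definition in block/blk-mq.h is, as a sketch:

struct blk_mq_ctxs {
	struct kobject kobj;
	struct blk_mq_ctx __percpu *queue_ctx;
};
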
@@ -2478,8 +2607,6 @@ void blk_mq_release(struct request_queue *q)
2478 kobject_put(&hctx->kobj); 2607 kobject_put(&hctx->kobj);
2479 } 2608 }
2480 2609
2481 q->mq_map = NULL;
2482
2483 kfree(q->queue_hw_ctx); 2610 kfree(q->queue_hw_ctx);
2484 2611
2485 /* 2612 /*
@@ -2487,15 +2614,13 @@ void blk_mq_release(struct request_queue *q)
2487 * both share lifetime with request queue. 2614 * both share lifetime with request queue.
2488 */ 2615 */
2489 blk_mq_sysfs_deinit(q); 2616 blk_mq_sysfs_deinit(q);
2490
2491 free_percpu(q->queue_ctx);
2492} 2617}
2493 2618
2494struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 2619struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2495{ 2620{
2496 struct request_queue *uninit_q, *q; 2621 struct request_queue *uninit_q, *q;
2497 2622
2498 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); 2623 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2499 if (!uninit_q) 2624 if (!uninit_q)
2500 return ERR_PTR(-ENOMEM); 2625 return ERR_PTR(-ENOMEM);
2501 2626
@@ -2522,6 +2647,7 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
2522 memset(set, 0, sizeof(*set)); 2647 memset(set, 0, sizeof(*set));
2523 set->ops = ops; 2648 set->ops = ops;
2524 set->nr_hw_queues = 1; 2649 set->nr_hw_queues = 1;
2650 set->nr_maps = 1;
2525 set->queue_depth = queue_depth; 2651 set->queue_depth = queue_depth;
2526 set->numa_node = NUMA_NO_NODE; 2652 set->numa_node = NUMA_NO_NODE;
2527 set->flags = set_flags; 2653 set->flags = set_flags;
@@ -2599,7 +2725,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2599 int node; 2725 int node;
2600 struct blk_mq_hw_ctx *hctx; 2726 struct blk_mq_hw_ctx *hctx;
2601 2727
2602 node = blk_mq_hw_queue_to_node(q->mq_map, i); 2728 node = blk_mq_hw_queue_to_node(&set->map[0], i);
2603 /* 2729 /*
2604 * If the hw queue has been mapped to another numa node, 2730 * If the hw queue has been mapped to another numa node,
2605 * we need to realloc the hctx. If allocation fails, fallback 2731 * we need to realloc the hctx. If allocation fails, fallback
@@ -2652,6 +2778,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2652 mutex_unlock(&q->sysfs_lock); 2778 mutex_unlock(&q->sysfs_lock);
2653} 2779}
2654 2780
2781/*
2782 * Maximum number of hardware queues we support. With a single queue map, we'll
2783 * never have more hardware queues than CPUs (software queues). With multiple
2784 * queue maps, the tag_set user may have set ->nr_hw_queues larger.
2785 */
2786static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
2787{
2788 if (set->nr_maps == 1)
2789 return nr_cpu_ids;
2790
2791 return max(set->nr_hw_queues, nr_cpu_ids);
2792}
2793
2655struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 2794struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2656 struct request_queue *q) 2795 struct request_queue *q)
2657{ 2796{
@@ -2664,19 +2803,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2664 if (!q->poll_cb) 2803 if (!q->poll_cb)
2665 goto err_exit; 2804 goto err_exit;
2666 2805
2667 q->queue_ctx = alloc_percpu(struct blk_mq_ctx); 2806 if (blk_mq_alloc_ctxs(q))
2668 if (!q->queue_ctx)
2669 goto err_exit; 2807 goto err_exit;
2670 2808
2671 /* init q->mq_kobj and sw queues' kobjects */ 2809 /* init q->mq_kobj and sw queues' kobjects */
2672 blk_mq_sysfs_init(q); 2810 blk_mq_sysfs_init(q);
2673 2811
2674 q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), 2812 q->nr_queues = nr_hw_queues(set);
2813 q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
2675 GFP_KERNEL, set->numa_node); 2814 GFP_KERNEL, set->numa_node);
2676 if (!q->queue_hw_ctx) 2815 if (!q->queue_hw_ctx)
2677 goto err_percpu; 2816 goto err_sys_init;
2678
2679 q->mq_map = set->mq_map;
2680 2817
2681 blk_mq_realloc_hw_ctxs(set, q); 2818 blk_mq_realloc_hw_ctxs(set, q);
2682 if (!q->nr_hw_queues) 2819 if (!q->nr_hw_queues)
@@ -2685,12 +2822,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2685 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 2822 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2686 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); 2823 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2687 2824
2688 q->nr_queues = nr_cpu_ids; 2825 q->tag_set = set;
2689 2826
2690 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 2827 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2828 if (set->nr_maps > HCTX_TYPE_POLL &&
2829 set->map[HCTX_TYPE_POLL].nr_queues)
2830 blk_queue_flag_set(QUEUE_FLAG_POLL, q);
2691 2831
2692 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 2832 if (!(set->flags & BLK_MQ_F_SG_MERGE))
2693 queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); 2833 blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
2694 2834
2695 q->sg_reserved_size = INT_MAX; 2835 q->sg_reserved_size = INT_MAX;
2696 2836
@@ -2699,8 +2839,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2699 spin_lock_init(&q->requeue_lock); 2839 spin_lock_init(&q->requeue_lock);
2700 2840
2701 blk_queue_make_request(q, blk_mq_make_request); 2841 blk_queue_make_request(q, blk_mq_make_request);
2702 if (q->mq_ops->poll)
2703 q->poll_fn = blk_mq_poll;
2704 2842
2705 /* 2843 /*
2706 * Do this after blk_queue_make_request() overrides it... 2844 * Do this after blk_queue_make_request() overrides it...
@@ -2712,9 +2850,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2712 */ 2850 */
2713 q->poll_nsec = -1; 2851 q->poll_nsec = -1;
2714 2852
2715 if (set->ops->complete)
2716 blk_queue_softirq_done(q, set->ops->complete);
2717
2718 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 2853 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2719 blk_mq_add_queue_tag_set(set, q); 2854 blk_mq_add_queue_tag_set(set, q);
2720 blk_mq_map_swqueue(q); 2855 blk_mq_map_swqueue(q);
@@ -2731,8 +2866,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2731 2866
2732err_hctxs: 2867err_hctxs:
2733 kfree(q->queue_hw_ctx); 2868 kfree(q->queue_hw_ctx);
2734err_percpu: 2869err_sys_init:
2735 free_percpu(q->queue_ctx); 2870 blk_mq_sysfs_deinit(q);
2736err_exit: 2871err_exit:
2737 q->mq_ops = NULL; 2872 q->mq_ops = NULL;
2738 return ERR_PTR(-ENOMEM); 2873 return ERR_PTR(-ENOMEM);
@@ -2801,7 +2936,9 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2801 2936
2802static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2937static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2803{ 2938{
2804 if (set->ops->map_queues) { 2939 if (set->ops->map_queues && !is_kdump_kernel()) {
2940 int i;
2941
2805 /* 2942 /*
2806 * transport .map_queues is usually done in the following 2943 * transport .map_queues is usually done in the following
2807 * way: 2944 * way:
@@ -2809,18 +2946,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2809 * for (queue = 0; queue < set->nr_hw_queues; queue++) { 2946 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
2810 * mask = get_cpu_mask(queue) 2947 * mask = get_cpu_mask(queue)
2811 * for_each_cpu(cpu, mask) 2948 * for_each_cpu(cpu, mask)
2812 * set->mq_map[cpu] = queue; 2949 * set->map[x].mq_map[cpu] = queue;
2813 * } 2950 * }
2814 * 2951 *
2815 * When we need to remap, the table has to be cleared for 2952 * When we need to remap, the table has to be cleared for
2816 * killing stale mapping since one CPU may not be mapped 2953 * killing stale mapping since one CPU may not be mapped
2817 * to any hw queue. 2954 * to any hw queue.
2818 */ 2955 */
2819 blk_mq_clear_mq_map(set); 2956 for (i = 0; i < set->nr_maps; i++)
2957 blk_mq_clear_mq_map(&set->map[i]);
2820 2958
2821 return set->ops->map_queues(set); 2959 return set->ops->map_queues(set);
2822 } else 2960 } else {
2823 return blk_mq_map_queues(set); 2961 BUG_ON(set->nr_maps > 1);
2962 return blk_mq_map_queues(&set->map[0]);
2963 }
2824} 2964}
2825 2965
2826/* 2966/*
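To make the pattern in the comment above concrete, a transport's ->map_queues() callback for the new per-type map layout might look roughly like the sketch below; foo_queue_affinity() is a hypothetical helper returning the CPU mask assigned to a hardware queue, and real drivers typically just call blk_mq_map_queues() or blk_mq_pci_map_queues() on each map instead:

static int foo_map_queues(struct blk_mq_tag_set *set)
{
        struct blk_mq_queue_map *qmap = &set->map[HCTX_TYPE_DEFAULT];
        unsigned int queue, cpu;

        for (queue = 0; queue < qmap->nr_queues; queue++) {
                const struct cpumask *mask = foo_queue_affinity(queue);

                /* every CPU in the queue's mask submits to this hardware queue */
                for_each_cpu(cpu, mask)
                        qmap->mq_map[cpu] = queue;
        }
        return 0;
}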
@@ -2831,7 +2971,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2831 */ 2971 */
2832int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2972int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2833{ 2973{
2834 int ret; 2974 int i, ret;
2835 2975
2836 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 2976 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2837 2977
@@ -2854,6 +2994,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2854 set->queue_depth = BLK_MQ_MAX_DEPTH; 2994 set->queue_depth = BLK_MQ_MAX_DEPTH;
2855 } 2995 }
2856 2996
2997 if (!set->nr_maps)
2998 set->nr_maps = 1;
2999 else if (set->nr_maps > HCTX_MAX_TYPES)
3000 return -EINVAL;
3001
2857 /* 3002 /*
2858 * If a crashdump is active, then we are potentially in a very 3003 * If a crashdump is active, then we are potentially in a very
2859 * memory constrained environment. Limit us to 1 queue and 3004 * memory constrained environment. Limit us to 1 queue and
@@ -2861,24 +3006,30 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2861 */ 3006 */
2862 if (is_kdump_kernel()) { 3007 if (is_kdump_kernel()) {
2863 set->nr_hw_queues = 1; 3008 set->nr_hw_queues = 1;
3009 set->nr_maps = 1;
2864 set->queue_depth = min(64U, set->queue_depth); 3010 set->queue_depth = min(64U, set->queue_depth);
2865 } 3011 }
2866 /* 3012 /*
2867 * There is no use for more h/w queues than cpus. 3013 * There is no use for more h/w queues than cpus if we just have
3014 * a single map
2868 */ 3015 */
2869 if (set->nr_hw_queues > nr_cpu_ids) 3016 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
2870 set->nr_hw_queues = nr_cpu_ids; 3017 set->nr_hw_queues = nr_cpu_ids;
2871 3018
2872 set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), 3019 set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),
2873 GFP_KERNEL, set->numa_node); 3020 GFP_KERNEL, set->numa_node);
2874 if (!set->tags) 3021 if (!set->tags)
2875 return -ENOMEM; 3022 return -ENOMEM;
2876 3023
2877 ret = -ENOMEM; 3024 ret = -ENOMEM;
2878 set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), 3025 for (i = 0; i < set->nr_maps; i++) {
2879 GFP_KERNEL, set->numa_node); 3026 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
2880 if (!set->mq_map) 3027 sizeof(set->map[i].mq_map[0]),
2881 goto out_free_tags; 3028 GFP_KERNEL, set->numa_node);
3029 if (!set->map[i].mq_map)
3030 goto out_free_mq_map;
3031 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3032 }
2882 3033
2883 ret = blk_mq_update_queue_map(set); 3034 ret = blk_mq_update_queue_map(set);
2884 if (ret) 3035 if (ret)
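For drivers that actually want more than one map, the per-map allocation above is driven entirely by what the driver fills into the tag_set before calling blk_mq_alloc_tag_set(). A hedged sketch of such a setup (foo, foo_mq_ops and the numbers are illustrative, not taken from any in-tree driver); note that with nr_maps > 1 the nr_hw_queues value may legitimately exceed nr_cpu_ids, which is why set->tags is sized with nr_hw_queues(set) above:

/* inside a hypothetical driver probe routine */
struct blk_mq_tag_set *set = &foo->tag_set;
int ret;

memset(set, 0, sizeof(*set));
set->ops = &foo_mq_ops;                         /* hypothetical blk_mq_ops */
set->nr_maps = HCTX_MAX_TYPES;                  /* default, read and poll maps */
set->nr_hw_queues = 3 * num_online_cpus();      /* one hw queue per CPU and type */
set->queue_depth = 128;
set->numa_node = NUMA_NO_NODE;
set->flags = BLK_MQ_F_SHOULD_MERGE;

ret = blk_mq_alloc_tag_set(set);        /* allocates set->tags and one mq_map per map */

The driver's ->map_queues() callback is then expected to populate each set->map[i], as blk_mq_update_queue_map() arranges above.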
@@ -2894,9 +3045,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2894 return 0; 3045 return 0;
2895 3046
2896out_free_mq_map: 3047out_free_mq_map:
2897 kfree(set->mq_map); 3048 for (i = 0; i < set->nr_maps; i++) {
2898 set->mq_map = NULL; 3049 kfree(set->map[i].mq_map);
2899out_free_tags: 3050 set->map[i].mq_map = NULL;
3051 }
2900 kfree(set->tags); 3052 kfree(set->tags);
2901 set->tags = NULL; 3053 set->tags = NULL;
2902 return ret; 3054 return ret;
@@ -2905,13 +3057,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2905 3057
2906void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 3058void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2907{ 3059{
2908 int i; 3060 int i, j;
2909 3061
2910 for (i = 0; i < nr_cpu_ids; i++) 3062 for (i = 0; i < nr_hw_queues(set); i++)
2911 blk_mq_free_map_and_requests(set, i); 3063 blk_mq_free_map_and_requests(set, i);
2912 3064
2913 kfree(set->mq_map); 3065 for (j = 0; j < set->nr_maps; j++) {
2914 set->mq_map = NULL; 3066 kfree(set->map[j].mq_map);
3067 set->map[j].mq_map = NULL;
3068 }
2915 3069
2916 kfree(set->tags); 3070 kfree(set->tags);
2917 set->tags = NULL; 3071 set->tags = NULL;
@@ -3037,7 +3191,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
3037 3191
3038 lockdep_assert_held(&set->tag_list_lock); 3192 lockdep_assert_held(&set->tag_list_lock);
3039 3193
3040 if (nr_hw_queues > nr_cpu_ids) 3194 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
3041 nr_hw_queues = nr_cpu_ids; 3195 nr_hw_queues = nr_cpu_ids;
3042 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) 3196 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3043 return; 3197 return;
@@ -3072,7 +3226,7 @@ fallback:
3072 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 3226 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3073 nr_hw_queues, prev_nr_hw_queues); 3227 nr_hw_queues, prev_nr_hw_queues);
3074 set->nr_hw_queues = prev_nr_hw_queues; 3228 set->nr_hw_queues = prev_nr_hw_queues;
3075 blk_mq_map_queues(set); 3229 blk_mq_map_queues(&set->map[0]);
3076 goto fallback; 3230 goto fallback;
3077 } 3231 }
3078 blk_mq_map_swqueue(q); 3232 blk_mq_map_swqueue(q);
@@ -3179,15 +3333,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3179 return false; 3333 return false;
3180 3334
3181 /* 3335 /*
3182 * poll_nsec can be: 3336 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
3183 * 3337 *
3184 * -1: don't ever hybrid sleep
3185 * 0: use half of prev avg 3338 * 0: use half of prev avg
3186 * >0: use this specific value 3339 * >0: use this specific value
3187 */ 3340 */
3188 if (q->poll_nsec == -1) 3341 if (q->poll_nsec > 0)
3189 return false;
3190 else if (q->poll_nsec > 0)
3191 nsecs = q->poll_nsec; 3342 nsecs = q->poll_nsec;
3192 else 3343 else
3193 nsecs = blk_mq_poll_nsecs(q, hctx, rq); 3344 nsecs = blk_mq_poll_nsecs(q, hctx, rq);
@@ -3224,11 +3375,57 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3224 return true; 3375 return true;
3225} 3376}
3226 3377
3227static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) 3378static bool blk_mq_poll_hybrid(struct request_queue *q,
3379 struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
3228{ 3380{
3229 struct request_queue *q = hctx->queue; 3381 struct request *rq;
3382
3383 if (q->poll_nsec == -1)
3384 return false;
3385
3386 if (!blk_qc_t_is_internal(cookie))
3387 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3388 else {
3389 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3390 /*
3391 * With scheduling, if the request has completed, we'll
3392 * get a NULL return here, as we clear the sched tag when
3393 * that happens. The request still remains valid, like always,
3394 * so we should be safe with just the NULL check.
3395 */
3396 if (!rq)
3397 return false;
3398 }
3399
3400 return blk_mq_poll_hybrid_sleep(q, hctx, rq);
3401}
3402
3403/**
3404 * blk_poll - poll for IO completions
3405 * @q: the queue
3406 * @cookie: cookie passed back at IO submission time
3407 * @spin: whether to spin for completions
3408 *
3409 * Description:
3410 * Poll for completions on the passed-in queue. Returns the number of
3411 * completed entries found. If @spin is true, then blk_poll will continue
3412 * looping until at least one completion is found, unless the task is
3413 * otherwise marked running (or we need to reschedule).
3414 */
3415int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3416{
3417 struct blk_mq_hw_ctx *hctx;
3230 long state; 3418 long state;
3231 3419
3420 if (!blk_qc_t_valid(cookie) ||
3421 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3422 return 0;
3423
3424 if (current->plug)
3425 blk_flush_plug_list(current->plug, false);
3426
3427 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3428
3232 /* 3429 /*
3233 * If we sleep, have the caller restart the poll loop to reset 3430 * If we sleep, have the caller restart the poll loop to reset
3234 * the state. Like for the other success return cases, the 3431 * the state. Like for the other success return cases, the
@@ -3236,63 +3433,44 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3236 * the IO isn't complete, we'll get called again and will go 3433 * the IO isn't complete, we'll get called again and will go
3237 * straight to the busy poll loop. 3434 * straight to the busy poll loop.
3238 */ 3435 */
3239 if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) 3436 if (blk_mq_poll_hybrid(q, hctx, cookie))
3240 return true; 3437 return 1;
3241 3438
3242 hctx->poll_considered++; 3439 hctx->poll_considered++;
3243 3440
3244 state = current->state; 3441 state = current->state;
3245 while (!need_resched()) { 3442 do {
3246 int ret; 3443 int ret;
3247 3444
3248 hctx->poll_invoked++; 3445 hctx->poll_invoked++;
3249 3446
3250 ret = q->mq_ops->poll(hctx, rq->tag); 3447 ret = q->mq_ops->poll(hctx);
3251 if (ret > 0) { 3448 if (ret > 0) {
3252 hctx->poll_success++; 3449 hctx->poll_success++;
3253 set_current_state(TASK_RUNNING); 3450 __set_current_state(TASK_RUNNING);
3254 return true; 3451 return ret;
3255 } 3452 }
3256 3453
3257 if (signal_pending_state(state, current)) 3454 if (signal_pending_state(state, current))
3258 set_current_state(TASK_RUNNING); 3455 __set_current_state(TASK_RUNNING);
3259 3456
3260 if (current->state == TASK_RUNNING) 3457 if (current->state == TASK_RUNNING)
3261 return true; 3458 return 1;
3262 if (ret < 0) 3459 if (ret < 0 || !spin)
3263 break; 3460 break;
3264 cpu_relax(); 3461 cpu_relax();
3265 } 3462 } while (!need_resched());
3266 3463
3267 __set_current_state(TASK_RUNNING); 3464 __set_current_state(TASK_RUNNING);
3268 return false; 3465 return 0;
3269} 3466}
3467EXPORT_SYMBOL_GPL(blk_poll);
3270 3468
3271static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) 3469unsigned int blk_mq_rq_cpu(struct request *rq)
3272{ 3470{
3273 struct blk_mq_hw_ctx *hctx; 3471 return rq->mq_ctx->cpu;
3274 struct request *rq;
3275
3276 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3277 return false;
3278
3279 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3280 if (!blk_qc_t_is_internal(cookie))
3281 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3282 else {
3283 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3284 /*
3285 * With scheduling, if the request has completed, we'll
3286 * get a NULL return here, as we clear the sched tag when
3287 * that happens. The request still remains valid, like always,
3288 * so we should be safe with just the NULL check.
3289 */
3290 if (!rq)
3291 return false;
3292 }
3293
3294 return __blk_mq_poll(hctx, rq);
3295} 3472}
3473EXPORT_SYMBOL(blk_mq_rq_cpu);
3296 3474
3297static int __init blk_mq_init(void) 3475static int __init blk_mq_init(void)
3298{ 3476{
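A hedged sketch of how a submitter is expected to consume the new blk_poll() return convention, roughly mirroring what the synchronous direct-I/O path does ('done', 'polled', 'bio' and 'q' are assumed to exist in the caller):

blk_qc_t qc = submit_bio(bio);                  /* the cookie encodes the hctx and tag */

for (;;) {
        set_current_state(TASK_UNINTERRUPTIBLE);
        if (READ_ONCE(done))                    /* set by the bio's end_io handler */
                break;
        /* blk_poll() returns > 0 once completions were reaped, 0 otherwise */
        if (!polled || !blk_poll(q, qc, true))
                io_schedule();
}
__set_current_state(TASK_RUNNING);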
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9497b47e2526..d943d46b0785 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -7,17 +7,22 @@
7 7
8struct blk_mq_tag_set; 8struct blk_mq_tag_set;
9 9
10struct blk_mq_ctxs {
11 struct kobject kobj;
12 struct blk_mq_ctx __percpu *queue_ctx;
13};
14
10/** 15/**
11 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs 16 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
12 */ 17 */
13struct blk_mq_ctx { 18struct blk_mq_ctx {
14 struct { 19 struct {
15 spinlock_t lock; 20 spinlock_t lock;
16 struct list_head rq_list; 21 struct list_head rq_lists[HCTX_MAX_TYPES];
17 } ____cacheline_aligned_in_smp; 22 } ____cacheline_aligned_in_smp;
18 23
19 unsigned int cpu; 24 unsigned int cpu;
20 unsigned int index_hw; 25 unsigned short index_hw[HCTX_MAX_TYPES];
21 26
22 /* incremented at dispatch time */ 27 /* incremented at dispatch time */
23 unsigned long rq_dispatched[2]; 28 unsigned long rq_dispatched[2];
@@ -27,6 +32,7 @@ struct blk_mq_ctx {
27 unsigned long ____cacheline_aligned_in_smp rq_completed[2]; 32 unsigned long ____cacheline_aligned_in_smp rq_completed[2];
28 33
29 struct request_queue *queue; 34 struct request_queue *queue;
35 struct blk_mq_ctxs *ctxs;
30 struct kobject kobj; 36 struct kobject kobj;
31} ____cacheline_aligned_in_smp; 37} ____cacheline_aligned_in_smp;
32 38
@@ -62,20 +68,55 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
62void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 68void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
63 struct list_head *list); 69 struct list_head *list);
64 70
65/* Used by blk_insert_cloned_request() to issue request directly */ 71blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
66blk_status_t blk_mq_request_issue_directly(struct request *rq); 72 struct request *rq,
73 blk_qc_t *cookie,
74 bool bypass, bool last);
67void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 75void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
68 struct list_head *list); 76 struct list_head *list);
69 77
70/* 78/*
71 * CPU -> queue mappings 79 * CPU -> queue mappings
72 */ 80 */
73extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); 81extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
82
83/*
84 * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
85 * @q: request queue
86 * @type: the hctx type index
87 * @cpu: CPU
88 */
89static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
90 enum hctx_type type,
91 unsigned int cpu)
92{
93 return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
94}
74 95
96/*
97 * blk_mq_map_queue() - map (cmd_flags, cpu) to hardware queue
98 * @q: request queue
99 * @flags: request command flags
100 * @cpu: CPU
101 */
75static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, 102static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
76 int cpu) 103 unsigned int flags,
104 unsigned int cpu)
77{ 105{
78 return q->queue_hw_ctx[q->mq_map[cpu]]; 106 enum hctx_type type = HCTX_TYPE_DEFAULT;
107
108 if ((flags & REQ_HIPRI) &&
109 q->tag_set->nr_maps > HCTX_TYPE_POLL &&
110 q->tag_set->map[HCTX_TYPE_POLL].nr_queues &&
111 test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
112 type = HCTX_TYPE_POLL;
113
114 else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
115 q->tag_set->nr_maps > HCTX_TYPE_READ &&
116 q->tag_set->map[HCTX_TYPE_READ].nr_queues)
117 type = HCTX_TYPE_READ;
118
119 return blk_mq_map_queue_type(q, type, cpu);
79} 120}
80 121
81/* 122/*
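A minimal, hypothetical call site for the helper above; the command flags decide which map is eligible and the CPU picks the entry within that map:

struct blk_mq_hw_ctx *hctx;

/* a high-priority (polled) read on this CPU lands on the poll map if the driver
 * registered one and polling is enabled; otherwise on the read map, if present,
 * and finally on the default map */
hctx = blk_mq_map_queue(q, REQ_OP_READ | REQ_HIPRI, raw_smp_processor_id());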
@@ -126,6 +167,7 @@ struct blk_mq_alloc_data {
126 struct request_queue *q; 167 struct request_queue *q;
127 blk_mq_req_flags_t flags; 168 blk_mq_req_flags_t flags;
128 unsigned int shallow_depth; 169 unsigned int shallow_depth;
170 unsigned int cmd_flags;
129 171
130 /* input & output parameter */ 172 /* input & output parameter */
131 struct blk_mq_ctx *ctx; 173 struct blk_mq_ctx *ctx;
@@ -150,8 +192,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
150 return hctx->nr_ctx && hctx->tags; 192 return hctx->nr_ctx && hctx->tags;
151} 193}
152 194
153void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, 195unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
154 unsigned int inflight[2]);
155void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, 196void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
156 unsigned int inflight[2]); 197 unsigned int inflight[2]);
157 198
@@ -195,21 +236,18 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
195 236
196static inline void blk_mq_put_driver_tag(struct request *rq) 237static inline void blk_mq_put_driver_tag(struct request *rq)
197{ 238{
198 struct blk_mq_hw_ctx *hctx;
199
200 if (rq->tag == -1 || rq->internal_tag == -1) 239 if (rq->tag == -1 || rq->internal_tag == -1)
201 return; 240 return;
202 241
203 hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); 242 __blk_mq_put_driver_tag(rq->mq_hctx, rq);
204 __blk_mq_put_driver_tag(hctx, rq);
205} 243}
206 244
207static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) 245static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
208{ 246{
209 int cpu; 247 int cpu;
210 248
211 for_each_possible_cpu(cpu) 249 for_each_possible_cpu(cpu)
212 set->mq_map[cpu] = 0; 250 qmap->mq_map[cpu] = 0;
213} 251}
214 252
215#endif 253#endif
diff --git a/block/blk-pm.c b/block/blk-pm.c
index f8fdae01bea2..0a028c189897 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -89,12 +89,12 @@ int blk_pre_runtime_suspend(struct request_queue *q)
89 /* Switch q_usage_counter back to per-cpu mode. */ 89 /* Switch q_usage_counter back to per-cpu mode. */
90 blk_mq_unfreeze_queue(q); 90 blk_mq_unfreeze_queue(q);
91 91
92 spin_lock_irq(q->queue_lock); 92 spin_lock_irq(&q->queue_lock);
93 if (ret < 0) 93 if (ret < 0)
94 pm_runtime_mark_last_busy(q->dev); 94 pm_runtime_mark_last_busy(q->dev);
95 else 95 else
96 q->rpm_status = RPM_SUSPENDING; 96 q->rpm_status = RPM_SUSPENDING;
97 spin_unlock_irq(q->queue_lock); 97 spin_unlock_irq(&q->queue_lock);
98 98
99 if (ret) 99 if (ret)
100 blk_clear_pm_only(q); 100 blk_clear_pm_only(q);
@@ -121,14 +121,14 @@ void blk_post_runtime_suspend(struct request_queue *q, int err)
121 if (!q->dev) 121 if (!q->dev)
122 return; 122 return;
123 123
124 spin_lock_irq(q->queue_lock); 124 spin_lock_irq(&q->queue_lock);
125 if (!err) { 125 if (!err) {
126 q->rpm_status = RPM_SUSPENDED; 126 q->rpm_status = RPM_SUSPENDED;
127 } else { 127 } else {
128 q->rpm_status = RPM_ACTIVE; 128 q->rpm_status = RPM_ACTIVE;
129 pm_runtime_mark_last_busy(q->dev); 129 pm_runtime_mark_last_busy(q->dev);
130 } 130 }
131 spin_unlock_irq(q->queue_lock); 131 spin_unlock_irq(&q->queue_lock);
132 132
133 if (err) 133 if (err)
134 blk_clear_pm_only(q); 134 blk_clear_pm_only(q);
@@ -151,9 +151,9 @@ void blk_pre_runtime_resume(struct request_queue *q)
151 if (!q->dev) 151 if (!q->dev)
152 return; 152 return;
153 153
154 spin_lock_irq(q->queue_lock); 154 spin_lock_irq(&q->queue_lock);
155 q->rpm_status = RPM_RESUMING; 155 q->rpm_status = RPM_RESUMING;
156 spin_unlock_irq(q->queue_lock); 156 spin_unlock_irq(&q->queue_lock);
157} 157}
158EXPORT_SYMBOL(blk_pre_runtime_resume); 158EXPORT_SYMBOL(blk_pre_runtime_resume);
159 159
@@ -176,7 +176,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err)
176 if (!q->dev) 176 if (!q->dev)
177 return; 177 return;
178 178
179 spin_lock_irq(q->queue_lock); 179 spin_lock_irq(&q->queue_lock);
180 if (!err) { 180 if (!err) {
181 q->rpm_status = RPM_ACTIVE; 181 q->rpm_status = RPM_ACTIVE;
182 pm_runtime_mark_last_busy(q->dev); 182 pm_runtime_mark_last_busy(q->dev);
@@ -184,7 +184,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err)
184 } else { 184 } else {
185 q->rpm_status = RPM_SUSPENDED; 185 q->rpm_status = RPM_SUSPENDED;
186 } 186 }
187 spin_unlock_irq(q->queue_lock); 187 spin_unlock_irq(&q->queue_lock);
188 188
189 if (!err) 189 if (!err)
190 blk_clear_pm_only(q); 190 blk_clear_pm_only(q);
@@ -207,10 +207,10 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
207 */ 207 */
208void blk_set_runtime_active(struct request_queue *q) 208void blk_set_runtime_active(struct request_queue *q)
209{ 209{
210 spin_lock_irq(q->queue_lock); 210 spin_lock_irq(&q->queue_lock);
211 q->rpm_status = RPM_ACTIVE; 211 q->rpm_status = RPM_ACTIVE;
212 pm_runtime_mark_last_busy(q->dev); 212 pm_runtime_mark_last_busy(q->dev);
213 pm_request_autosuspend(q->dev); 213 pm_request_autosuspend(q->dev);
214 spin_unlock_irq(q->queue_lock); 214 spin_unlock_irq(&q->queue_lock);
215} 215}
216EXPORT_SYMBOL(blk_set_runtime_active); 216EXPORT_SYMBOL(blk_set_runtime_active);
diff --git a/block/blk-pm.h b/block/blk-pm.h
index a8564ea72a41..ea5507d23e75 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -21,7 +21,7 @@ static inline void blk_pm_mark_last_busy(struct request *rq)
21 21
22static inline void blk_pm_requeue_request(struct request *rq) 22static inline void blk_pm_requeue_request(struct request *rq)
23{ 23{
24 lockdep_assert_held(rq->q->queue_lock); 24 lockdep_assert_held(&rq->q->queue_lock);
25 25
26 if (rq->q->dev && !(rq->rq_flags & RQF_PM)) 26 if (rq->q->dev && !(rq->rq_flags & RQF_PM))
27 rq->q->nr_pending--; 27 rq->q->nr_pending--;
@@ -30,7 +30,7 @@ static inline void blk_pm_requeue_request(struct request *rq)
30static inline void blk_pm_add_request(struct request_queue *q, 30static inline void blk_pm_add_request(struct request_queue *q,
31 struct request *rq) 31 struct request *rq)
32{ 32{
33 lockdep_assert_held(q->queue_lock); 33 lockdep_assert_held(&q->queue_lock);
34 34
35 if (q->dev && !(rq->rq_flags & RQF_PM)) 35 if (q->dev && !(rq->rq_flags & RQF_PM))
36 q->nr_pending++; 36 q->nr_pending++;
@@ -38,7 +38,7 @@ static inline void blk_pm_add_request(struct request_queue *q,
38 38
39static inline void blk_pm_put_request(struct request *rq) 39static inline void blk_pm_put_request(struct request *rq)
40{ 40{
41 lockdep_assert_held(rq->q->queue_lock); 41 lockdep_assert_held(&rq->q->queue_lock);
42 42
43 if (rq->q->dev && !(rq->rq_flags & RQF_PM)) 43 if (rq->q->dev && !(rq->rq_flags & RQF_PM))
44 --rq->q->nr_pending; 44 --rq->q->nr_pending;
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 0005dfd568dd..d169d7188fa6 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -27,75 +27,67 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
27 return atomic_inc_below(&rq_wait->inflight, limit); 27 return atomic_inc_below(&rq_wait->inflight, limit);
28} 28}
29 29
30void rq_qos_cleanup(struct request_queue *q, struct bio *bio) 30void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
31{ 31{
32 struct rq_qos *rqos; 32 do {
33
34 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
35 if (rqos->ops->cleanup) 33 if (rqos->ops->cleanup)
36 rqos->ops->cleanup(rqos, bio); 34 rqos->ops->cleanup(rqos, bio);
37 } 35 rqos = rqos->next;
36 } while (rqos);
38} 37}
39 38
40void rq_qos_done(struct request_queue *q, struct request *rq) 39void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
41{ 40{
42 struct rq_qos *rqos; 41 do {
43
44 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
45 if (rqos->ops->done) 42 if (rqos->ops->done)
46 rqos->ops->done(rqos, rq); 43 rqos->ops->done(rqos, rq);
47 } 44 rqos = rqos->next;
45 } while (rqos);
48} 46}
49 47
50void rq_qos_issue(struct request_queue *q, struct request *rq) 48void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
51{ 49{
52 struct rq_qos *rqos; 50 do {
53
54 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
55 if (rqos->ops->issue) 51 if (rqos->ops->issue)
56 rqos->ops->issue(rqos, rq); 52 rqos->ops->issue(rqos, rq);
57 } 53 rqos = rqos->next;
54 } while (rqos);
58} 55}
59 56
60void rq_qos_requeue(struct request_queue *q, struct request *rq) 57void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
61{ 58{
62 struct rq_qos *rqos; 59 do {
63
64 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
65 if (rqos->ops->requeue) 60 if (rqos->ops->requeue)
66 rqos->ops->requeue(rqos, rq); 61 rqos->ops->requeue(rqos, rq);
67 } 62 rqos = rqos->next;
63 } while (rqos);
68} 64}
69 65
70void rq_qos_throttle(struct request_queue *q, struct bio *bio, 66void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
71 spinlock_t *lock)
72{ 67{
73 struct rq_qos *rqos; 68 do {
74
75 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
76 if (rqos->ops->throttle) 69 if (rqos->ops->throttle)
77 rqos->ops->throttle(rqos, bio, lock); 70 rqos->ops->throttle(rqos, bio);
78 } 71 rqos = rqos->next;
72 } while (rqos);
79} 73}
80 74
81void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) 75void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
82{ 76{
83 struct rq_qos *rqos; 77 do {
84
85 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
86 if (rqos->ops->track) 78 if (rqos->ops->track)
87 rqos->ops->track(rqos, rq, bio); 79 rqos->ops->track(rqos, rq, bio);
88 } 80 rqos = rqos->next;
81 } while (rqos);
89} 82}
90 83
91void rq_qos_done_bio(struct request_queue *q, struct bio *bio) 84void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
92{ 85{
93 struct rq_qos *rqos; 86 do {
94
95 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
96 if (rqos->ops->done_bio) 87 if (rqos->ops->done_bio)
97 rqos->ops->done_bio(rqos, bio); 88 rqos->ops->done_bio(rqos, bio);
98 } 89 rqos = rqos->next;
90 } while (rqos);
99} 91}
100 92
101/* 93/*
@@ -184,8 +176,96 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
184 rq_depth_calc_max_depth(rqd); 176 rq_depth_calc_max_depth(rqd);
185} 177}
186 178
179struct rq_qos_wait_data {
180 struct wait_queue_entry wq;
181 struct task_struct *task;
182 struct rq_wait *rqw;
183 acquire_inflight_cb_t *cb;
184 void *private_data;
185 bool got_token;
186};
187
188static int rq_qos_wake_function(struct wait_queue_entry *curr,
189 unsigned int mode, int wake_flags, void *key)
190{
191 struct rq_qos_wait_data *data = container_of(curr,
192 struct rq_qos_wait_data,
193 wq);
194
195 /*
196 * If we fail to get a budget, return -1 to interrupt the wake up loop
197 * in __wake_up_common.
198 */
199 if (!data->cb(data->rqw, data->private_data))
200 return -1;
201
202 data->got_token = true;
203 list_del_init(&curr->entry);
204 wake_up_process(data->task);
205 return 1;
206}
207
208/**
209 * rq_qos_wait - throttle on a rqw if we need to
210 * @private_data: caller provided specific data
211 * @acquire_inflight_cb: inc the rqw->inflight counter if we can
212 * @cleanup_cb: the callback to clean up in case we race with a waker
213 *
214 * This provides a uniform place for the rq_qos users to do their throttling.
215 * Since you can end up with a lot of things sleeping at once, this manages the
216 * wakeups based on the resources available. @acquire_inflight_cb should
217 * increment rqw->inflight if there is room, or return false if not, in which
218 * case we sleep until the room becomes available.
219 *
220 * @cleanup_cb handles the case where we race with a waker and need to adjust
221 * the inflight count accordingly.
222 */
223void rq_qos_wait(struct rq_wait *rqw, void *private_data,
224 acquire_inflight_cb_t *acquire_inflight_cb,
225 cleanup_cb_t *cleanup_cb)
226{
227 struct rq_qos_wait_data data = {
228 .wq = {
229 .func = rq_qos_wake_function,
230 .entry = LIST_HEAD_INIT(data.wq.entry),
231 },
232 .task = current,
233 .rqw = rqw,
234 .cb = acquire_inflight_cb,
235 .private_data = private_data,
236 };
237 bool has_sleeper;
238
239 has_sleeper = wq_has_sleeper(&rqw->wait);
240 if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
241 return;
242
243 prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
244 do {
245 if (data.got_token)
246 break;
247 if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
248 finish_wait(&rqw->wait, &data.wq);
249
250 /*
251 * We raced with rq_qos_wake_function() getting a token,
252 * which means we now have two. Put our local token
253 * and wake anyone else potentially waiting for one.
254 */
255 if (data.got_token)
256 cleanup_cb(rqw, private_data);
257 break;
258 }
259 io_schedule();
260 has_sleeper = false;
261 } while (1);
262 finish_wait(&rqw->wait, &data.wq);
263}
264
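A hedged sketch of how an rq_qos policy sits on top of rq_qos_wait(), modeled loosely on the in-tree users (struct demo_qos_data and the limit bookkeeping are illustrative):

struct demo_qos_data {
        struct rq_wait rqw;
        unsigned int max_inflight;
};

static bool demo_acquire_inflight_cb(struct rq_wait *rqw, void *private_data)
{
        struct demo_qos_data *d = private_data;

        /* take a slot only while we are below the configured limit */
        return rq_wait_inc_below(rqw, d->max_inflight);
}

static void demo_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
        /* we were handed a token by a waker and also grabbed one ourselves */
        atomic_dec(&rqw->inflight);
        wake_up(&rqw->wait);
}

/* from the policy's ->throttle() hook: */
rq_qos_wait(&d->rqw, d, demo_acquire_inflight_cb, demo_cleanup_cb);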
187void rq_qos_exit(struct request_queue *q) 265void rq_qos_exit(struct request_queue *q)
188{ 266{
267 blk_mq_debugfs_unregister_queue_rqos(q);
268
189 while (q->rq_qos) { 269 while (q->rq_qos) {
190 struct rq_qos *rqos = q->rq_qos; 270 struct rq_qos *rqos = q->rq_qos;
191 q->rq_qos = rqos->next; 271 q->rq_qos = rqos->next;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 32b02efbfa66..564851889550 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -7,6 +7,10 @@
7#include <linux/atomic.h> 7#include <linux/atomic.h>
8#include <linux/wait.h> 8#include <linux/wait.h>
9 9
10#include "blk-mq-debugfs.h"
11
12struct blk_mq_debugfs_attr;
13
10enum rq_qos_id { 14enum rq_qos_id {
11 RQ_QOS_WBT, 15 RQ_QOS_WBT,
12 RQ_QOS_CGROUP, 16 RQ_QOS_CGROUP,
@@ -22,10 +26,13 @@ struct rq_qos {
22 struct request_queue *q; 26 struct request_queue *q;
23 enum rq_qos_id id; 27 enum rq_qos_id id;
24 struct rq_qos *next; 28 struct rq_qos *next;
29#ifdef CONFIG_BLK_DEBUG_FS
30 struct dentry *debugfs_dir;
31#endif
25}; 32};
26 33
27struct rq_qos_ops { 34struct rq_qos_ops {
28 void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); 35 void (*throttle)(struct rq_qos *, struct bio *);
29 void (*track)(struct rq_qos *, struct request *, struct bio *); 36 void (*track)(struct rq_qos *, struct request *, struct bio *);
30 void (*issue)(struct rq_qos *, struct request *); 37 void (*issue)(struct rq_qos *, struct request *);
31 void (*requeue)(struct rq_qos *, struct request *); 38 void (*requeue)(struct rq_qos *, struct request *);
@@ -33,6 +40,7 @@ struct rq_qos_ops {
33 void (*done_bio)(struct rq_qos *, struct bio *); 40 void (*done_bio)(struct rq_qos *, struct bio *);
34 void (*cleanup)(struct rq_qos *, struct bio *); 41 void (*cleanup)(struct rq_qos *, struct bio *);
35 void (*exit)(struct rq_qos *); 42 void (*exit)(struct rq_qos *);
43 const struct blk_mq_debugfs_attr *debugfs_attrs;
36}; 44};
37 45
38struct rq_depth { 46struct rq_depth {
@@ -66,6 +74,17 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
66 return rq_qos_id(q, RQ_QOS_CGROUP); 74 return rq_qos_id(q, RQ_QOS_CGROUP);
67} 75}
68 76
77static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
78{
79 switch (id) {
80 case RQ_QOS_WBT:
81 return "wbt";
82 case RQ_QOS_CGROUP:
83 return "cgroup";
84 }
85 return "unknown";
86}
87
69static inline void rq_wait_init(struct rq_wait *rq_wait) 88static inline void rq_wait_init(struct rq_wait *rq_wait)
70{ 89{
71 atomic_set(&rq_wait->inflight, 0); 90 atomic_set(&rq_wait->inflight, 0);
@@ -76,6 +95,9 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
76{ 95{
77 rqos->next = q->rq_qos; 96 rqos->next = q->rq_qos;
78 q->rq_qos = rqos; 97 q->rq_qos = rqos;
98
99 if (rqos->ops->debugfs_attrs)
100 blk_mq_debugfs_register_rqos(rqos);
79} 101}
80 102
81static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) 103static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
@@ -91,19 +113,77 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
91 } 113 }
92 prev = cur; 114 prev = cur;
93 } 115 }
116
117 blk_mq_debugfs_unregister_rqos(rqos);
94} 118}
95 119
120typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
121typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);
122
123void rq_qos_wait(struct rq_wait *rqw, void *private_data,
124 acquire_inflight_cb_t *acquire_inflight_cb,
125 cleanup_cb_t *cleanup_cb);
96bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); 126bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
97void rq_depth_scale_up(struct rq_depth *rqd); 127void rq_depth_scale_up(struct rq_depth *rqd);
98void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); 128void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
99bool rq_depth_calc_max_depth(struct rq_depth *rqd); 129bool rq_depth_calc_max_depth(struct rq_depth *rqd);
100 130
101void rq_qos_cleanup(struct request_queue *, struct bio *); 131void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
102void rq_qos_done(struct request_queue *, struct request *); 132void __rq_qos_done(struct rq_qos *rqos, struct request *rq);
103void rq_qos_issue(struct request_queue *, struct request *); 133void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
104void rq_qos_requeue(struct request_queue *, struct request *); 134void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
105void rq_qos_done_bio(struct request_queue *q, struct bio *bio); 135void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
106void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); 136void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
107void rq_qos_track(struct request_queue *q, struct request *, struct bio *); 137void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
138
139static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
140{
141 if (q->rq_qos)
142 __rq_qos_cleanup(q->rq_qos, bio);
143}
144
145static inline void rq_qos_done(struct request_queue *q, struct request *rq)
146{
147 if (q->rq_qos)
148 __rq_qos_done(q->rq_qos, rq);
149}
150
151static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
152{
153 if (q->rq_qos)
154 __rq_qos_issue(q->rq_qos, rq);
155}
156
157static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
158{
159 if (q->rq_qos)
160 __rq_qos_requeue(q->rq_qos, rq);
161}
162
163static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
164{
165 if (q->rq_qos)
166 __rq_qos_done_bio(q->rq_qos, bio);
167}
168
169static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
170{
171 /*
172 * BIO_TRACKED lets controllers know that a bio went through the
173 * normal rq_qos path.
174 */
175 bio_set_flag(bio, BIO_TRACKED);
176 if (q->rq_qos)
177 __rq_qos_throttle(q->rq_qos, bio);
178}
179
180static inline void rq_qos_track(struct request_queue *q, struct request *rq,
181 struct bio *bio)
182{
183 if (q->rq_qos)
184 __rq_qos_track(q->rq_qos, rq, bio);
185}
186
108void rq_qos_exit(struct request_queue *); 187void rq_qos_exit(struct request_queue *);
188
109#endif 189#endif
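Putting the hooks in this header together: a policy that wants to participate in throttling now implements the spinlock-free ->throttle() signature and registers itself with rq_qos_add(). A hedged sketch (all names are illustrative, not an in-tree policy):

static void demo_throttle(struct rq_qos *rqos, struct bio *bio)
{
        /* block here, e.g. via rq_qos_wait(), until the bio may proceed */
}

static void demo_exit(struct rq_qos *rqos)
{
        kfree(rqos);
}

static struct rq_qos_ops demo_ops = {
        .throttle       = demo_throttle,
        .exit           = demo_exit,
        /* .debugfs_attrs would give the policy a directory under the queue's rqos dir */
};

static int demo_qos_init(struct request_queue *q)
{
        struct rq_qos *rqos = kzalloc(sizeof(*rqos), GFP_KERNEL);

        if (!rqos)
                return -ENOMEM;
        rqos->q = q;
        rqos->id = RQ_QOS_CGROUP;       /* illustrative: only the WBT and CGROUP ids exist here */
        rqos->ops = &demo_ops;
        rq_qos_add(q, rqos);
        return 0;
}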
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9c8b62f8c180..3e7038e475ee 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -20,65 +20,12 @@ EXPORT_SYMBOL(blk_max_low_pfn);
20 20
21unsigned long blk_max_pfn; 21unsigned long blk_max_pfn;
22 22
23/**
24 * blk_queue_prep_rq - set a prepare_request function for queue
25 * @q: queue
26 * @pfn: prepare_request function
27 *
28 * It's possible for a queue to register a prepare_request callback which
29 * is invoked before the request is handed to the request_fn. The goal of
30 * the function is to prepare a request for I/O, it can be used to build a
31 * cdb from the request data for instance.
32 *
33 */
34void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
35{
36 q->prep_rq_fn = pfn;
37}
38EXPORT_SYMBOL(blk_queue_prep_rq);
39
40/**
41 * blk_queue_unprep_rq - set an unprepare_request function for queue
42 * @q: queue
43 * @ufn: unprepare_request function
44 *
45 * It's possible for a queue to register an unprepare_request callback
46 * which is invoked before the request is finally completed. The goal
47 * of the function is to deallocate any data that was allocated in the
48 * prepare_request callback.
49 *
50 */
51void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
52{
53 q->unprep_rq_fn = ufn;
54}
55EXPORT_SYMBOL(blk_queue_unprep_rq);
56
57void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
58{
59 q->softirq_done_fn = fn;
60}
61EXPORT_SYMBOL(blk_queue_softirq_done);
62
63void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) 23void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
64{ 24{
65 q->rq_timeout = timeout; 25 q->rq_timeout = timeout;
66} 26}
67EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); 27EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
68 28
69void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
70{
71 WARN_ON_ONCE(q->mq_ops);
72 q->rq_timed_out_fn = fn;
73}
74EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
75
76void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
77{
78 q->lld_busy_fn = fn;
79}
80EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
81
82/** 29/**
83 * blk_set_default_limits - reset limits to default values 30 * blk_set_default_limits - reset limits to default values
84 * @lim: the queue_limits structure to reset 31 * @lim: the queue_limits structure to reset
@@ -168,8 +115,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
168 115
169 q->make_request_fn = mfn; 116 q->make_request_fn = mfn;
170 blk_queue_dma_alignment(q, 511); 117 blk_queue_dma_alignment(q, 511);
171 blk_queue_congestion_threshold(q);
172 q->nr_batching = BLK_BATCH_REQ;
173 118
174 blk_set_default_limits(&q->limits); 119 blk_set_default_limits(&q->limits);
175} 120}
@@ -886,16 +831,14 @@ EXPORT_SYMBOL(blk_set_queue_depth);
886 */ 831 */
887void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) 832void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
888{ 833{
889 spin_lock_irq(q->queue_lock);
890 if (wc) 834 if (wc)
891 queue_flag_set(QUEUE_FLAG_WC, q); 835 blk_queue_flag_set(QUEUE_FLAG_WC, q);
892 else 836 else
893 queue_flag_clear(QUEUE_FLAG_WC, q); 837 blk_queue_flag_clear(QUEUE_FLAG_WC, q);
894 if (fua) 838 if (fua)
895 queue_flag_set(QUEUE_FLAG_FUA, q); 839 blk_queue_flag_set(QUEUE_FLAG_FUA, q);
896 else 840 else
897 queue_flag_clear(QUEUE_FLAG_FUA, q); 841 blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
898 spin_unlock_irq(q->queue_lock);
899 842
900 wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); 843 wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
901} 844}
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index e47a2f751884..457d9ba3eb20 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -34,7 +34,7 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
34 34
35 rq = list_entry(local_list.next, struct request, ipi_list); 35 rq = list_entry(local_list.next, struct request, ipi_list);
36 list_del_init(&rq->ipi_list); 36 list_del_init(&rq->ipi_list);
37 rq->q->softirq_done_fn(rq); 37 rq->q->mq_ops->complete(rq);
38 } 38 }
39} 39}
40 40
@@ -98,11 +98,11 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
98void __blk_complete_request(struct request *req) 98void __blk_complete_request(struct request *req)
99{ 99{
100 struct request_queue *q = req->q; 100 struct request_queue *q = req->q;
101 int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu; 101 int cpu, ccpu = req->mq_ctx->cpu;
102 unsigned long flags; 102 unsigned long flags;
103 bool shared = false; 103 bool shared = false;
104 104
105 BUG_ON(!q->softirq_done_fn); 105 BUG_ON(!q->mq_ops->complete);
106 106
107 local_irq_save(flags); 107 local_irq_save(flags);
108 cpu = smp_processor_id(); 108 cpu = smp_processor_id();
@@ -143,27 +143,6 @@ do_local:
143 143
144 local_irq_restore(flags); 144 local_irq_restore(flags);
145} 145}
146EXPORT_SYMBOL(__blk_complete_request);
147
148/**
149 * blk_complete_request - end I/O on a request
150 * @req: the request being processed
151 *
152 * Description:
153 * Ends all I/O on a request. It does not handle partial completions,
154 * unless the driver actually implements this in its completion callback
155 * through requeueing. The actual completion happens out-of-order,
156 * through a softirq handler. The user must have registered a completion
157 * callback through blk_queue_softirq_done().
158 **/
159void blk_complete_request(struct request *req)
160{
161 if (unlikely(blk_should_fake_timeout(req->q)))
162 return;
163 if (!blk_mark_rq_complete(req))
164 __blk_complete_request(req);
165}
166EXPORT_SYMBOL(blk_complete_request);
167 146
168static __init int blk_softirq_init(void) 147static __init int blk_softirq_init(void)
169{ 148{
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 90561af85a62..696a04176e4d 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -130,7 +130,6 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
130 130
131 return cb; 131 return cb;
132} 132}
133EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
134 133
135void blk_stat_add_callback(struct request_queue *q, 134void blk_stat_add_callback(struct request_queue *q,
136 struct blk_stat_callback *cb) 135 struct blk_stat_callback *cb)
@@ -151,7 +150,6 @@ void blk_stat_add_callback(struct request_queue *q,
151 blk_queue_flag_set(QUEUE_FLAG_STATS, q); 150 blk_queue_flag_set(QUEUE_FLAG_STATS, q);
152 spin_unlock(&q->stats->lock); 151 spin_unlock(&q->stats->lock);
153} 152}
154EXPORT_SYMBOL_GPL(blk_stat_add_callback);
155 153
156void blk_stat_remove_callback(struct request_queue *q, 154void blk_stat_remove_callback(struct request_queue *q,
157 struct blk_stat_callback *cb) 155 struct blk_stat_callback *cb)
@@ -164,7 +162,6 @@ void blk_stat_remove_callback(struct request_queue *q,
164 162
165 del_timer_sync(&cb->timer); 163 del_timer_sync(&cb->timer);
166} 164}
167EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
168 165
169static void blk_stat_free_callback_rcu(struct rcu_head *head) 166static void blk_stat_free_callback_rcu(struct rcu_head *head)
170{ 167{
@@ -181,7 +178,6 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
181 if (cb) 178 if (cb)
182 call_rcu(&cb->rcu, blk_stat_free_callback_rcu); 179 call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
183} 180}
184EXPORT_SYMBOL_GPL(blk_stat_free_callback);
185 181
186void blk_stat_enable_accounting(struct request_queue *q) 182void blk_stat_enable_accounting(struct request_queue *q)
187{ 183{
diff --git a/block/blk-stat.h b/block/blk-stat.h
index f4a1568e81a4..17b47a86eefb 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -145,6 +145,11 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
145 mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); 145 mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
146} 146}
147 147
148static inline void blk_stat_deactivate(struct blk_stat_callback *cb)
149{
150 del_timer_sync(&cb->timer);
151}
152
148/** 153/**
149 * blk_stat_activate_msecs() - Gather block statistics during a time window in 154 * blk_stat_activate_msecs() - Gather block statistics during a time window in
150 * milliseconds. 155 * milliseconds.
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5144707f25ea..590d1ef2f961 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
68 unsigned long nr; 68 unsigned long nr;
69 int ret, err; 69 int ret, err;
70 70
71 if (!q->request_fn && !q->mq_ops) 71 if (!queue_is_mq(q))
72 return -EINVAL; 72 return -EINVAL;
73 73
74 ret = queue_var_store(&nr, page, count); 74 ret = queue_var_store(&nr, page, count);
@@ -78,11 +78,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
78 if (nr < BLKDEV_MIN_RQ) 78 if (nr < BLKDEV_MIN_RQ)
79 nr = BLKDEV_MIN_RQ; 79 nr = BLKDEV_MIN_RQ;
80 80
81 if (q->request_fn) 81 err = blk_mq_update_nr_requests(q, nr);
82 err = blk_update_nr_requests(q, nr);
83 else
84 err = blk_mq_update_nr_requests(q, nr);
85
86 if (err) 82 if (err)
87 return err; 83 return err;
88 84
@@ -239,10 +235,10 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
239 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 235 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
240 return -EINVAL; 236 return -EINVAL;
241 237
242 spin_lock_irq(q->queue_lock); 238 spin_lock_irq(&q->queue_lock);
243 q->limits.max_sectors = max_sectors_kb << 1; 239 q->limits.max_sectors = max_sectors_kb << 1;
244 q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); 240 q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
245 spin_unlock_irq(q->queue_lock); 241 spin_unlock_irq(&q->queue_lock);
246 242
247 return ret; 243 return ret;
248} 244}
@@ -317,14 +313,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
317 if (ret < 0) 313 if (ret < 0)
318 return ret; 314 return ret;
319 315
320 spin_lock_irq(q->queue_lock); 316 blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
321 queue_flag_clear(QUEUE_FLAG_NOMERGES, q); 317 blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
322 queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
323 if (nm == 2) 318 if (nm == 2)
324 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 319 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
325 else if (nm) 320 else if (nm)
326 queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 321 blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
327 spin_unlock_irq(q->queue_lock);
328 322
329 return ret; 323 return ret;
330} 324}
@@ -348,18 +342,16 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
348 if (ret < 0) 342 if (ret < 0)
349 return ret; 343 return ret;
350 344
351 spin_lock_irq(q->queue_lock);
352 if (val == 2) { 345 if (val == 2) {
353 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 346 blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
354 queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); 347 blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
355 } else if (val == 1) { 348 } else if (val == 1) {
356 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 349 blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
357 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); 350 blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
358 } else if (val == 0) { 351 } else if (val == 0) {
359 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 352 blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
360 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); 353 blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
361 } 354 }
362 spin_unlock_irq(q->queue_lock);
363#endif 355#endif
364 return ret; 356 return ret;
365} 357}
@@ -407,7 +399,8 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
407 unsigned long poll_on; 399 unsigned long poll_on;
408 ssize_t ret; 400 ssize_t ret;
409 401
410 if (!q->mq_ops || !q->mq_ops->poll) 402 if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
403 !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
411 return -EINVAL; 404 return -EINVAL;
412 405
413 ret = queue_var_store(&poll_on, page, count); 406 ret = queue_var_store(&poll_on, page, count);
@@ -422,6 +415,26 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
422 return ret; 415 return ret;
423} 416}
424 417
418static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
419{
420 return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout));
421}
422
423static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page,
424 size_t count)
425{
426 unsigned int val;
427 int err;
428
429 err = kstrtou32(page, 10, &val);
430 if (err || val == 0)
431 return -EINVAL;
432
433 blk_queue_rq_timeout(q, msecs_to_jiffies(val));
434
435 return count;
436}
437
425static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) 438static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
426{ 439{
427 if (!wbt_rq_qos(q)) 440 if (!wbt_rq_qos(q))
@@ -460,20 +473,14 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
460 * ends up either enabling or disabling wbt completely. We can't 473 * ends up either enabling or disabling wbt completely. We can't
461 * have IO inflight if that happens. 474 * have IO inflight if that happens.
462 */ 475 */
463 if (q->mq_ops) { 476 blk_mq_freeze_queue(q);
464 blk_mq_freeze_queue(q); 477 blk_mq_quiesce_queue(q);
465 blk_mq_quiesce_queue(q);
466 } else
467 blk_queue_bypass_start(q);
468 478
469 wbt_set_min_lat(q, val); 479 wbt_set_min_lat(q, val);
470 wbt_update_limits(q); 480 wbt_update_limits(q);
471 481
472 if (q->mq_ops) { 482 blk_mq_unquiesce_queue(q);
473 blk_mq_unquiesce_queue(q); 483 blk_mq_unfreeze_queue(q);
474 blk_mq_unfreeze_queue(q);
475 } else
476 blk_queue_bypass_end(q);
477 484
478 return count; 485 return count;
479} 486}
@@ -696,6 +703,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
696 .show = queue_dax_show, 703 .show = queue_dax_show,
697}; 704};
698 705
706static struct queue_sysfs_entry queue_io_timeout_entry = {
707 .attr = {.name = "io_timeout", .mode = 0644 },
708 .show = queue_io_timeout_show,
709 .store = queue_io_timeout_store,
710};
711
699static struct queue_sysfs_entry queue_wb_lat_entry = { 712static struct queue_sysfs_entry queue_wb_lat_entry = {
700 .attr = {.name = "wbt_lat_usec", .mode = 0644 }, 713 .attr = {.name = "wbt_lat_usec", .mode = 0644 },
701 .show = queue_wb_lat_show, 714 .show = queue_wb_lat_show,
@@ -745,6 +758,7 @@ static struct attribute *default_attrs[] = {
745 &queue_dax_entry.attr, 758 &queue_dax_entry.attr,
746 &queue_wb_lat_entry.attr, 759 &queue_wb_lat_entry.attr,
747 &queue_poll_delay_entry.attr, 760 &queue_poll_delay_entry.attr,
761 &queue_io_timeout_entry.attr,
748#ifdef CONFIG_BLK_DEV_THROTTLING_LOW 762#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
749 &throtl_sample_time_entry.attr, 763 &throtl_sample_time_entry.attr,
750#endif 764#endif
@@ -844,24 +858,14 @@ static void __blk_release_queue(struct work_struct *work)
844 858
845 blk_free_queue_stats(q->stats); 859 blk_free_queue_stats(q->stats);
846 860
847 blk_exit_rl(q, &q->root_rl);
848
849 if (q->queue_tags)
850 __blk_queue_free_tags(q);
851
852 blk_queue_free_zone_bitmaps(q); 861 blk_queue_free_zone_bitmaps(q);
853 862
854 if (!q->mq_ops) { 863 if (queue_is_mq(q))
855 if (q->exit_rq_fn)
856 q->exit_rq_fn(q, q->fq->flush_rq);
857 blk_free_flush_queue(q->fq);
858 } else {
859 blk_mq_release(q); 864 blk_mq_release(q);
860 }
861 865
862 blk_trace_shutdown(q); 866 blk_trace_shutdown(q);
863 867
864 if (q->mq_ops) 868 if (queue_is_mq(q))
865 blk_mq_debugfs_unregister(q); 869 blk_mq_debugfs_unregister(q);
866 870
867 bioset_exit(&q->bio_split); 871 bioset_exit(&q->bio_split);
@@ -906,7 +910,7 @@ int blk_register_queue(struct gendisk *disk)
906 WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), 910 WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
907 "%s is registering an already registered queue\n", 911 "%s is registering an already registered queue\n",
908 kobject_name(&dev->kobj)); 912 kobject_name(&dev->kobj));
909 queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); 913 blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
910 914
911 /* 915 /*
912 * SCSI probing may synchronously create and destroy a lot of 916 * SCSI probing may synchronously create and destroy a lot of
@@ -918,9 +922,8 @@ int blk_register_queue(struct gendisk *disk)
918 * request_queues for non-existent devices never get registered. 922 * request_queues for non-existent devices never get registered.
919 */ 923 */
920 if (!blk_queue_init_done(q)) { 924 if (!blk_queue_init_done(q)) {
921 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 925 blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
922 percpu_ref_switch_to_percpu(&q->q_usage_counter); 926 percpu_ref_switch_to_percpu(&q->q_usage_counter);
923 blk_queue_bypass_end(q);
924 } 927 }
925 928
926 ret = blk_trace_init_sysfs(dev); 929 ret = blk_trace_init_sysfs(dev);
@@ -936,7 +939,7 @@ int blk_register_queue(struct gendisk *disk)
936 goto unlock; 939 goto unlock;
937 } 940 }
938 941
939 if (q->mq_ops) { 942 if (queue_is_mq(q)) {
940 __blk_mq_register_dev(dev, q); 943 __blk_mq_register_dev(dev, q);
941 blk_mq_debugfs_register(q); 944 blk_mq_debugfs_register(q);
942 } 945 }
@@ -947,7 +950,7 @@ int blk_register_queue(struct gendisk *disk)
947 950
948 blk_throtl_register_queue(q); 951 blk_throtl_register_queue(q);
949 952
950 if (q->request_fn || (q->mq_ops && q->elevator)) { 953 if (q->elevator) {
951 ret = elv_register_queue(q); 954 ret = elv_register_queue(q);
952 if (ret) { 955 if (ret) {
953 mutex_unlock(&q->sysfs_lock); 956 mutex_unlock(&q->sysfs_lock);
@@ -996,7 +999,7 @@ void blk_unregister_queue(struct gendisk *disk)
996 * Remove the sysfs attributes before unregistering the queue data 999 * Remove the sysfs attributes before unregistering the queue data
997 * structures that can be modified through sysfs. 1000 * structures that can be modified through sysfs.
998 */ 1001 */
999 if (q->mq_ops) 1002 if (queue_is_mq(q))
1000 blk_mq_unregister_dev(disk_to_dev(disk), q); 1003 blk_mq_unregister_dev(disk_to_dev(disk), q);
1001 mutex_unlock(&q->sysfs_lock); 1004 mutex_unlock(&q->sysfs_lock);
1002 1005
@@ -1005,7 +1008,7 @@ void blk_unregister_queue(struct gendisk *disk)
1005 blk_trace_remove_sysfs(disk_to_dev(disk)); 1008 blk_trace_remove_sysfs(disk_to_dev(disk));
1006 1009
1007 mutex_lock(&q->sysfs_lock); 1010 mutex_lock(&q->sysfs_lock);
1008 if (q->request_fn || (q->mq_ops && q->elevator)) 1011 if (q->elevator)
1009 elv_unregister_queue(q); 1012 elv_unregister_queue(q);
1010 mutex_unlock(&q->sysfs_lock); 1013 mutex_unlock(&q->sysfs_lock);
1011 1014
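
Pieced together from the blk-sysfs.c hunks above, the register/unregister path after this change reduces to roughly the sketch below. This is not a verbatim copy of blk_register_queue(); error handling, locking and the sysfs boilerplate are elided.

	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
	if (!blk_queue_init_done(q)) {
		blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
		percpu_ref_switch_to_percpu(&q->q_usage_counter);
	}
	if (queue_is_mq(q)) {
		__blk_mq_register_dev(dev, q);
		blk_mq_debugfs_register(q);
	}
	blk_throtl_register_queue(q);
	if (q->elevator)			/* only blk-mq elevators remain */
		ret = elv_register_queue(q);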
diff --git a/block/blk-tag.c b/block/blk-tag.c
deleted file mode 100644
index fbc153aef166..000000000000
--- a/block/blk-tag.c
+++ /dev/null
@@ -1,378 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Functions related to tagged command queuing
4 */
5#include <linux/kernel.h>
6#include <linux/module.h>
7#include <linux/bio.h>
8#include <linux/blkdev.h>
9#include <linux/slab.h>
10
11#include "blk.h"
12
13/**
14 * blk_queue_find_tag - find a request by its tag and queue
15 * @q: The request queue for the device
16 * @tag: The tag of the request
17 *
18 * Notes:
19 * Should be used when a device returns a tag and you want to match
20 * it with a request.
21 *
22 * no locks need be held.
23 **/
24struct request *blk_queue_find_tag(struct request_queue *q, int tag)
25{
26 return blk_map_queue_find_tag(q->queue_tags, tag);
27}
28EXPORT_SYMBOL(blk_queue_find_tag);
29
30/**
31 * blk_free_tags - release a given set of tag maintenance info
32 * @bqt: the tag map to free
33 *
34 * Drop the reference count on @bqt and frees it when the last reference
35 * is dropped.
36 */
37void blk_free_tags(struct blk_queue_tag *bqt)
38{
39 if (atomic_dec_and_test(&bqt->refcnt)) {
40 BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
41 bqt->max_depth);
42
43 kfree(bqt->tag_index);
44 bqt->tag_index = NULL;
45
46 kfree(bqt->tag_map);
47 bqt->tag_map = NULL;
48
49 kfree(bqt);
50 }
51}
52EXPORT_SYMBOL(blk_free_tags);
53
54/**
55 * __blk_queue_free_tags - release tag maintenance info
56 * @q: the request queue for the device
57 *
58 * Notes:
59 * blk_cleanup_queue() will take care of calling this function, if tagging
60 * has been used. So there's no need to call this directly.
61 **/
62void __blk_queue_free_tags(struct request_queue *q)
63{
64 struct blk_queue_tag *bqt = q->queue_tags;
65
66 if (!bqt)
67 return;
68
69 blk_free_tags(bqt);
70
71 q->queue_tags = NULL;
72 queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
73}
74
75/**
76 * blk_queue_free_tags - release tag maintenance info
77 * @q: the request queue for the device
78 *
79 * Notes:
80 * This is used to disable tagged queuing to a device, yet leave
81 * queue in function.
82 **/
83void blk_queue_free_tags(struct request_queue *q)
84{
85 queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
86}
87EXPORT_SYMBOL(blk_queue_free_tags);
88
89static int
90init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
91{
92 struct request **tag_index;
93 unsigned long *tag_map;
94 int nr_ulongs;
95
96 if (q && depth > q->nr_requests * 2) {
97 depth = q->nr_requests * 2;
98 printk(KERN_ERR "%s: adjusted depth to %d\n",
99 __func__, depth);
100 }
101
102 tag_index = kcalloc(depth, sizeof(struct request *), GFP_ATOMIC);
103 if (!tag_index)
104 goto fail;
105
106 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
107 tag_map = kcalloc(nr_ulongs, sizeof(unsigned long), GFP_ATOMIC);
108 if (!tag_map)
109 goto fail;
110
111 tags->real_max_depth = depth;
112 tags->max_depth = depth;
113 tags->tag_index = tag_index;
114 tags->tag_map = tag_map;
115
116 return 0;
117fail:
118 kfree(tag_index);
119 return -ENOMEM;
120}
121
122static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
123 int depth, int alloc_policy)
124{
125 struct blk_queue_tag *tags;
126
127 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
128 if (!tags)
129 goto fail;
130
131 if (init_tag_map(q, tags, depth))
132 goto fail;
133
134 atomic_set(&tags->refcnt, 1);
135 tags->alloc_policy = alloc_policy;
136 tags->next_tag = 0;
137 return tags;
138fail:
139 kfree(tags);
140 return NULL;
141}
142
143/**
144 * blk_init_tags - initialize the tag info for an external tag map
145 * @depth: the maximum queue depth supported
146 * @alloc_policy: tag allocation policy
147 **/
148struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy)
149{
150 return __blk_queue_init_tags(NULL, depth, alloc_policy);
151}
152EXPORT_SYMBOL(blk_init_tags);
153
154/**
155 * blk_queue_init_tags - initialize the queue tag info
156 * @q: the request queue for the device
157 * @depth: the maximum queue depth supported
158 * @tags: the tag to use
159 * @alloc_policy: tag allocation policy
160 *
161 * Queue lock must be held here if the function is called to resize an
162 * existing map.
163 **/
164int blk_queue_init_tags(struct request_queue *q, int depth,
165 struct blk_queue_tag *tags, int alloc_policy)
166{
167 int rc;
168
169 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
170
171 if (!tags && !q->queue_tags) {
172 tags = __blk_queue_init_tags(q, depth, alloc_policy);
173
174 if (!tags)
175 return -ENOMEM;
176
177 } else if (q->queue_tags) {
178 rc = blk_queue_resize_tags(q, depth);
179 if (rc)
180 return rc;
181 queue_flag_set(QUEUE_FLAG_QUEUED, q);
182 return 0;
183 } else
184 atomic_inc(&tags->refcnt);
185
186 /*
187 * assign it, all done
188 */
189 q->queue_tags = tags;
190 queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
191 return 0;
192}
193EXPORT_SYMBOL(blk_queue_init_tags);
194
195/**
196 * blk_queue_resize_tags - change the queueing depth
197 * @q: the request queue for the device
198 * @new_depth: the new max command queueing depth
199 *
200 * Notes:
201 * Must be called with the queue lock held.
202 **/
203int blk_queue_resize_tags(struct request_queue *q, int new_depth)
204{
205 struct blk_queue_tag *bqt = q->queue_tags;
206 struct request **tag_index;
207 unsigned long *tag_map;
208 int max_depth, nr_ulongs;
209
210 if (!bqt)
211 return -ENXIO;
212
213 /*
214 * if we already have large enough real_max_depth. just
215 * adjust max_depth. *NOTE* as requests with tag value
216 * between new_depth and real_max_depth can be in-flight, tag
217 * map can not be shrunk blindly here.
218 */
219 if (new_depth <= bqt->real_max_depth) {
220 bqt->max_depth = new_depth;
221 return 0;
222 }
223
224 /*
225 * Currently cannot replace a shared tag map with a new
226 * one, so error out if this is the case
227 */
228 if (atomic_read(&bqt->refcnt) != 1)
229 return -EBUSY;
230
231 /*
232 * save the old state info, so we can copy it back
233 */
234 tag_index = bqt->tag_index;
235 tag_map = bqt->tag_map;
236 max_depth = bqt->real_max_depth;
237
238 if (init_tag_map(q, bqt, new_depth))
239 return -ENOMEM;
240
241 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
242 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
243 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
244
245 kfree(tag_index);
246 kfree(tag_map);
247 return 0;
248}
249EXPORT_SYMBOL(blk_queue_resize_tags);
250
251/**
252 * blk_queue_end_tag - end tag operations for a request
253 * @q: the request queue for the device
254 * @rq: the request that has completed
255 *
256 * Description:
257 * Typically called when end_that_request_first() returns %0, meaning
258 * all transfers have been done for a request. It's important to call
259 * this function before end_that_request_last(), as that will put the
260 * request back on the free list thus corrupting the internal tag list.
261 **/
262void blk_queue_end_tag(struct request_queue *q, struct request *rq)
263{
264 struct blk_queue_tag *bqt = q->queue_tags;
265 unsigned tag = rq->tag; /* negative tags invalid */
266
267 lockdep_assert_held(q->queue_lock);
268
269 BUG_ON(tag >= bqt->real_max_depth);
270
271 list_del_init(&rq->queuelist);
272 rq->rq_flags &= ~RQF_QUEUED;
273 rq->tag = -1;
274 rq->internal_tag = -1;
275
276 if (unlikely(bqt->tag_index[tag] == NULL))
277 printk(KERN_ERR "%s: tag %d is missing\n",
278 __func__, tag);
279
280 bqt->tag_index[tag] = NULL;
281
282 if (unlikely(!test_bit(tag, bqt->tag_map))) {
283 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
284 __func__, tag);
285 return;
286 }
287 /*
288 * The tag_map bit acts as a lock for tag_index[bit], so we need
289 * unlock memory barrier semantics.
290 */
291 clear_bit_unlock(tag, bqt->tag_map);
292}
293
294/**
295 * blk_queue_start_tag - find a free tag and assign it
296 * @q: the request queue for the device
297 * @rq: the block request that needs tagging
298 *
299 * Description:
300 * This can either be used as a stand-alone helper, or possibly be
301 * assigned as the queue &prep_rq_fn (in which case &struct request
302 * automagically gets a tag assigned). Note that this function
303 * assumes that any type of request can be queued! if this is not
304 * true for your device, you must check the request type before
305 * calling this function. The request will also be removed from
306 * the request queue, so it's the drivers responsibility to readd
307 * it if it should need to be restarted for some reason.
308 **/
309int blk_queue_start_tag(struct request_queue *q, struct request *rq)
310{
311 struct blk_queue_tag *bqt = q->queue_tags;
312 unsigned max_depth;
313 int tag;
314
315 lockdep_assert_held(q->queue_lock);
316
317 if (unlikely((rq->rq_flags & RQF_QUEUED))) {
318 printk(KERN_ERR
319 "%s: request %p for device [%s] already tagged %d",
320 __func__, rq,
321 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
322 BUG();
323 }
324
325 /*
326 * Protect against shared tag maps, as we may not have exclusive
327 * access to the tag map.
328 *
329 * We reserve a few tags just for sync IO, since we don't want
330 * to starve sync IO on behalf of flooding async IO.
331 */
332 max_depth = bqt->max_depth;
333 if (!rq_is_sync(rq) && max_depth > 1) {
334 switch (max_depth) {
335 case 2:
336 max_depth = 1;
337 break;
338 case 3:
339 max_depth = 2;
340 break;
341 default:
342 max_depth -= 2;
343 }
344 if (q->in_flight[BLK_RW_ASYNC] > max_depth)
345 return 1;
346 }
347
348 do {
349 if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) {
350 tag = find_first_zero_bit(bqt->tag_map, max_depth);
351 if (tag >= max_depth)
352 return 1;
353 } else {
354 int start = bqt->next_tag;
355 int size = min_t(int, bqt->max_depth, max_depth + start);
356 tag = find_next_zero_bit(bqt->tag_map, size, start);
357 if (tag >= size && start + size > bqt->max_depth) {
358 size = start + size - bqt->max_depth;
359 tag = find_first_zero_bit(bqt->tag_map, size);
360 }
361 if (tag >= size)
362 return 1;
363 }
364
365 } while (test_and_set_bit_lock(tag, bqt->tag_map));
366 /*
367 * We need lock ordering semantics given by test_and_set_bit_lock.
368 * See blk_queue_end_tag for details.
369 */
370
371 bqt->next_tag = (tag + 1) % bqt->max_depth;
372 rq->rq_flags |= RQF_QUEUED;
373 rq->tag = tag;
374 bqt->tag_index[tag] = rq;
375 blk_start_request(rq);
376 return 0;
377}
378EXPORT_SYMBOL(blk_queue_start_tag);
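
With blk-tag.c gone, the legacy tagged-command-queuing helpers have no single-queue replacement; drivers that need per-command tags are expected to go through blk-mq, where the tag map is owned by a struct blk_mq_tag_set. A minimal sketch of that setup, mirroring the bsg-lib conversion further down (my_mq_ops, my_pdu and the depth are placeholders, not names from this commit):

	struct blk_mq_tag_set *set = &my_set;	/* hypothetical driver-owned set */
	struct request_queue *q;

	set->ops	  = &my_mq_ops;		/* driver's blk_mq_ops */
	set->nr_hw_queues = 1;
	set->queue_depth  = 128;		/* stands in for blk_queue_init_tags() depth */
	set->numa_node	  = NUMA_NO_NODE;
	set->cmd_size	  = sizeof(struct my_pdu);
	if (blk_mq_alloc_tag_set(set))
		return -ENOMEM;
	q = blk_mq_init_queue(set);		/* rq->tag is now assigned by blk-mq */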
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index db1a3a2ae006..1b97a73d2fb1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1243,7 +1243,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
1243 bool dispatched; 1243 bool dispatched;
1244 int ret; 1244 int ret;
1245 1245
1246 spin_lock_irq(q->queue_lock); 1246 spin_lock_irq(&q->queue_lock);
1247 if (throtl_can_upgrade(td, NULL)) 1247 if (throtl_can_upgrade(td, NULL))
1248 throtl_upgrade_state(td); 1248 throtl_upgrade_state(td);
1249 1249
@@ -1266,9 +1266,9 @@ again:
1266 break; 1266 break;
1267 1267
1268 /* this dispatch window is still open, relax and repeat */ 1268 /* this dispatch window is still open, relax and repeat */
1269 spin_unlock_irq(q->queue_lock); 1269 spin_unlock_irq(&q->queue_lock);
1270 cpu_relax(); 1270 cpu_relax();
1271 spin_lock_irq(q->queue_lock); 1271 spin_lock_irq(&q->queue_lock);
1272 } 1272 }
1273 1273
1274 if (!dispatched) 1274 if (!dispatched)
@@ -1290,7 +1290,7 @@ again:
1290 queue_work(kthrotld_workqueue, &td->dispatch_work); 1290 queue_work(kthrotld_workqueue, &td->dispatch_work);
1291 } 1291 }
1292out_unlock: 1292out_unlock:
1293 spin_unlock_irq(q->queue_lock); 1293 spin_unlock_irq(&q->queue_lock);
1294} 1294}
1295 1295
1296/** 1296/**
@@ -1314,11 +1314,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1314 1314
1315 bio_list_init(&bio_list_on_stack); 1315 bio_list_init(&bio_list_on_stack);
1316 1316
1317 spin_lock_irq(q->queue_lock); 1317 spin_lock_irq(&q->queue_lock);
1318 for (rw = READ; rw <= WRITE; rw++) 1318 for (rw = READ; rw <= WRITE; rw++)
1319 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1319 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
1320 bio_list_add(&bio_list_on_stack, bio); 1320 bio_list_add(&bio_list_on_stack, bio);
1321 spin_unlock_irq(q->queue_lock); 1321 spin_unlock_irq(&q->queue_lock);
1322 1322
1323 if (!bio_list_empty(&bio_list_on_stack)) { 1323 if (!bio_list_empty(&bio_list_on_stack)) {
1324 blk_start_plug(&plug); 1324 blk_start_plug(&plug);
@@ -2115,16 +2115,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
2115} 2115}
2116#endif 2116#endif
2117 2117
2118static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
2119{
2120#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2121 /* fallback to root_blkg if we fail to get a blkg ref */
2122 if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
2123 bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
2124 bio_issue_init(&bio->bi_issue, bio_sectors(bio));
2125#endif
2126}
2127
2128bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, 2118bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
2129 struct bio *bio) 2119 struct bio *bio)
2130{ 2120{
@@ -2141,14 +2131,10 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
2141 if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) 2131 if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
2142 goto out; 2132 goto out;
2143 2133
2144 spin_lock_irq(q->queue_lock); 2134 spin_lock_irq(&q->queue_lock);
2145 2135
2146 throtl_update_latency_buckets(td); 2136 throtl_update_latency_buckets(td);
2147 2137
2148 if (unlikely(blk_queue_bypass(q)))
2149 goto out_unlock;
2150
2151 blk_throtl_assoc_bio(tg, bio);
2152 blk_throtl_update_idletime(tg); 2138 blk_throtl_update_idletime(tg);
2153 2139
2154 sq = &tg->service_queue; 2140 sq = &tg->service_queue;
@@ -2227,7 +2213,7 @@ again:
2227 } 2213 }
2228 2214
2229out_unlock: 2215out_unlock:
2230 spin_unlock_irq(q->queue_lock); 2216 spin_unlock_irq(&q->queue_lock);
2231out: 2217out:
2232 bio_set_flag(bio, BIO_THROTTLED); 2218 bio_set_flag(bio, BIO_THROTTLED);
2233 2219
@@ -2348,7 +2334,7 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
2348 * Dispatch all currently throttled bios on @q through ->make_request_fn(). 2334 * Dispatch all currently throttled bios on @q through ->make_request_fn().
2349 */ 2335 */
2350void blk_throtl_drain(struct request_queue *q) 2336void blk_throtl_drain(struct request_queue *q)
2351 __releases(q->queue_lock) __acquires(q->queue_lock) 2337 __releases(&q->queue_lock) __acquires(&q->queue_lock)
2352{ 2338{
2353 struct throtl_data *td = q->td; 2339 struct throtl_data *td = q->td;
2354 struct blkcg_gq *blkg; 2340 struct blkcg_gq *blkg;
@@ -2356,7 +2342,6 @@ void blk_throtl_drain(struct request_queue *q)
2356 struct bio *bio; 2342 struct bio *bio;
2357 int rw; 2343 int rw;
2358 2344
2359 queue_lockdep_assert_held(q);
2360 rcu_read_lock(); 2345 rcu_read_lock();
2361 2346
2362 /* 2347 /*
@@ -2372,7 +2357,7 @@ void blk_throtl_drain(struct request_queue *q)
2372 tg_drain_bios(&td->service_queue); 2357 tg_drain_bios(&td->service_queue);
2373 2358
2374 rcu_read_unlock(); 2359 rcu_read_unlock();
2375 spin_unlock_irq(q->queue_lock); 2360 spin_unlock_irq(&q->queue_lock);
2376 2361
2377 /* all bios now should be in td->service_queue, issue them */ 2362 /* all bios now should be in td->service_queue, issue them */
2378 for (rw = READ; rw <= WRITE; rw++) 2363 for (rw = READ; rw <= WRITE; rw++)
@@ -2380,7 +2365,7 @@ void blk_throtl_drain(struct request_queue *q)
2380 NULL))) 2365 NULL)))
2381 generic_make_request(bio); 2366 generic_make_request(bio);
2382 2367
2383 spin_lock_irq(q->queue_lock); 2368 spin_lock_irq(&q->queue_lock);
2384} 2369}
2385 2370
2386int blk_throtl_init(struct request_queue *q) 2371int blk_throtl_init(struct request_queue *q)
@@ -2460,7 +2445,7 @@ void blk_throtl_register_queue(struct request_queue *q)
2460 td->throtl_slice = DFL_THROTL_SLICE_HD; 2445 td->throtl_slice = DFL_THROTL_SLICE_HD;
2461#endif 2446#endif
2462 2447
2463 td->track_bio_latency = !queue_is_rq_based(q); 2448 td->track_bio_latency = !queue_is_mq(q);
2464 if (!td->track_bio_latency) 2449 if (!td->track_bio_latency)
2465 blk_stat_enable_accounting(q); 2450 blk_stat_enable_accounting(q);
2466} 2451}
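
The locking changes in blk-throttle.c are mechanical: the queue lock is now a spinlock embedded in the request queue rather than a pointer a legacy driver could supply, so every call site switches from q->queue_lock to &q->queue_lock. The resulting pattern, shown only to make the repeated one-character diffs explicit:

	spin_lock_irq(&q->queue_lock);		/* was: spin_lock_irq(q->queue_lock) */
	/* ... touch throttle state ... */
	spin_unlock_irq(&q->queue_lock);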
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index f2cfd56e1606..124c26128bf6 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -68,80 +68,6 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
68 68
69#endif /* CONFIG_FAIL_IO_TIMEOUT */ 69#endif /* CONFIG_FAIL_IO_TIMEOUT */
70 70
71/*
72 * blk_delete_timer - Delete/cancel timer for a given function.
73 * @req: request that we are canceling timer for
74 *
75 */
76void blk_delete_timer(struct request *req)
77{
78 list_del_init(&req->timeout_list);
79}
80
81static void blk_rq_timed_out(struct request *req)
82{
83 struct request_queue *q = req->q;
84 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
85
86 if (q->rq_timed_out_fn)
87 ret = q->rq_timed_out_fn(req);
88 switch (ret) {
89 case BLK_EH_RESET_TIMER:
90 blk_add_timer(req);
91 blk_clear_rq_complete(req);
92 break;
93 case BLK_EH_DONE:
94 /*
95 * LLD handles this for now but in the future
96 * we can send a request msg to abort the command
97 * and we can move more of the generic scsi eh code to
98 * the blk layer.
99 */
100 break;
101 default:
102 printk(KERN_ERR "block: bad eh return: %d\n", ret);
103 break;
104 }
105}
106
107static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
108 unsigned int *next_set)
109{
110 const unsigned long deadline = blk_rq_deadline(rq);
111
112 if (time_after_eq(jiffies, deadline)) {
113 list_del_init(&rq->timeout_list);
114
115 /*
116 * Check if we raced with end io completion
117 */
118 if (!blk_mark_rq_complete(rq))
119 blk_rq_timed_out(rq);
120 } else if (!*next_set || time_after(*next_timeout, deadline)) {
121 *next_timeout = deadline;
122 *next_set = 1;
123 }
124}
125
126void blk_timeout_work(struct work_struct *work)
127{
128 struct request_queue *q =
129 container_of(work, struct request_queue, timeout_work);
130 unsigned long flags, next = 0;
131 struct request *rq, *tmp;
132 int next_set = 0;
133
134 spin_lock_irqsave(q->queue_lock, flags);
135
136 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
137 blk_rq_check_expired(rq, &next, &next_set);
138
139 if (next_set)
140 mod_timer(&q->timeout, round_jiffies_up(next));
141
142 spin_unlock_irqrestore(q->queue_lock, flags);
143}
144
145/** 71/**
146 * blk_abort_request -- Request request recovery for the specified command 72 * blk_abort_request -- Request request recovery for the specified command
147 * @req: pointer to the request of interest 73 * @req: pointer to the request of interest
@@ -149,24 +75,17 @@ void blk_timeout_work(struct work_struct *work)
149 * This function requests that the block layer start recovery for the 75 * This function requests that the block layer start recovery for the
150 * request by deleting the timer and calling the q's timeout function. 76 * request by deleting the timer and calling the q's timeout function.
151 * LLDDs who implement their own error recovery MAY ignore the timeout 77 * LLDDs who implement their own error recovery MAY ignore the timeout
152 * event if they generated blk_abort_req. Must hold queue lock. 78 * event if they generated blk_abort_request.
153 */ 79 */
154void blk_abort_request(struct request *req) 80void blk_abort_request(struct request *req)
155{ 81{
156 if (req->q->mq_ops) { 82 /*
157 /* 83 * All we need to ensure is that timeout scan takes place
158 * All we need to ensure is that timeout scan takes place 84 * immediately and that scan sees the new timeout value.
159 * immediately and that scan sees the new timeout value. 85 * No need for fancy synchronizations.
160 * No need for fancy synchronizations. 86 */
161 */ 87 WRITE_ONCE(req->deadline, jiffies);
162 blk_rq_set_deadline(req, jiffies); 88 kblockd_schedule_work(&req->q->timeout_work);
163 kblockd_schedule_work(&req->q->timeout_work);
164 } else {
165 if (blk_mark_rq_complete(req))
166 return;
167 blk_delete_timer(req);
168 blk_rq_timed_out(req);
169 }
170} 89}
171EXPORT_SYMBOL_GPL(blk_abort_request); 90EXPORT_SYMBOL_GPL(blk_abort_request);
172 91
@@ -194,15 +113,6 @@ void blk_add_timer(struct request *req)
194 struct request_queue *q = req->q; 113 struct request_queue *q = req->q;
195 unsigned long expiry; 114 unsigned long expiry;
196 115
197 if (!q->mq_ops)
198 lockdep_assert_held(q->queue_lock);
199
200 /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
201 if (!q->mq_ops && !q->rq_timed_out_fn)
202 return;
203
204 BUG_ON(!list_empty(&req->timeout_list));
205
206 /* 116 /*
207 * Some LLDs, like scsi, peek at the timeout to prevent a 117 * Some LLDs, like scsi, peek at the timeout to prevent a
208 * command from being retried forever. 118 * command from being retried forever.
@@ -211,21 +121,16 @@ void blk_add_timer(struct request *req)
211 req->timeout = q->rq_timeout; 121 req->timeout = q->rq_timeout;
212 122
213 req->rq_flags &= ~RQF_TIMED_OUT; 123 req->rq_flags &= ~RQF_TIMED_OUT;
214 blk_rq_set_deadline(req, jiffies + req->timeout);
215 124
216 /* 125 expiry = jiffies + req->timeout;
217 * Only the non-mq case needs to add the request to a protected list. 126 WRITE_ONCE(req->deadline, expiry);
218 * For the mq case we simply scan the tag map.
219 */
220 if (!q->mq_ops)
221 list_add_tail(&req->timeout_list, &req->q->timeout_list);
222 127
223 /* 128 /*
224 * If the timer isn't already pending or this timeout is earlier 129 * If the timer isn't already pending or this timeout is earlier
225 * than an existing one, modify the timer. Round up to next nearest 130 * than an existing one, modify the timer. Round up to next nearest
226 * second. 131 * second.
227 */ 132 */
228 expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); 133 expiry = blk_rq_timeout(round_jiffies_up(expiry));
229 134
230 if (!timer_pending(&q->timeout) || 135 if (!timer_pending(&q->timeout) ||
231 time_before(expiry, q->timeout.expires)) { 136 time_before(expiry, q->timeout.expires)) {
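
Because the side-by-side rendering interleaves the old and new bodies, here is the new blk_abort_request() reassembled from the right-hand column of the hunk above:

	void blk_abort_request(struct request *req)
	{
		/*
		 * All we need to ensure is that timeout scan takes place
		 * immediately and that scan sees the new timeout value.
		 * No need for fancy synchronizations.
		 */
		WRITE_ONCE(req->deadline, jiffies);
		kblockd_schedule_work(&req->q->timeout_work);
	}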
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 8ac93fcbaa2e..f0c56649775f 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -489,31 +489,21 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
489} 489}
490 490
491struct wbt_wait_data { 491struct wbt_wait_data {
492 struct wait_queue_entry wq;
493 struct task_struct *task;
494 struct rq_wb *rwb; 492 struct rq_wb *rwb;
495 struct rq_wait *rqw; 493 enum wbt_flags wb_acct;
496 unsigned long rw; 494 unsigned long rw;
497 bool got_token;
498}; 495};
499 496
500static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, 497static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
501 int wake_flags, void *key)
502{ 498{
503 struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, 499 struct wbt_wait_data *data = private_data;
504 wq); 500 return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
505 501}
506 /*
507 * If we fail to get a budget, return -1 to interrupt the wake up
508 * loop in __wake_up_common.
509 */
510 if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
511 return -1;
512 502
513 data->got_token = true; 503static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
514 list_del_init(&curr->entry); 504{
515 wake_up_process(data->task); 505 struct wbt_wait_data *data = private_data;
516 return 1; 506 wbt_rqw_done(data->rwb, rqw, data->wb_acct);
517} 507}
518 508
519/* 509/*
@@ -521,57 +511,16 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
521 * the timer to kick off queuing again. 511 * the timer to kick off queuing again.
522 */ 512 */
523static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, 513static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
524 unsigned long rw, spinlock_t *lock) 514 unsigned long rw)
525 __releases(lock)
526 __acquires(lock)
527{ 515{
528 struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); 516 struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
529 struct wbt_wait_data data = { 517 struct wbt_wait_data data = {
530 .wq = {
531 .func = wbt_wake_function,
532 .entry = LIST_HEAD_INIT(data.wq.entry),
533 },
534 .task = current,
535 .rwb = rwb, 518 .rwb = rwb,
536 .rqw = rqw, 519 .wb_acct = wb_acct,
537 .rw = rw, 520 .rw = rw,
538 }; 521 };
539 bool has_sleeper;
540
541 has_sleeper = wq_has_sleeper(&rqw->wait);
542 if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
543 return;
544 522
545 prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); 523 rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
546 do {
547 if (data.got_token)
548 break;
549
550 if (!has_sleeper &&
551 rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
552 finish_wait(&rqw->wait, &data.wq);
553
554 /*
555 * We raced with wbt_wake_function() getting a token,
556 * which means we now have two. Put our local token
557 * and wake anyone else potentially waiting for one.
558 */
559 if (data.got_token)
560 wbt_rqw_done(rwb, rqw, wb_acct);
561 break;
562 }
563
564 if (lock) {
565 spin_unlock_irq(lock);
566 io_schedule();
567 spin_lock_irq(lock);
568 } else
569 io_schedule();
570
571 has_sleeper = false;
572 } while (1);
573
574 finish_wait(&rqw->wait, &data.wq);
575} 524}
576 525
577static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) 526static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -624,7 +573,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
624 * in an irq held spinlock, if it holds one when calling this function. 573 * in an irq held spinlock, if it holds one when calling this function.
625 * If we do sleep, we'll release and re-grab it. 574 * If we do sleep, we'll release and re-grab it.
626 */ 575 */
627static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) 576static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
628{ 577{
629 struct rq_wb *rwb = RQWB(rqos); 578 struct rq_wb *rwb = RQWB(rqos);
630 enum wbt_flags flags; 579 enum wbt_flags flags;
@@ -636,7 +585,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
636 return; 585 return;
637 } 586 }
638 587
639 __wbt_wait(rwb, flags, bio->bi_opf, lock); 588 __wbt_wait(rwb, flags, bio->bi_opf);
640 589
641 if (!blk_stat_is_active(rwb->cb)) 590 if (!blk_stat_is_active(rwb->cb))
642 rwb_arm_timer(rwb); 591 rwb_arm_timer(rwb);
@@ -709,8 +658,7 @@ void wbt_enable_default(struct request_queue *q)
709 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) 658 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
710 return; 659 return;
711 660
712 if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || 661 if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
713 (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
714 wbt_init(q); 662 wbt_init(q);
715} 663}
716EXPORT_SYMBOL_GPL(wbt_enable_default); 664EXPORT_SYMBOL_GPL(wbt_enable_default);
@@ -760,11 +708,100 @@ void wbt_disable_default(struct request_queue *q)
760 if (!rqos) 708 if (!rqos)
761 return; 709 return;
762 rwb = RQWB(rqos); 710 rwb = RQWB(rqos);
763 if (rwb->enable_state == WBT_STATE_ON_DEFAULT) 711 if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
712 blk_stat_deactivate(rwb->cb);
764 rwb->wb_normal = 0; 713 rwb->wb_normal = 0;
714 }
765} 715}
766EXPORT_SYMBOL_GPL(wbt_disable_default); 716EXPORT_SYMBOL_GPL(wbt_disable_default);
767 717
718#ifdef CONFIG_BLK_DEBUG_FS
719static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
720{
721 struct rq_qos *rqos = data;
722 struct rq_wb *rwb = RQWB(rqos);
723
724 seq_printf(m, "%llu\n", rwb->cur_win_nsec);
725 return 0;
726}
727
728static int wbt_enabled_show(void *data, struct seq_file *m)
729{
730 struct rq_qos *rqos = data;
731 struct rq_wb *rwb = RQWB(rqos);
732
733 seq_printf(m, "%d\n", rwb->enable_state);
734 return 0;
735}
736
737static int wbt_id_show(void *data, struct seq_file *m)
738{
739 struct rq_qos *rqos = data;
740
741 seq_printf(m, "%u\n", rqos->id);
742 return 0;
743}
744
745static int wbt_inflight_show(void *data, struct seq_file *m)
746{
747 struct rq_qos *rqos = data;
748 struct rq_wb *rwb = RQWB(rqos);
749 int i;
750
751 for (i = 0; i < WBT_NUM_RWQ; i++)
752 seq_printf(m, "%d: inflight %d\n", i,
753 atomic_read(&rwb->rq_wait[i].inflight));
754 return 0;
755}
756
757static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
758{
759 struct rq_qos *rqos = data;
760 struct rq_wb *rwb = RQWB(rqos);
761
762 seq_printf(m, "%lu\n", rwb->min_lat_nsec);
763 return 0;
764}
765
766static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
767{
768 struct rq_qos *rqos = data;
769 struct rq_wb *rwb = RQWB(rqos);
770
771 seq_printf(m, "%u\n", rwb->unknown_cnt);
772 return 0;
773}
774
775static int wbt_normal_show(void *data, struct seq_file *m)
776{
777 struct rq_qos *rqos = data;
778 struct rq_wb *rwb = RQWB(rqos);
779
780 seq_printf(m, "%u\n", rwb->wb_normal);
781 return 0;
782}
783
784static int wbt_background_show(void *data, struct seq_file *m)
785{
786 struct rq_qos *rqos = data;
787 struct rq_wb *rwb = RQWB(rqos);
788
789 seq_printf(m, "%u\n", rwb->wb_background);
790 return 0;
791}
792
793static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
794 {"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
795 {"enabled", 0400, wbt_enabled_show},
796 {"id", 0400, wbt_id_show},
797 {"inflight", 0400, wbt_inflight_show},
798 {"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
799 {"unknown_cnt", 0400, wbt_unknown_cnt_show},
800 {"wb_normal", 0400, wbt_normal_show},
801 {"wb_background", 0400, wbt_background_show},
802 {},
803};
804#endif
768 805
769static struct rq_qos_ops wbt_rqos_ops = { 806static struct rq_qos_ops wbt_rqos_ops = {
770 .throttle = wbt_wait, 807 .throttle = wbt_wait,
@@ -774,6 +811,9 @@ static struct rq_qos_ops wbt_rqos_ops = {
774 .done = wbt_done, 811 .done = wbt_done,
775 .cleanup = wbt_cleanup, 812 .cleanup = wbt_cleanup,
776 .exit = wbt_exit, 813 .exit = wbt_exit,
814#ifdef CONFIG_BLK_DEBUG_FS
815 .debugfs_attrs = wbt_debugfs_attrs,
816#endif
777}; 817};
778 818
779int wbt_init(struct request_queue *q) 819int wbt_init(struct request_queue *q)
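
The __wbt_wait() rework is hard to follow in the interleaved rendering. Reassembled from the right-hand column, the throttling path now hands its limit check and cleanup to rq_qos_wait() as callbacks, with the generic sleep/wake loop presumably living in blk-rq-qos.c (which grows in the diffstat):

	static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
	{
		struct wbt_wait_data *data = private_data;

		return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
	}

	static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
	{
		struct wbt_wait_data *data = private_data;

		wbt_rqw_done(data->rwb, rqw, data->wb_acct);
	}

	/* inside __wbt_wait() */
	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);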
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 13ba2011a306..2d98803faec2 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -378,7 +378,7 @@ static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
378 struct page *page; 378 struct page *page;
379 int order; 379 int order;
380 380
381 for (order = get_order(size); order > 0; order--) { 381 for (order = get_order(size); order >= 0; order--) {
382 page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); 382 page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
383 if (page) { 383 if (page) {
384 *nr_zones = min_t(unsigned int, *nr_zones, 384 *nr_zones = min_t(unsigned int, *nr_zones,
@@ -421,7 +421,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
421 * BIO based queues do not use a scheduler so only q->nr_zones 421 * BIO based queues do not use a scheduler so only q->nr_zones
422 * needs to be updated so that the sysfs exposed value is correct. 422 * needs to be updated so that the sysfs exposed value is correct.
423 */ 423 */
424 if (!queue_is_rq_based(q)) { 424 if (!queue_is_mq(q)) {
425 q->nr_zones = nr_zones; 425 q->nr_zones = nr_zones;
426 return 0; 426 return 0;
427 } 427 }
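
The one-character change in blk_alloc_zones() fixes the loop bound: with `order > 0` an order-0 (single page) allocation was never attempted, and when get_order(size) was already 0 the loop body never ran at all. The corrected fallback, with the rest of the function elided:

	for (order = get_order(size); order >= 0; order--) {
		page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
		if (page)
			break;	/* order 0 is now tried as the last resort */
	}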
diff --git a/block/blk.h b/block/blk.h
index a1841b8ff129..848278c52030 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -7,12 +7,6 @@
7#include <xen/xen.h> 7#include <xen/xen.h>
8#include "blk-mq.h" 8#include "blk-mq.h"
9 9
10/* Amount of time in which a process may batch requests */
11#define BLK_BATCH_TIME (HZ/50UL)
12
13/* Number of requests a "batching" process may submit */
14#define BLK_BATCH_REQ 32
15
16/* Max future timer expiry for timeouts */ 10/* Max future timer expiry for timeouts */
17#define BLK_MAX_TIMEOUT (5 * HZ) 11#define BLK_MAX_TIMEOUT (5 * HZ)
18 12
@@ -38,85 +32,13 @@ struct blk_flush_queue {
38}; 32};
39 33
40extern struct kmem_cache *blk_requestq_cachep; 34extern struct kmem_cache *blk_requestq_cachep;
41extern struct kmem_cache *request_cachep;
42extern struct kobj_type blk_queue_ktype; 35extern struct kobj_type blk_queue_ktype;
43extern struct ida blk_queue_ida; 36extern struct ida blk_queue_ida;
44 37
45/* 38static inline struct blk_flush_queue *
46 * @q->queue_lock is set while a queue is being initialized. Since we know 39blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
47 * that no other threads access the queue object before @q->queue_lock has
48 * been set, it is safe to manipulate queue flags without holding the
49 * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and
50 * blk_init_allocated_queue().
51 */
52static inline void queue_lockdep_assert_held(struct request_queue *q)
53{
54 if (q->queue_lock)
55 lockdep_assert_held(q->queue_lock);
56}
57
58static inline void queue_flag_set_unlocked(unsigned int flag,
59 struct request_queue *q)
60{
61 if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
62 kref_read(&q->kobj.kref))
63 lockdep_assert_held(q->queue_lock);
64 __set_bit(flag, &q->queue_flags);
65}
66
67static inline void queue_flag_clear_unlocked(unsigned int flag,
68 struct request_queue *q)
69{
70 if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
71 kref_read(&q->kobj.kref))
72 lockdep_assert_held(q->queue_lock);
73 __clear_bit(flag, &q->queue_flags);
74}
75
76static inline int queue_flag_test_and_clear(unsigned int flag,
77 struct request_queue *q)
78{
79 queue_lockdep_assert_held(q);
80
81 if (test_bit(flag, &q->queue_flags)) {
82 __clear_bit(flag, &q->queue_flags);
83 return 1;
84 }
85
86 return 0;
87}
88
89static inline int queue_flag_test_and_set(unsigned int flag,
90 struct request_queue *q)
91{ 40{
92 queue_lockdep_assert_held(q); 41 return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq;
93
94 if (!test_bit(flag, &q->queue_flags)) {
95 __set_bit(flag, &q->queue_flags);
96 return 0;
97 }
98
99 return 1;
100}
101
102static inline void queue_flag_set(unsigned int flag, struct request_queue *q)
103{
104 queue_lockdep_assert_held(q);
105 __set_bit(flag, &q->queue_flags);
106}
107
108static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
109{
110 queue_lockdep_assert_held(q);
111 __clear_bit(flag, &q->queue_flags);
112}
113
114static inline struct blk_flush_queue *blk_get_flush_queue(
115 struct request_queue *q, struct blk_mq_ctx *ctx)
116{
117 if (q->mq_ops)
118 return blk_mq_map_queue(q, ctx->cpu)->fq;
119 return q->fq;
120} 42}
121 43
122static inline void __blk_get_queue(struct request_queue *q) 44static inline void __blk_get_queue(struct request_queue *q)
@@ -128,15 +50,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
128 int node, int cmd_size, gfp_t flags); 50 int node, int cmd_size, gfp_t flags);
129void blk_free_flush_queue(struct blk_flush_queue *q); 51void blk_free_flush_queue(struct blk_flush_queue *q);
130 52
131int blk_init_rl(struct request_list *rl, struct request_queue *q,
132 gfp_t gfp_mask);
133void blk_exit_rl(struct request_queue *q, struct request_list *rl);
134void blk_exit_queue(struct request_queue *q); 53void blk_exit_queue(struct request_queue *q);
135void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 54void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
136 struct bio *bio); 55 struct bio *bio);
137void blk_queue_bypass_start(struct request_queue *q);
138void blk_queue_bypass_end(struct request_queue *q);
139void __blk_queue_free_tags(struct request_queue *q);
140void blk_freeze_queue(struct request_queue *q); 56void blk_freeze_queue(struct request_queue *q);
141 57
142static inline void blk_queue_enter_live(struct request_queue *q) 58static inline void blk_queue_enter_live(struct request_queue *q)
@@ -169,7 +85,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
169static inline bool __bvec_gap_to_prev(struct request_queue *q, 85static inline bool __bvec_gap_to_prev(struct request_queue *q,
170 struct bio_vec *bprv, unsigned int offset) 86 struct bio_vec *bprv, unsigned int offset)
171{ 87{
172 return offset || 88 return (offset & queue_virt_boundary(q)) ||
173 ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); 89 ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
174} 90}
175 91
@@ -235,11 +151,8 @@ static inline bool bio_integrity_endio(struct bio *bio)
235} 151}
236#endif /* CONFIG_BLK_DEV_INTEGRITY */ 152#endif /* CONFIG_BLK_DEV_INTEGRITY */
237 153
238void blk_timeout_work(struct work_struct *work);
239unsigned long blk_rq_timeout(unsigned long timeout); 154unsigned long blk_rq_timeout(unsigned long timeout);
240void blk_add_timer(struct request *req); 155void blk_add_timer(struct request *req);
241void blk_delete_timer(struct request *);
242
243 156
244bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 157bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
245 struct bio *bio); 158 struct bio *bio);
@@ -248,58 +161,19 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
248bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, 161bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
249 struct bio *bio); 162 struct bio *bio);
250bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 163bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
251 unsigned int *request_count,
252 struct request **same_queue_rq); 164 struct request **same_queue_rq);
253unsigned int blk_plug_queued_count(struct request_queue *q);
254 165
255void blk_account_io_start(struct request *req, bool new_io); 166void blk_account_io_start(struct request *req, bool new_io);
256void blk_account_io_completion(struct request *req, unsigned int bytes); 167void blk_account_io_completion(struct request *req, unsigned int bytes);
257void blk_account_io_done(struct request *req, u64 now); 168void blk_account_io_done(struct request *req, u64 now);
258 169
259/* 170/*
260 * EH timer and IO completion will both attempt to 'grab' the request, make
261 * sure that only one of them succeeds. Steal the bottom bit of the
262 * __deadline field for this.
263 */
264static inline int blk_mark_rq_complete(struct request *rq)
265{
266 return test_and_set_bit(0, &rq->__deadline);
267}
268
269static inline void blk_clear_rq_complete(struct request *rq)
270{
271 clear_bit(0, &rq->__deadline);
272}
273
274static inline bool blk_rq_is_complete(struct request *rq)
275{
276 return test_bit(0, &rq->__deadline);
277}
278
279/*
280 * Internal elevator interface 171 * Internal elevator interface
281 */ 172 */
282#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) 173#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
283 174
284void blk_insert_flush(struct request *rq); 175void blk_insert_flush(struct request *rq);
285 176
286static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
287{
288 struct elevator_queue *e = q->elevator;
289
290 if (e->type->ops.sq.elevator_activate_req_fn)
291 e->type->ops.sq.elevator_activate_req_fn(q, rq);
292}
293
294static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
295{
296 struct elevator_queue *e = q->elevator;
297
298 if (e->type->ops.sq.elevator_deactivate_req_fn)
299 e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
300}
301
302int elevator_init(struct request_queue *);
303int elevator_init_mq(struct request_queue *q); 177int elevator_init_mq(struct request_queue *q);
304int elevator_switch_mq(struct request_queue *q, 178int elevator_switch_mq(struct request_queue *q,
305 struct elevator_type *new_e); 179 struct elevator_type *new_e);
@@ -334,31 +208,8 @@ void blk_rq_set_mixed_merge(struct request *rq);
334bool blk_rq_merge_ok(struct request *rq, struct bio *bio); 208bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
335enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); 209enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
336 210
337void blk_queue_congestion_threshold(struct request_queue *q);
338
339int blk_dev_init(void); 211int blk_dev_init(void);
340 212
341
342/*
343 * Return the threshold (number of used requests) at which the queue is
344 * considered to be congested. It include a little hysteresis to keep the
345 * context switch rate down.
346 */
347static inline int queue_congestion_on_threshold(struct request_queue *q)
348{
349 return q->nr_congestion_on;
350}
351
352/*
353 * The threshold at which a queue is considered to be uncongested
354 */
355static inline int queue_congestion_off_threshold(struct request_queue *q)
356{
357 return q->nr_congestion_off;
358}
359
360extern int blk_update_nr_requests(struct request_queue *, unsigned int);
361
362/* 213/*
363 * Contribute to IO statistics IFF: 214 * Contribute to IO statistics IFF:
364 * 215 *
@@ -381,18 +232,13 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
381} 232}
382 233
383/* 234/*
384 * Steal a bit from this field for legacy IO path atomic IO marking. Note that 235 * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
385 * setting the deadline clears the bottom bit, potentially clearing the 236 * is defined as 'unsigned int'; meanwhile it has to be aligned to the logical
386 * completed bit. The user has to be OK with this (current ones are fine). 237 * block size, which is the minimum unit accepted by hardware.
387 */ 238 */
388static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) 239static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
389{
390 rq->__deadline = time & ~0x1UL;
391}
392
393static inline unsigned long blk_rq_deadline(struct request *rq)
394{ 240{
395 return rq->__deadline & ~0x1UL; 241 return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
396} 242}
397 243
398/* 244/*
@@ -407,22 +253,6 @@ void ioc_clear_queue(struct request_queue *q);
407int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); 253int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
408 254
409/** 255/**
410 * rq_ioc - determine io_context for request allocation
411 * @bio: request being allocated is for this bio (can be %NULL)
412 *
413 * Determine io_context to use for request allocation for @bio. May return
414 * %NULL if %current->io_context doesn't exist.
415 */
416static inline struct io_context *rq_ioc(struct bio *bio)
417{
418#ifdef CONFIG_BLK_CGROUP
419 if (bio && bio->bi_ioc)
420 return bio->bi_ioc;
421#endif
422 return current->io_context;
423}
424
425/**
426 * create_io_context - try to create task->io_context 256 * create_io_context - try to create task->io_context
427 * @gfp_mask: allocation mask 257 * @gfp_mask: allocation mask
428 * @node: allocation node 258 * @node: allocation node
@@ -480,8 +310,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
480} 310}
481#endif /* CONFIG_BOUNCE */ 311#endif /* CONFIG_BOUNCE */
482 312
483extern void blk_drain_queue(struct request_queue *q);
484
485#ifdef CONFIG_BLK_CGROUP_IOLATENCY 313#ifdef CONFIG_BLK_CGROUP_IOLATENCY
486extern int blk_iolatency_init(struct request_queue *q); 314extern int blk_iolatency_init(struct request_queue *q);
487#else 315#else
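
The bio_allowed_max_sectors() helper added in the blk.h hunk above returns the largest multiple of the logical block size that still fits in bvec_iter.bi_size, expressed in 512-byte sectors. A quick worked example, assuming a 512 B logical block size (the block size is an assumption, not part of the hunk):

	bio_allowed_max_sectors(q)
		== round_down(UINT_MAX, 512) >> 9	/* 4294966784 >> 9 */
		== 8388607;				/* sectors, i.e. 4 GiB - 512 B of payload */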
diff --git a/block/bounce.c b/block/bounce.c
index 36869afc258c..ffb9e9ecfa7e 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -248,6 +248,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
248 return NULL; 248 return NULL;
249 bio->bi_disk = bio_src->bi_disk; 249 bio->bi_disk = bio_src->bi_disk;
250 bio->bi_opf = bio_src->bi_opf; 250 bio->bi_opf = bio_src->bi_opf;
251 bio->bi_ioprio = bio_src->bi_ioprio;
251 bio->bi_write_hint = bio_src->bi_write_hint; 252 bio->bi_write_hint = bio_src->bi_write_hint;
252 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; 253 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
253 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; 254 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
@@ -276,7 +277,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
276 } 277 }
277 } 278 }
278 279
279 bio_clone_blkcg_association(bio, bio_src); 280 bio_clone_blkg_association(bio, bio_src);
281 blkcg_bio_issue_init(bio);
280 282
281 return bio; 283 return bio;
282} 284}
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f3501cdaf1a6..192129856342 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -21,7 +21,7 @@
21 * 21 *
22 */ 22 */
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/blkdev.h> 24#include <linux/blk-mq.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/scatterlist.h> 26#include <linux/scatterlist.h>
27#include <linux/bsg-lib.h> 27#include <linux/bsg-lib.h>
@@ -31,6 +31,12 @@
31 31
32#define uptr64(val) ((void __user *)(uintptr_t)(val)) 32#define uptr64(val) ((void __user *)(uintptr_t)(val))
33 33
34struct bsg_set {
35 struct blk_mq_tag_set tag_set;
36 bsg_job_fn *job_fn;
37 bsg_timeout_fn *timeout_fn;
38};
39
34static int bsg_transport_check_proto(struct sg_io_v4 *hdr) 40static int bsg_transport_check_proto(struct sg_io_v4 *hdr)
35{ 41{
36 if (hdr->protocol != BSG_PROTOCOL_SCSI || 42 if (hdr->protocol != BSG_PROTOCOL_SCSI ||
@@ -129,7 +135,7 @@ static void bsg_teardown_job(struct kref *kref)
129 kfree(job->request_payload.sg_list); 135 kfree(job->request_payload.sg_list);
130 kfree(job->reply_payload.sg_list); 136 kfree(job->reply_payload.sg_list);
131 137
132 blk_end_request_all(rq, BLK_STS_OK); 138 blk_mq_end_request(rq, BLK_STS_OK);
133} 139}
134 140
135void bsg_job_put(struct bsg_job *job) 141void bsg_job_put(struct bsg_job *job)
@@ -157,15 +163,15 @@ void bsg_job_done(struct bsg_job *job, int result,
157{ 163{
158 job->result = result; 164 job->result = result;
159 job->reply_payload_rcv_len = reply_payload_rcv_len; 165 job->reply_payload_rcv_len = reply_payload_rcv_len;
160 blk_complete_request(blk_mq_rq_from_pdu(job)); 166 blk_mq_complete_request(blk_mq_rq_from_pdu(job));
161} 167}
162EXPORT_SYMBOL_GPL(bsg_job_done); 168EXPORT_SYMBOL_GPL(bsg_job_done);
163 169
164/** 170/**
165 * bsg_softirq_done - softirq done routine for destroying the bsg requests 171 * bsg_complete - softirq done routine for destroying the bsg requests
166 * @rq: BSG request that holds the job to be destroyed 172 * @rq: BSG request that holds the job to be destroyed
167 */ 173 */
168static void bsg_softirq_done(struct request *rq) 174static void bsg_complete(struct request *rq)
169{ 175{
170 struct bsg_job *job = blk_mq_rq_to_pdu(rq); 176 struct bsg_job *job = blk_mq_rq_to_pdu(rq);
171 177
@@ -224,54 +230,48 @@ failjob_rls_job:
224} 230}
225 231
226/** 232/**
227 * bsg_request_fn - generic handler for bsg requests 233 * bsg_queue_rq - generic handler for bsg requests
228 * @q: request queue to manage 234 * @hctx: hardware queue
235 * @bd: queue data
229 * 236 *
230 * On error the create_bsg_job function should return a -Exyz error value 237 * On error the create_bsg_job function should return a -Exyz error value
231 * that will be set to ->result. 238 * that will be set to ->result.
232 * 239 *
233 * Drivers/subsys should pass this to the queue init function. 240 * Drivers/subsys should pass this to the queue init function.
234 */ 241 */
235static void bsg_request_fn(struct request_queue *q) 242static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
236 __releases(q->queue_lock) 243 const struct blk_mq_queue_data *bd)
237 __acquires(q->queue_lock)
238{ 244{
245 struct request_queue *q = hctx->queue;
239 struct device *dev = q->queuedata; 246 struct device *dev = q->queuedata;
240 struct request *req; 247 struct request *req = bd->rq;
248 struct bsg_set *bset =
249 container_of(q->tag_set, struct bsg_set, tag_set);
241 int ret; 250 int ret;
242 251
252 blk_mq_start_request(req);
253
243 if (!get_device(dev)) 254 if (!get_device(dev))
244 return; 255 return BLK_STS_IOERR;
245 256
246 while (1) { 257 if (!bsg_prepare_job(dev, req))
247 req = blk_fetch_request(q); 258 return BLK_STS_IOERR;
248 if (!req) 259
249 break; 260 ret = bset->job_fn(blk_mq_rq_to_pdu(req));
250 spin_unlock_irq(q->queue_lock); 261 if (ret)
251 262 return BLK_STS_IOERR;
252 if (!bsg_prepare_job(dev, req)) {
253 blk_end_request_all(req, BLK_STS_OK);
254 spin_lock_irq(q->queue_lock);
255 continue;
256 }
257
258 ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req));
259 spin_lock_irq(q->queue_lock);
260 if (ret)
261 break;
262 }
263 263
264 spin_unlock_irq(q->queue_lock);
265 put_device(dev); 264 put_device(dev);
266 spin_lock_irq(q->queue_lock); 265 return BLK_STS_OK;
267} 266}
268 267
269/* called right after the request is allocated for the request_queue */ 268/* called right after the request is allocated for the request_queue */
270static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) 269static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
270 unsigned int hctx_idx, unsigned int numa_node)
271{ 271{
272 struct bsg_job *job = blk_mq_rq_to_pdu(req); 272 struct bsg_job *job = blk_mq_rq_to_pdu(req);
273 273
274 job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); 274 job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
275 if (!job->reply) 275 if (!job->reply)
276 return -ENOMEM; 276 return -ENOMEM;
277 return 0; 277 return 0;
@@ -289,13 +289,47 @@ static void bsg_initialize_rq(struct request *req)
289 job->dd_data = job + 1; 289 job->dd_data = job + 1;
290} 290}
291 291
292static void bsg_exit_rq(struct request_queue *q, struct request *req) 292static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
293 unsigned int hctx_idx)
293{ 294{
294 struct bsg_job *job = blk_mq_rq_to_pdu(req); 295 struct bsg_job *job = blk_mq_rq_to_pdu(req);
295 296
296 kfree(job->reply); 297 kfree(job->reply);
297} 298}
298 299
300void bsg_remove_queue(struct request_queue *q)
301{
302 if (q) {
303 struct bsg_set *bset =
304 container_of(q->tag_set, struct bsg_set, tag_set);
305
306 bsg_unregister_queue(q);
307 blk_cleanup_queue(q);
308 blk_mq_free_tag_set(&bset->tag_set);
309 kfree(bset);
310 }
311}
312EXPORT_SYMBOL_GPL(bsg_remove_queue);
313
314static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
315{
316 struct bsg_set *bset =
317 container_of(rq->q->tag_set, struct bsg_set, tag_set);
318
319 if (!bset->timeout_fn)
320 return BLK_EH_DONE;
321 return bset->timeout_fn(rq);
322}
323
324static const struct blk_mq_ops bsg_mq_ops = {
325 .queue_rq = bsg_queue_rq,
326 .init_request = bsg_init_rq,
327 .exit_request = bsg_exit_rq,
328 .initialize_rq_fn = bsg_initialize_rq,
329 .complete = bsg_complete,
330 .timeout = bsg_timeout,
331};
332
299/** 333/**
300 * bsg_setup_queue - Create and add the bsg hooks so we can receive requests 334 * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
301 * @dev: device to attach bsg device to 335 * @dev: device to attach bsg device to
@@ -304,28 +338,38 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
304 * @dd_job_size: size of LLD data needed for each job 338 * @dd_job_size: size of LLD data needed for each job
305 */ 339 */
306struct request_queue *bsg_setup_queue(struct device *dev, const char *name, 340struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
307 bsg_job_fn *job_fn, int dd_job_size) 341 bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size)
308{ 342{
343 struct bsg_set *bset;
344 struct blk_mq_tag_set *set;
309 struct request_queue *q; 345 struct request_queue *q;
310 int ret; 346 int ret = -ENOMEM;
311 347
312 q = blk_alloc_queue(GFP_KERNEL); 348 bset = kzalloc(sizeof(*bset), GFP_KERNEL);
313 if (!q) 349 if (!bset)
314 return ERR_PTR(-ENOMEM); 350 return ERR_PTR(-ENOMEM);
315 q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
316 q->init_rq_fn = bsg_init_rq;
317 q->exit_rq_fn = bsg_exit_rq;
318 q->initialize_rq_fn = bsg_initialize_rq;
319 q->request_fn = bsg_request_fn;
320 351
321 ret = blk_init_allocated_queue(q); 352 bset->job_fn = job_fn;
322 if (ret) 353 bset->timeout_fn = timeout;
323 goto out_cleanup_queue; 354
355 set = &bset->tag_set;
356 set->ops = &bsg_mq_ops,
357 set->nr_hw_queues = 1;
358 set->queue_depth = 128;
359 set->numa_node = NUMA_NO_NODE;
360 set->cmd_size = sizeof(struct bsg_job) + dd_job_size;
361 set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING;
362 if (blk_mq_alloc_tag_set(set))
363 goto out_tag_set;
364
365 q = blk_mq_init_queue(set);
366 if (IS_ERR(q)) {
367 ret = PTR_ERR(q);
368 goto out_queue;
369 }
324 370
325 q->queuedata = dev; 371 q->queuedata = dev;
326 q->bsg_job_fn = job_fn;
327 blk_queue_flag_set(QUEUE_FLAG_BIDI, q); 372 blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
328 blk_queue_softirq_done(q, bsg_softirq_done);
329 blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); 373 blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
330 374
331 ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); 375 ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
@@ -338,6 +382,10 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
338 return q; 382 return q;
339out_cleanup_queue: 383out_cleanup_queue:
340 blk_cleanup_queue(q); 384 blk_cleanup_queue(q);
385out_queue:
386 blk_mq_free_tag_set(set);
387out_tag_set:
388 kfree(bset);
341 return ERR_PTR(ret); 389 return ERR_PTR(ret);
342} 390}
343EXPORT_SYMBOL_GPL(bsg_setup_queue); 391EXPORT_SYMBOL_GPL(bsg_setup_queue);
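
After this conversion a bsg transport passes its timeout handler to bsg_setup_queue() along with the job function, and tears everything down through the new bsg_remove_queue(). A hedged caller-side sketch; my_job_fn, my_timeout_fn, my_lld_job_data and the "my_bsg" name are placeholders rather than names from this commit:

	q = bsg_setup_queue(dev, "my_bsg", my_job_fn, my_timeout_fn,
			    sizeof(struct my_lld_job_data));
	if (IS_ERR(q))
		return PTR_ERR(q);

	/* ... queue in service ... */

	bsg_remove_queue(q);	/* unregisters, cleans up the queue, frees the tag set */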
diff --git a/block/bsg.c b/block/bsg.c
index 9a442c23a715..44f6028b9567 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -471,7 +471,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
471 /* 471 /*
472 * we need a proper transport to send commands, not a stacked device 472 * we need a proper transport to send commands, not a stacked device
473 */ 473 */
474 if (!queue_is_rq_based(q)) 474 if (!queue_is_mq(q))
475 return 0; 475 return 0;
476 476
477 bcd = &q->bsg_dev; 477 bcd = &q->bsg_dev;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
deleted file mode 100644
index ed41aa978c4a..000000000000
--- a/block/cfq-iosched.c
+++ /dev/null
@@ -1,4916 +0,0 @@
1/*
2 * CFQ, or complete fairness queueing, disk scheduler.
3 *
4 * Based on ideas from a previously unfinished io
5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/sched/clock.h>
12#include <linux/blkdev.h>
13#include <linux/elevator.h>
14#include <linux/ktime.h>
15#include <linux/rbtree.h>
16#include <linux/ioprio.h>
17#include <linux/blktrace_api.h>
18#include <linux/blk-cgroup.h>
19#include "blk.h"
20#include "blk-wbt.h"
21
22/*
23 * tunables
24 */
25/* max queue in one round of service */
26static const int cfq_quantum = 8;
27static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
28/* maximum backwards seek, in KiB */
29static const int cfq_back_max = 16 * 1024;
30/* penalty of a backwards seek */
31static const int cfq_back_penalty = 2;
32static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
33static u64 cfq_slice_async = NSEC_PER_SEC / 25;
34static const int cfq_slice_async_rq = 2;
35static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
36static u64 cfq_group_idle = NSEC_PER_SEC / 125;
37static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
38static const int cfq_hist_divisor = 4;
39
40/*
41 * offset from end of queue service tree for idle class
42 */
43#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5)
44/* offset from end of group service tree under time slice mode */
45#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
46/* offset from end of group service under IOPS mode */
47#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
48
49/*
50 * below this threshold, we consider thinktime immediate
51 */
52#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ)
53
54#define CFQ_SLICE_SCALE (5)
55#define CFQ_HW_QUEUE_MIN (5)
56#define CFQ_SERVICE_SHIFT 12
57
58#define CFQQ_SEEK_THR (sector_t)(8 * 100)
59#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
60#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
61#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
62
63#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
64#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
65#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
66
67static struct kmem_cache *cfq_pool;
68
69#define CFQ_PRIO_LISTS IOPRIO_BE_NR
70#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
71#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
72
73#define sample_valid(samples) ((samples) > 80)
74#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
75
76/* blkio-related constants */
77#define CFQ_WEIGHT_LEGACY_MIN 10
78#define CFQ_WEIGHT_LEGACY_DFL 500
79#define CFQ_WEIGHT_LEGACY_MAX 1000
80
81struct cfq_ttime {
82 u64 last_end_request;
83
84 u64 ttime_total;
85 u64 ttime_mean;
86 unsigned long ttime_samples;
87};
88
89/*
90 * Most of our rbtree usage is for sorting with min extraction, so
91 * if we cache the leftmost node we don't have to walk down the tree
 92 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
93 * move this into the elevator for the rq sorting as well.
94 */
95struct cfq_rb_root {
96 struct rb_root_cached rb;
97 struct rb_node *rb_rightmost;
98 unsigned count;
99 u64 min_vdisktime;
100 struct cfq_ttime ttime;
101};
102#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
103 .rb_rightmost = NULL, \
104 .ttime = {.last_end_request = ktime_get_ns(),},}
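A minimal sketch of the cached-leftmost idea described in the comment above, assuming only the stock <linux/rbtree.h> helpers (the example_* names are hypothetical): rb_first_cached() returns the cached leftmost node in O(1), and the *_cached insert/erase variants keep that cache current.

	#include <linux/rbtree.h>
	#include <linux/types.h>

	struct example_node {
		struct rb_node rb;
		u64 key;
	};

	/* Min extraction without walking down the tree: the root caches the
	 * leftmost node, so this lookup is O(1). */
	static struct example_node *example_peek_min(struct rb_root_cached *root)
	{
		struct rb_node *n = rb_first_cached(root);

		return n ? rb_entry(n, struct example_node, rb) : NULL;
	}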
105
106/*
107 * Per process-grouping structure
108 */
109struct cfq_queue {
110 /* reference count */
111 int ref;
112 /* various state flags, see below */
113 unsigned int flags;
114 /* parent cfq_data */
115 struct cfq_data *cfqd;
116 /* service_tree member */
117 struct rb_node rb_node;
118 /* service_tree key */
119 u64 rb_key;
120 /* prio tree member */
121 struct rb_node p_node;
122 /* prio tree root we belong to, if any */
123 struct rb_root *p_root;
124 /* sorted list of pending requests */
125 struct rb_root sort_list;
126 /* if fifo isn't expired, next request to serve */
127 struct request *next_rq;
128 /* requests queued in sort_list */
129 int queued[2];
130 /* currently allocated requests */
131 int allocated[2];
132 /* fifo list of requests in sort_list */
133 struct list_head fifo;
134
135 /* time when queue got scheduled in to dispatch first request. */
136 u64 dispatch_start;
137 u64 allocated_slice;
138 u64 slice_dispatch;
139 /* time when first request from queue completed and slice started. */
140 u64 slice_start;
141 u64 slice_end;
142 s64 slice_resid;
143
144 /* pending priority requests */
145 int prio_pending;
146 /* number of requests that are on the dispatch list or inside driver */
147 int dispatched;
148
149 /* io prio of this group */
150 unsigned short ioprio, org_ioprio;
151 unsigned short ioprio_class, org_ioprio_class;
152
153 pid_t pid;
154
155 u32 seek_history;
156 sector_t last_request_pos;
157
158 struct cfq_rb_root *service_tree;
159 struct cfq_queue *new_cfqq;
160 struct cfq_group *cfqg;
161 /* Number of sectors dispatched from queue in single dispatch round */
162 unsigned long nr_sectors;
163};
164
165/*
166 * First index in the service_trees.
167 * IDLE is handled separately, so it has negative index
168 */
169enum wl_class_t {
170 BE_WORKLOAD = 0,
171 RT_WORKLOAD = 1,
172 IDLE_WORKLOAD = 2,
173 CFQ_PRIO_NR,
174};
175
176/*
177 * Second index in the service_trees.
178 */
179enum wl_type_t {
180 ASYNC_WORKLOAD = 0,
181 SYNC_NOIDLE_WORKLOAD = 1,
182 SYNC_WORKLOAD = 2
183};
184
185struct cfqg_stats {
186#ifdef CONFIG_CFQ_GROUP_IOSCHED
187 /* number of ios merged */
188 struct blkg_rwstat merged;
189 /* total time spent on device in ns, may not be accurate w/ queueing */
190 struct blkg_rwstat service_time;
191 /* total time spent waiting in scheduler queue in ns */
192 struct blkg_rwstat wait_time;
193 /* number of IOs queued up */
194 struct blkg_rwstat queued;
195 /* total disk time and nr sectors dispatched by this group */
196 struct blkg_stat time;
197#ifdef CONFIG_DEBUG_BLK_CGROUP
198 /* time not charged to this cgroup */
199 struct blkg_stat unaccounted_time;
200 /* sum of number of ios queued across all samples */
201 struct blkg_stat avg_queue_size_sum;
202 /* count of samples taken for average */
203 struct blkg_stat avg_queue_size_samples;
204 /* how many times this group has been removed from service tree */
205 struct blkg_stat dequeue;
206 /* total time spent waiting for it to be assigned a timeslice. */
207 struct blkg_stat group_wait_time;
208 /* time spent idling for this blkcg_gq */
209 struct blkg_stat idle_time;
210 /* total time with empty current active q with other requests queued */
211 struct blkg_stat empty_time;
212 /* fields after this shouldn't be cleared on stat reset */
213 u64 start_group_wait_time;
214 u64 start_idle_time;
215 u64 start_empty_time;
216 uint16_t flags;
217#endif /* CONFIG_DEBUG_BLK_CGROUP */
218#endif /* CONFIG_CFQ_GROUP_IOSCHED */
219};
220
221/* Per-cgroup data */
222struct cfq_group_data {
223 /* must be the first member */
224 struct blkcg_policy_data cpd;
225
226 unsigned int weight;
227 unsigned int leaf_weight;
228};
229
230/* This is per cgroup per device grouping structure */
231struct cfq_group {
232 /* must be the first member */
233 struct blkg_policy_data pd;
234
235 /* group service_tree member */
236 struct rb_node rb_node;
237
238 /* group service_tree key */
239 u64 vdisktime;
240
241 /*
242 * The number of active cfqgs and sum of their weights under this
243 * cfqg. This covers this cfqg's leaf_weight and all children's
244 * weights, but does not cover weights of further descendants.
245 *
246 * If a cfqg is on the service tree, it's active. An active cfqg
247 * also activates its parent and contributes to the children_weight
248 * of the parent.
249 */
250 int nr_active;
251 unsigned int children_weight;
252
253 /*
254 * vfraction is the fraction of vdisktime that the tasks in this
255 * cfqg are entitled to. This is determined by compounding the
256 * ratios walking up from this cfqg to the root.
257 *
258 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
259 * vfractions on a service tree is approximately 1. The sum may
260 * deviate a bit due to rounding errors and fluctuations caused by
261 * cfqgs entering and leaving the service tree.
262 */
263 unsigned int vfraction;
264
265 /*
266 * There are two weights - (internal) weight is the weight of this
 267 * cfqg against the sibling cfqgs. leaf_weight is the weight of
268 * this cfqg against the child cfqgs. For the root cfqg, both
269 * weights are kept in sync for backward compatibility.
270 */
271 unsigned int weight;
272 unsigned int new_weight;
273 unsigned int dev_weight;
274
275 unsigned int leaf_weight;
276 unsigned int new_leaf_weight;
277 unsigned int dev_leaf_weight;
278
279 /* number of cfqq currently on this group */
280 int nr_cfqq;
281
282 /*
283 * Per group busy queues average. Useful for workload slice calc. We
284 * create the array for each prio class but at run time it is used
 285 * only for the RT and BE classes; the slot for the IDLE class is unused.
286 * This is primarily done to avoid confusion and a gcc warning.
287 */
288 unsigned int busy_queues_avg[CFQ_PRIO_NR];
289 /*
290 * rr lists of queues with requests. We maintain service trees for
 291 * RT and BE classes. These trees are subdivided into subclasses
292 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
293 * class there is no subclassification and all the cfq queues go on
294 * a single tree service_tree_idle.
295 * Counts are embedded in the cfq_rb_root
296 */
297 struct cfq_rb_root service_trees[2][3];
298 struct cfq_rb_root service_tree_idle;
299
300 u64 saved_wl_slice;
301 enum wl_type_t saved_wl_type;
302 enum wl_class_t saved_wl_class;
303
304 /* number of requests that are on the dispatch list or inside driver */
305 int dispatched;
306 struct cfq_ttime ttime;
307 struct cfqg_stats stats; /* stats for this cfqg */
308
309 /* async queue for each priority case */
310 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
311 struct cfq_queue *async_idle_cfqq;
312
313};
314
315struct cfq_io_cq {
316 struct io_cq icq; /* must be the first member */
317 struct cfq_queue *cfqq[2];
318 struct cfq_ttime ttime;
319 int ioprio; /* the current ioprio */
320#ifdef CONFIG_CFQ_GROUP_IOSCHED
321 uint64_t blkcg_serial_nr; /* the current blkcg serial */
322#endif
323};
324
325/*
326 * Per block device queue structure
327 */
328struct cfq_data {
329 struct request_queue *queue;
330 /* Root service tree for cfq_groups */
331 struct cfq_rb_root grp_service_tree;
332 struct cfq_group *root_group;
333
334 /*
335 * The priority currently being served
336 */
337 enum wl_class_t serving_wl_class;
338 enum wl_type_t serving_wl_type;
339 u64 workload_expires;
340 struct cfq_group *serving_group;
341
342 /*
343 * Each priority tree is sorted by next_request position. These
344 * trees are used when determining if two or more queues are
345 * interleaving requests (see cfq_close_cooperator).
346 */
347 struct rb_root prio_trees[CFQ_PRIO_LISTS];
348
349 unsigned int busy_queues;
350 unsigned int busy_sync_queues;
351
352 int rq_in_driver;
353 int rq_in_flight[2];
354
355 /*
356 * queue-depth detection
357 */
358 int rq_queued;
359 int hw_tag;
360 /*
361 * hw_tag can be
 362 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
363 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
364 * 0 => no NCQ
365 */
366 int hw_tag_est_depth;
367 unsigned int hw_tag_samples;
368
369 /*
370 * idle window management
371 */
372 struct hrtimer idle_slice_timer;
373 struct work_struct unplug_work;
374
375 struct cfq_queue *active_queue;
376 struct cfq_io_cq *active_cic;
377
378 sector_t last_position;
379
380 /*
381 * tunables, see top of file
382 */
383 unsigned int cfq_quantum;
384 unsigned int cfq_back_penalty;
385 unsigned int cfq_back_max;
386 unsigned int cfq_slice_async_rq;
387 unsigned int cfq_latency;
388 u64 cfq_fifo_expire[2];
389 u64 cfq_slice[2];
390 u64 cfq_slice_idle;
391 u64 cfq_group_idle;
392 u64 cfq_target_latency;
393
394 /*
395 * Fallback dummy cfqq for extreme OOM conditions
396 */
397 struct cfq_queue oom_cfqq;
398
399 u64 last_delayed_sync;
400};
401
402static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
403static void cfq_put_queue(struct cfq_queue *cfqq);
404
405static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
406 enum wl_class_t class,
407 enum wl_type_t type)
408{
409 if (!cfqg)
410 return NULL;
411
412 if (class == IDLE_WORKLOAD)
413 return &cfqg->service_tree_idle;
414
415 return &cfqg->service_trees[class][type];
416}
417
418enum cfqq_state_flags {
419 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
420 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
421 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
422 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
423 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
424 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
425 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
426 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
427 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
428 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
429 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
430 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
431 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
432};
433
434#define CFQ_CFQQ_FNS(name) \
435static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
436{ \
437 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
438} \
439static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
440{ \
441 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
442} \
443static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
444{ \
445 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
446}
447
448CFQ_CFQQ_FNS(on_rr);
449CFQ_CFQQ_FNS(wait_request);
450CFQ_CFQQ_FNS(must_dispatch);
451CFQ_CFQQ_FNS(must_alloc_slice);
452CFQ_CFQQ_FNS(fifo_expire);
453CFQ_CFQQ_FNS(idle_window);
454CFQ_CFQQ_FNS(prio_changed);
455CFQ_CFQQ_FNS(slice_new);
456CFQ_CFQQ_FNS(sync);
457CFQ_CFQQ_FNS(coop);
458CFQ_CFQQ_FNS(split_coop);
459CFQ_CFQQ_FNS(deep);
460CFQ_CFQQ_FNS(wait_busy);
461#undef CFQ_CFQQ_FNS
462
463#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
464
465/* cfqg stats flags */
466enum cfqg_stats_flags {
467 CFQG_stats_waiting = 0,
468 CFQG_stats_idling,
469 CFQG_stats_empty,
470};
471
472#define CFQG_FLAG_FNS(name) \
473static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
474{ \
475 stats->flags |= (1 << CFQG_stats_##name); \
476} \
477static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
478{ \
479 stats->flags &= ~(1 << CFQG_stats_##name); \
480} \
481static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
482{ \
483 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
484} \
485
486CFQG_FLAG_FNS(waiting)
487CFQG_FLAG_FNS(idling)
488CFQG_FLAG_FNS(empty)
489#undef CFQG_FLAG_FNS
490
491/* This should be called with the queue_lock held. */
492static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
493{
494 u64 now;
495
496 if (!cfqg_stats_waiting(stats))
497 return;
498
499 now = ktime_get_ns();
500 if (now > stats->start_group_wait_time)
501 blkg_stat_add(&stats->group_wait_time,
502 now - stats->start_group_wait_time);
503 cfqg_stats_clear_waiting(stats);
504}
505
506/* This should be called with the queue_lock held. */
507static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
508 struct cfq_group *curr_cfqg)
509{
510 struct cfqg_stats *stats = &cfqg->stats;
511
512 if (cfqg_stats_waiting(stats))
513 return;
514 if (cfqg == curr_cfqg)
515 return;
516 stats->start_group_wait_time = ktime_get_ns();
517 cfqg_stats_mark_waiting(stats);
518}
519
520/* This should be called with the queue_lock held. */
521static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
522{
523 u64 now;
524
525 if (!cfqg_stats_empty(stats))
526 return;
527
528 now = ktime_get_ns();
529 if (now > stats->start_empty_time)
530 blkg_stat_add(&stats->empty_time,
531 now - stats->start_empty_time);
532 cfqg_stats_clear_empty(stats);
533}
534
535static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
536{
537 blkg_stat_add(&cfqg->stats.dequeue, 1);
538}
539
540static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
541{
542 struct cfqg_stats *stats = &cfqg->stats;
543
544 if (blkg_rwstat_total(&stats->queued))
545 return;
546
547 /*
548 * group is already marked empty. This can happen if cfqq got new
549 * request in parent group and moved to this group while being added
550 * to service tree. Just ignore the event and move on.
551 */
552 if (cfqg_stats_empty(stats))
553 return;
554
555 stats->start_empty_time = ktime_get_ns();
556 cfqg_stats_mark_empty(stats);
557}
558
559static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
560{
561 struct cfqg_stats *stats = &cfqg->stats;
562
563 if (cfqg_stats_idling(stats)) {
564 u64 now = ktime_get_ns();
565
566 if (now > stats->start_idle_time)
567 blkg_stat_add(&stats->idle_time,
568 now - stats->start_idle_time);
569 cfqg_stats_clear_idling(stats);
570 }
571}
572
573static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
574{
575 struct cfqg_stats *stats = &cfqg->stats;
576
577 BUG_ON(cfqg_stats_idling(stats));
578
579 stats->start_idle_time = ktime_get_ns();
580 cfqg_stats_mark_idling(stats);
581}
582
583static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
584{
585 struct cfqg_stats *stats = &cfqg->stats;
586
587 blkg_stat_add(&stats->avg_queue_size_sum,
588 blkg_rwstat_total(&stats->queued));
589 blkg_stat_add(&stats->avg_queue_size_samples, 1);
590 cfqg_stats_update_group_wait_time(stats);
591}
592
593#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
594
595static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
596static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
597static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
598static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
599static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
600static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
601static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
602
603#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
604
605#ifdef CONFIG_CFQ_GROUP_IOSCHED
606
607static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
608{
609 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
610}
611
612static struct cfq_group_data
613*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
614{
615 return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
616}
617
618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
619{
620 return pd_to_blkg(&cfqg->pd);
621}
622
623static struct blkcg_policy blkcg_policy_cfq;
624
625static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
626{
627 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
628}
629
630static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
631{
632 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
633}
634
635static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
636{
637 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
638
639 return pblkg ? blkg_to_cfqg(pblkg) : NULL;
640}
641
642static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
643 struct cfq_group *ancestor)
644{
645 return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
646 cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
647}
648
649static inline void cfqg_get(struct cfq_group *cfqg)
650{
651 return blkg_get(cfqg_to_blkg(cfqg));
652}
653
654static inline void cfqg_put(struct cfq_group *cfqg)
655{
656 return blkg_put(cfqg_to_blkg(cfqg));
657}
658
659#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
660 blk_add_cgroup_trace_msg((cfqd)->queue, \
661 cfqg_to_blkg((cfqq)->cfqg)->blkcg, \
662 "cfq%d%c%c " fmt, (cfqq)->pid, \
663 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
664 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
665 ##args); \
666} while (0)
667
668#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
669 blk_add_cgroup_trace_msg((cfqd)->queue, \
670 cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \
671} while (0)
672
673static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
674 struct cfq_group *curr_cfqg,
675 unsigned int op)
676{
677 blkg_rwstat_add(&cfqg->stats.queued, op, 1);
678 cfqg_stats_end_empty_time(&cfqg->stats);
679 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
680}
681
682static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
683 uint64_t time, unsigned long unaccounted_time)
684{
685 blkg_stat_add(&cfqg->stats.time, time);
686#ifdef CONFIG_DEBUG_BLK_CGROUP
687 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
688#endif
689}
690
691static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
692 unsigned int op)
693{
694 blkg_rwstat_add(&cfqg->stats.queued, op, -1);
695}
696
697static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
698 unsigned int op)
699{
700 blkg_rwstat_add(&cfqg->stats.merged, op, 1);
701}
702
703static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
704 u64 start_time_ns,
705 u64 io_start_time_ns,
706 unsigned int op)
707{
708 struct cfqg_stats *stats = &cfqg->stats;
709 u64 now = ktime_get_ns();
710
711 if (now > io_start_time_ns)
712 blkg_rwstat_add(&stats->service_time, op,
713 now - io_start_time_ns);
714 if (io_start_time_ns > start_time_ns)
715 blkg_rwstat_add(&stats->wait_time, op,
716 io_start_time_ns - start_time_ns);
717}
718
719/* @stats = 0 */
720static void cfqg_stats_reset(struct cfqg_stats *stats)
721{
722 /* queued stats shouldn't be cleared */
723 blkg_rwstat_reset(&stats->merged);
724 blkg_rwstat_reset(&stats->service_time);
725 blkg_rwstat_reset(&stats->wait_time);
726 blkg_stat_reset(&stats->time);
727#ifdef CONFIG_DEBUG_BLK_CGROUP
728 blkg_stat_reset(&stats->unaccounted_time);
729 blkg_stat_reset(&stats->avg_queue_size_sum);
730 blkg_stat_reset(&stats->avg_queue_size_samples);
731 blkg_stat_reset(&stats->dequeue);
732 blkg_stat_reset(&stats->group_wait_time);
733 blkg_stat_reset(&stats->idle_time);
734 blkg_stat_reset(&stats->empty_time);
735#endif
736}
737
738/* @to += @from */
739static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
740{
741 /* queued stats shouldn't be cleared */
742 blkg_rwstat_add_aux(&to->merged, &from->merged);
743 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
744 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
 745 blkg_stat_add_aux(&to->time, &from->time);
746#ifdef CONFIG_DEBUG_BLK_CGROUP
747 blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
748 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
749 blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
750 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
751 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
752 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
753 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
754#endif
755}
756
757/*
758 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
759 * recursive stats can still account for the amount used by this cfqg after
760 * it's gone.
761 */
762static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
763{
764 struct cfq_group *parent = cfqg_parent(cfqg);
765
766 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
767
768 if (unlikely(!parent))
769 return;
770
771 cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
772 cfqg_stats_reset(&cfqg->stats);
773}
774
775#else /* CONFIG_CFQ_GROUP_IOSCHED */
776
777static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
778static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
779 struct cfq_group *ancestor)
780{
781 return true;
782}
783static inline void cfqg_get(struct cfq_group *cfqg) { }
784static inline void cfqg_put(struct cfq_group *cfqg) { }
785
786#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
787 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
788 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
789 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
790 ##args)
791#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
792
793static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
794 struct cfq_group *curr_cfqg, unsigned int op) { }
795static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
796 uint64_t time, unsigned long unaccounted_time) { }
797static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
798 unsigned int op) { }
799static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
800 unsigned int op) { }
801static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
802 u64 start_time_ns,
803 u64 io_start_time_ns,
804 unsigned int op) { }
805
806#endif /* CONFIG_CFQ_GROUP_IOSCHED */
807
808#define cfq_log(cfqd, fmt, args...) \
809 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
810
811/* Traverses through cfq group service trees */
812#define for_each_cfqg_st(cfqg, i, j, st) \
813 for (i = 0; i <= IDLE_WORKLOAD; i++) \
814 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
815 : &cfqg->service_tree_idle; \
816 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
817 (i == IDLE_WORKLOAD && j == 0); \
818 j++, st = i < IDLE_WORKLOAD ? \
819 &cfqg->service_trees[i][j]: NULL) \
820
821static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
822 struct cfq_ttime *ttime, bool group_idle)
823{
824 u64 slice;
825 if (!sample_valid(ttime->ttime_samples))
826 return false;
827 if (group_idle)
828 slice = cfqd->cfq_group_idle;
829 else
830 slice = cfqd->cfq_slice_idle;
831 return ttime->ttime_mean > slice;
832}
833
834static inline bool iops_mode(struct cfq_data *cfqd)
835{
836 /*
 837 * If we are not idling on queues and the drive has NCQ, requests execute
 838 * in parallel and measuring time is not meaningful unless we drive
 839 * shallower queue depths, which itself becomes a performance bottleneck.
 840 * In such cases, switch to providing fairness in terms of the number of
 841 * IOs.
842 */
843 if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
844 return true;
845 else
846 return false;
847}
848
849static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
850{
851 if (cfq_class_idle(cfqq))
852 return IDLE_WORKLOAD;
853 if (cfq_class_rt(cfqq))
854 return RT_WORKLOAD;
855 return BE_WORKLOAD;
856}
857
858
859static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
860{
861 if (!cfq_cfqq_sync(cfqq))
862 return ASYNC_WORKLOAD;
863 if (!cfq_cfqq_idle_window(cfqq))
864 return SYNC_NOIDLE_WORKLOAD;
865 return SYNC_WORKLOAD;
866}
867
868static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
869 struct cfq_data *cfqd,
870 struct cfq_group *cfqg)
871{
872 if (wl_class == IDLE_WORKLOAD)
873 return cfqg->service_tree_idle.count;
874
875 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
876 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
877 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
878}
879
880static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
881 struct cfq_group *cfqg)
882{
883 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
884 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
885}
886
887static void cfq_dispatch_insert(struct request_queue *, struct request *);
888static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
889 struct cfq_io_cq *cic, struct bio *bio);
890
891static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
892{
893 /* cic->icq is the first member, %NULL will convert to %NULL */
894 return container_of(icq, struct cfq_io_cq, icq);
895}
896
897static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
898 struct io_context *ioc)
899{
900 if (ioc)
901 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
902 return NULL;
903}
904
905static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
906{
907 return cic->cfqq[is_sync];
908}
909
910static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
911 bool is_sync)
912{
913 cic->cfqq[is_sync] = cfqq;
914}
915
916static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
917{
918 return cic->icq.q->elevator->elevator_data;
919}
920
921/*
922 * scheduler run of queue, if there are requests pending and no one in the
923 * driver that will restart queueing
924 */
925static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
926{
927 if (cfqd->busy_queues) {
928 cfq_log(cfqd, "schedule dispatch");
929 kblockd_schedule_work(&cfqd->unplug_work);
930 }
931}
932
933/*
934 * Scale schedule slice based on io priority. Use the sync time slice only
935 * if a queue is marked sync and has sync io queued. A sync queue with async
936 * io only, should not get full sync slice length.
937 */
938static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
939 unsigned short prio)
940{
941 u64 base_slice = cfqd->cfq_slice[sync];
942 u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
943
944 WARN_ON(prio >= IOPRIO_BE_NR);
945
946 return base_slice + (slice * (4 - prio));
947}
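A worked example of the scaling above, as a standalone sketch in milliseconds for readability (the kernel computes in nanoseconds with div_u64): the default BE priority 4 keeps the base slice, each step towards priority 0 adds base_slice/CFQ_SLICE_SCALE, and each step towards 7 subtracts it.

	#include <stdio.h>

	#define CFQ_SLICE_SCALE	5

	/* slice = base + (base / CFQ_SLICE_SCALE) * (4 - prio), as above */
	static long long prio_slice(long long base_slice, int prio)
	{
		long long step = base_slice / CFQ_SLICE_SCALE;

		return base_slice + step * (4 - prio);
	}

	int main(void)
	{
		/* With a 100ms base sync slice: prio 0 -> 180ms, 4 -> 100ms, 7 -> 40ms */
		for (int prio = 0; prio < 8; prio++)
			printf("prio %d -> %lld ms\n", prio, prio_slice(100, prio));
		return 0;
	}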
948
949static inline u64
950cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
951{
952 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
953}
954
955/**
956 * cfqg_scale_charge - scale disk time charge according to cfqg weight
957 * @charge: disk time being charged
958 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
959 *
960 * Scale @charge according to @vfraction, which is in range (0, 1]. The
961 * scaling is inversely proportional.
962 *
963 * scaled = charge / vfraction
964 *
965 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
966 */
967static inline u64 cfqg_scale_charge(u64 charge,
968 unsigned int vfraction)
969{
970 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
971
972 /* charge / vfraction */
973 c <<= CFQ_SERVICE_SHIFT;
974 return div_u64(c, vfraction);
975}
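To make the fixed-point arithmetic concrete, here is a small standalone sketch of the same computation (the shift matches CFQ_SERVICE_SHIFT above; the example vfraction of one quarter is illustrative): a group entitled to 1/4 of the vtime has its charge scaled up by 4x, expressed in fixed point.

	#include <stdio.h>
	#include <stdint.h>

	#define CFQ_SERVICE_SHIFT	12

	/* scaled = charge / vfraction, result in CFQ_SERVICE_SHIFT fixed point */
	static uint64_t scale_charge(uint64_t charge, unsigned int vfraction)
	{
		uint64_t c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */

		c <<= CFQ_SERVICE_SHIFT;			/* pre-scale the quotient */
		return c / vfraction;
	}

	int main(void)
	{
		/* vfraction = (1 << 12) / 4 = 1024, i.e. the group owns a quarter.
		 * Charging 1000 units of disk time yields 4000 << 12: its vdisktime
		 * advances four times as fast as its real usage. */
		unsigned int vfraction = (1u << CFQ_SERVICE_SHIFT) / 4;
		uint64_t scaled = scale_charge(1000, vfraction);

		printf("scaled = %llu (= %llu << %d)\n",
		       (unsigned long long)scaled,
		       (unsigned long long)(scaled >> CFQ_SERVICE_SHIFT),
		       CFQ_SERVICE_SHIFT);
		return 0;
	}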
976
977static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
978{
979 s64 delta = (s64)(vdisktime - min_vdisktime);
980 if (delta > 0)
981 min_vdisktime = vdisktime;
982
983 return min_vdisktime;
984}
985
986static void update_min_vdisktime(struct cfq_rb_root *st)
987{
988 if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
989 struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
990
991 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
992 cfqg->vdisktime);
993 }
994}
995
996/*
997 * get averaged number of queues of RT/BE priority.
 998 * average is updated with a formula that gives more weight to higher numbers,
 999 * so that it follows sudden increases quickly and decreases slowly
1000 */
1001
1002static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
1003 struct cfq_group *cfqg, bool rt)
1004{
1005 unsigned min_q, max_q;
1006 unsigned mult = cfq_hist_divisor - 1;
1007 unsigned round = cfq_hist_divisor / 2;
1008 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1009
1010 min_q = min(cfqg->busy_queues_avg[rt], busy);
1011 max_q = max(cfqg->busy_queues_avg[rt], busy);
1012 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1013 cfq_hist_divisor;
1014 return cfqg->busy_queues_avg[rt];
1015}
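A standalone worked example of one update step of this average, showing the intended asymmetry (values are illustrative):

	#include <stdio.h>

	#define CFQ_HIST_DIVISOR	4

	/* One update step of the busy-queues average computed above. */
	static unsigned update_avg(unsigned avg, unsigned busy)
	{
		unsigned mult = CFQ_HIST_DIVISOR - 1;
		unsigned round = CFQ_HIST_DIVISOR / 2;
		unsigned min_q = avg < busy ? avg : busy;
		unsigned max_q = avg > busy ? avg : busy;

		return (mult * max_q + min_q + round) / CFQ_HIST_DIVISOR;
	}

	int main(void)
	{
		unsigned avg = 2;

		avg = update_avg(avg, 10);	/* burst: 2 -> 8, jumps up quickly */
		printf("after burst: %u\n", avg);
		avg = update_avg(avg, 0);	/* idle:  8 -> 6, decays slowly    */
		printf("after idle:  %u\n", avg);
		return 0;
	}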
1016
1017static inline u64
1018cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1019{
1020 return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1021}
1022
1023static inline u64
1024cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1025{
1026 u64 slice = cfq_prio_to_slice(cfqd, cfqq);
1027 if (cfqd->cfq_latency) {
1028 /*
1029 * interested queues (we consider only the ones with the same
1030 * priority class in the cfq group)
1031 */
1032 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1033 cfq_class_rt(cfqq));
1034 u64 sync_slice = cfqd->cfq_slice[1];
1035 u64 expect_latency = sync_slice * iq;
1036 u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1037
1038 if (expect_latency > group_slice) {
1039 u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1040 u64 low_slice;
1041
1042 /* scale low_slice according to IO priority
1043 * and sync vs async */
1044 low_slice = div64_u64(base_low_slice*slice, sync_slice);
1045 low_slice = min(slice, low_slice);
1046 /* the adapted slice value is scaled to fit all iqs
1047 * into the target latency */
1048 slice = div64_u64(slice*group_slice, expect_latency);
1049 slice = max(slice, low_slice);
1050 }
1051 }
1052 return slice;
1053}
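A standalone sketch of the latency scaling above, with illustrative numbers in milliseconds (the kernel works in nanoseconds with div64_u64): five same-class queues each wanting a 100ms sync slice, inside a group entitled to only 300ms of the target latency window, each end up with a 60ms slice, still above the low-slice floor.

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t scaled_slice(uint64_t slice, uint64_t sync_slice,
				     uint64_t slice_idle, uint64_t group_slice,
				     unsigned iq)
	{
		uint64_t expect_latency = sync_slice * iq;

		if (expect_latency > group_slice) {
			uint64_t base_low_slice = 2 * slice_idle;
			uint64_t low_slice = base_low_slice * slice / sync_slice;

			low_slice = low_slice < slice ? low_slice : slice;
			/* shrink so all iq queues fit in the group's share */
			slice = slice * group_slice / expect_latency;
			slice = slice > low_slice ? slice : low_slice;
		}
		return slice;
	}

	int main(void)
	{
		/* slice 100ms, sync slice 100ms, slice_idle 8ms, group 300ms, 5 queues */
		printf("slice = %llu ms\n",
		       (unsigned long long)scaled_slice(100, 100, 8, 300, 5));
		return 0;
	}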
1054
1055static inline void
1056cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1057{
1058 u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1059 u64 now = ktime_get_ns();
1060
1061 cfqq->slice_start = now;
1062 cfqq->slice_end = now + slice;
1063 cfqq->allocated_slice = slice;
1064 cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
1065}
1066
1067/*
1068 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1069 * isn't valid until the first request from the dispatch is activated
1070 * and the slice time set.
1071 */
1072static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1073{
1074 if (cfq_cfqq_slice_new(cfqq))
1075 return false;
1076 if (ktime_get_ns() < cfqq->slice_end)
1077 return false;
1078
1079 return true;
1080}
1081
1082/*
1083 * Lifted from AS - choose which of rq1 and rq2 is best served now.
1084 * We choose the request that is closest to the head right now. Distance
1085 * behind the head is penalized and only allowed to a certain extent.
1086 */
1087static struct request *
1088cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1089{
1090 sector_t s1, s2, d1 = 0, d2 = 0;
1091 unsigned long back_max;
1092#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1093#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1094 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1095
1096 if (rq1 == NULL || rq1 == rq2)
1097 return rq2;
1098 if (rq2 == NULL)
1099 return rq1;
1100
1101 if (rq_is_sync(rq1) != rq_is_sync(rq2))
1102 return rq_is_sync(rq1) ? rq1 : rq2;
1103
1104 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1105 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1106
1107 s1 = blk_rq_pos(rq1);
1108 s2 = blk_rq_pos(rq2);
1109
1110 /*
1111 * by definition, 1KiB is 2 sectors
1112 */
1113 back_max = cfqd->cfq_back_max * 2;
1114
1115 /*
1116 * Strict one way elevator _except_ in the case where we allow
1117 * short backward seeks which are biased as twice the cost of a
1118 * similar forward seek.
1119 */
1120 if (s1 >= last)
1121 d1 = s1 - last;
1122 else if (s1 + back_max >= last)
1123 d1 = (last - s1) * cfqd->cfq_back_penalty;
1124 else
1125 wrap |= CFQ_RQ1_WRAP;
1126
1127 if (s2 >= last)
1128 d2 = s2 - last;
1129 else if (s2 + back_max >= last)
1130 d2 = (last - s2) * cfqd->cfq_back_penalty;
1131 else
1132 wrap |= CFQ_RQ2_WRAP;
1133
1134 /* Found required data */
1135
1136 /*
1137 * By doing switch() on the bit mask "wrap" we avoid having to
1138 * check two variables for all permutations: --> faster!
1139 */
1140 switch (wrap) {
1141 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1142 if (d1 < d2)
1143 return rq1;
1144 else if (d2 < d1)
1145 return rq2;
1146 else {
1147 if (s1 >= s2)
1148 return rq1;
1149 else
1150 return rq2;
1151 }
1152
1153 case CFQ_RQ2_WRAP:
1154 return rq1;
1155 case CFQ_RQ1_WRAP:
1156 return rq2;
1157 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1158 default:
1159 /*
1160 * Since both rqs are wrapped,
1161 * start with the one that's further behind head
1162 * (--> only *one* back seek required),
1163 * since back seek takes more time than forward.
1164 */
1165 if (s1 <= s2)
1166 return rq1;
1167 else
1168 return rq2;
1169 }
1170}
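A standalone sketch of the distance metric used above, reduced to just the seek-cost part (it ignores the sync and REQ_PRIO preferences; the constants mirror cfq_back_max converted to sectors and cfq_back_penalty): a short backward seek can still win against a longer forward one, but only at twice the per-sector cost.

	#include <stdio.h>
	#include <stdint.h>

	/* Forward distance as-is; short backward seeks (within back_max) cost
	 * back_penalty times the distance; anything further is "wrapped". */
	static int64_t seek_cost(uint64_t pos, uint64_t last,
				 uint64_t back_max, uint64_t back_penalty)
	{
		if (pos >= last)
			return pos - last;
		if (pos + back_max >= last)
			return (last - pos) * back_penalty;
		return -1;			/* wrapped, effectively infinite */
	}

	int main(void)
	{
		uint64_t last = 1000;

		/* rq1 is 100 sectors ahead (cost 100); rq2 is 20 sectors behind,
		 * so it costs 20 * 2 = 40 and still wins the comparison. */
		printf("rq1 cost %lld, rq2 cost %lld\n",
		       (long long)seek_cost(1100, last, 16 * 1024 * 2, 2),
		       (long long)seek_cost(980, last, 16 * 1024 * 2, 2));
		return 0;
	}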
1171
1172static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1173{
1174 /* Service tree is empty */
1175 if (!root->count)
1176 return NULL;
1177
1178 return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
1179}
1180
1181static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1182{
1183 return rb_entry_cfqg(rb_first_cached(&root->rb));
1184}
1185
1186static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1187{
1188 if (root->rb_rightmost == n)
1189 root->rb_rightmost = rb_prev(n);
1190
1191 rb_erase_cached(n, &root->rb);
1192 RB_CLEAR_NODE(n);
1193
1194 --root->count;
1195}
1196
1197/*
1198 * would be nice to take fifo expire time into account as well
1199 */
1200static struct request *
1201cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1202 struct request *last)
1203{
1204 struct rb_node *rbnext = rb_next(&last->rb_node);
1205 struct rb_node *rbprev = rb_prev(&last->rb_node);
1206 struct request *next = NULL, *prev = NULL;
1207
1208 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1209
1210 if (rbprev)
1211 prev = rb_entry_rq(rbprev);
1212
1213 if (rbnext)
1214 next = rb_entry_rq(rbnext);
1215 else {
1216 rbnext = rb_first(&cfqq->sort_list);
1217 if (rbnext && rbnext != &last->rb_node)
1218 next = rb_entry_rq(rbnext);
1219 }
1220
1221 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1222}
1223
1224static u64 cfq_slice_offset(struct cfq_data *cfqd,
1225 struct cfq_queue *cfqq)
1226{
1227 /*
1228 * just an approximation, should be ok.
1229 */
1230 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1231 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1232}
1233
1234static inline s64
1235cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1236{
1237 return cfqg->vdisktime - st->min_vdisktime;
1238}
1239
1240static void
1241__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1242{
1243 struct rb_node **node = &st->rb.rb_root.rb_node;
1244 struct rb_node *parent = NULL;
1245 struct cfq_group *__cfqg;
1246 s64 key = cfqg_key(st, cfqg);
1247 bool leftmost = true, rightmost = true;
1248
1249 while (*node != NULL) {
1250 parent = *node;
1251 __cfqg = rb_entry_cfqg(parent);
1252
1253 if (key < cfqg_key(st, __cfqg)) {
1254 node = &parent->rb_left;
1255 rightmost = false;
1256 } else {
1257 node = &parent->rb_right;
1258 leftmost = false;
1259 }
1260 }
1261
1262 if (rightmost)
1263 st->rb_rightmost = &cfqg->rb_node;
1264
1265 rb_link_node(&cfqg->rb_node, parent, node);
1266 rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
1267}
1268
1269/*
1270 * This has to be called only on activation of cfqg
1271 */
1272static void
1273cfq_update_group_weight(struct cfq_group *cfqg)
1274{
1275 if (cfqg->new_weight) {
1276 cfqg->weight = cfqg->new_weight;
1277 cfqg->new_weight = 0;
1278 }
1279}
1280
1281static void
1282cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1283{
1284 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1285
1286 if (cfqg->new_leaf_weight) {
1287 cfqg->leaf_weight = cfqg->new_leaf_weight;
1288 cfqg->new_leaf_weight = 0;
1289 }
1290}
1291
1292static void
1293cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1294{
1295 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
1296 struct cfq_group *pos = cfqg;
1297 struct cfq_group *parent;
1298 bool propagate;
1299
1300 /* add to the service tree */
1301 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1302
1303 /*
1304 * Update leaf_weight. We cannot update weight at this point
1305 * because cfqg might already have been activated and is
1306 * contributing its current weight to the parent's children_weight.
1307 */
1308 cfq_update_group_leaf_weight(cfqg);
1309 __cfq_group_service_tree_add(st, cfqg);
1310
1311 /*
1312 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1313 * entitled to. vfraction is calculated by walking the tree
1314 * towards the root calculating the fraction it has at each level.
1315 * The compounded ratio is how much vfraction @cfqg owns.
1316 *
1317 * Start with the proportion tasks in this cfqg has against active
1318 * children cfqgs - its leaf_weight against children_weight.
1319 */
1320 propagate = !pos->nr_active++;
1321 pos->children_weight += pos->leaf_weight;
1322 vfr = vfr * pos->leaf_weight / pos->children_weight;
1323
1324 /*
1325 * Compound ->weight walking up the tree. Both activation and
1326 * vfraction calculation are done in the same loop. Propagation
1327 * stops once an already activated node is met. vfraction
1328 * calculation should always continue to the root.
1329 */
1330 while ((parent = cfqg_parent(pos))) {
1331 if (propagate) {
1332 cfq_update_group_weight(pos);
1333 propagate = !parent->nr_active++;
1334 parent->children_weight += pos->weight;
1335 }
1336 vfr = vfr * pos->weight / parent->children_weight;
1337 pos = parent;
1338 }
1339
1340 cfqg->vfraction = max_t(unsigned, vfr, 1);
1341}
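A standalone worked example of the vfraction compounding described above (the weights are illustrative): a cfqg owning half of its own level, under a parent that owns a quarter of its level, ends up with 1/8 of the device's vtime.

	#include <stdio.h>

	#define CFQ_SERVICE_SHIFT	12

	int main(void)
	{
		/* Start with 1 in fixed point and multiply by weight / children_weight
		 * at every level, bottom-up, as the loop above does. */
		unsigned vfr = 1 << CFQ_SERVICE_SHIFT;

		vfr = vfr * 500 / 1000;		/* leaf_weight vs. this cfqg's children_weight */
		vfr = vfr * 200 / 800;		/* this cfqg's weight vs. parent's children_weight */

		/* 1/2 * 1/4 = 1/8 of the device's vtime */
		printf("vfraction = %u/%u\n", vfr, 1 << CFQ_SERVICE_SHIFT);
		return 0;
	}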
1342
1343static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
1344{
1345 if (!iops_mode(cfqd))
1346 return CFQ_SLICE_MODE_GROUP_DELAY;
1347 else
1348 return CFQ_IOPS_MODE_GROUP_DELAY;
1349}
1350
1351static void
1352cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1353{
1354 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1355 struct cfq_group *__cfqg;
1356 struct rb_node *n;
1357
1358 cfqg->nr_cfqq++;
1359 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1360 return;
1361
1362 /*
1363 * Currently put the group at the end. Later implement something
1364 * so that groups get a smaller vtime based on their weights, so that
1365 * a group does not lose everything if it was not continuously backlogged.
1366 */
1367 n = st->rb_rightmost;
1368 if (n) {
1369 __cfqg = rb_entry_cfqg(n);
1370 cfqg->vdisktime = __cfqg->vdisktime +
1371 cfq_get_cfqg_vdisktime_delay(cfqd);
1372 } else
1373 cfqg->vdisktime = st->min_vdisktime;
1374 cfq_group_service_tree_add(st, cfqg);
1375}
1376
1377static void
1378cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1379{
1380 struct cfq_group *pos = cfqg;
1381 bool propagate;
1382
1383 /*
1384 * Undo activation from cfq_group_service_tree_add(). Deactivate
1385 * @cfqg and propagate deactivation upwards.
1386 */
1387 propagate = !--pos->nr_active;
1388 pos->children_weight -= pos->leaf_weight;
1389
1390 while (propagate) {
1391 struct cfq_group *parent = cfqg_parent(pos);
1392
1393 /* @pos has 0 nr_active at this point */
1394 WARN_ON_ONCE(pos->children_weight);
1395 pos->vfraction = 0;
1396
1397 if (!parent)
1398 break;
1399
1400 propagate = !--parent->nr_active;
1401 parent->children_weight -= pos->weight;
1402 pos = parent;
1403 }
1404
1405 /* remove from the service tree */
1406 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1407 cfq_rb_erase(&cfqg->rb_node, st);
1408}
1409
1410static void
1411cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1412{
1413 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1414
1415 BUG_ON(cfqg->nr_cfqq < 1);
1416 cfqg->nr_cfqq--;
1417
1418 /* If there are other cfq queues under this group, don't delete it */
1419 if (cfqg->nr_cfqq)
1420 return;
1421
1422 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1423 cfq_group_service_tree_del(st, cfqg);
1424 cfqg->saved_wl_slice = 0;
1425 cfqg_stats_update_dequeue(cfqg);
1426}
1427
1428static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1429 u64 *unaccounted_time)
1430{
1431 u64 slice_used;
1432 u64 now = ktime_get_ns();
1433
1434 /*
1435 * Queue got expired before even a single request completed or
1436 * got expired immediately after first request completion.
1437 */
1438 if (!cfqq->slice_start || cfqq->slice_start == now) {
1439 /*
1440 * Also charge the seek time incurred to the group, otherwise
1441 * if there are multiple queues in the group, each can dispatch
1442 * a single request on seeky media and cause lots of seek time
1443 * and group will never know it.
1444 */
1445 slice_used = max_t(u64, (now - cfqq->dispatch_start),
1446 jiffies_to_nsecs(1));
1447 } else {
1448 slice_used = now - cfqq->slice_start;
1449 if (slice_used > cfqq->allocated_slice) {
1450 *unaccounted_time = slice_used - cfqq->allocated_slice;
1451 slice_used = cfqq->allocated_slice;
1452 }
1453 if (cfqq->slice_start > cfqq->dispatch_start)
1454 *unaccounted_time += cfqq->slice_start -
1455 cfqq->dispatch_start;
1456 }
1457
1458 return slice_used;
1459}
1460
1461static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1462 struct cfq_queue *cfqq)
1463{
1464 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1465 u64 used_sl, charge, unaccounted_sl = 0;
1466 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1467 - cfqg->service_tree_idle.count;
1468 unsigned int vfr;
1469 u64 now = ktime_get_ns();
1470
1471 BUG_ON(nr_sync < 0);
1472 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1473
1474 if (iops_mode(cfqd))
1475 charge = cfqq->slice_dispatch;
1476 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1477 charge = cfqq->allocated_slice;
1478
1479 /*
1480 * Can't update vdisktime while on service tree and cfqg->vfraction
1481 * is valid only while on it. Cache vfr, leave the service tree,
1482 * update vdisktime and go back on. The re-addition to the tree
1483 * will also update the weights as necessary.
1484 */
1485 vfr = cfqg->vfraction;
1486 cfq_group_service_tree_del(st, cfqg);
1487 cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1488 cfq_group_service_tree_add(st, cfqg);
1489
1490 /* This group is being expired. Save the context */
1491 if (cfqd->workload_expires > now) {
1492 cfqg->saved_wl_slice = cfqd->workload_expires - now;
1493 cfqg->saved_wl_type = cfqd->serving_wl_type;
1494 cfqg->saved_wl_class = cfqd->serving_wl_class;
1495 } else
1496 cfqg->saved_wl_slice = 0;
1497
1498 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1499 st->min_vdisktime);
1500 cfq_log_cfqq(cfqq->cfqd, cfqq,
1501 "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
1502 used_sl, cfqq->slice_dispatch, charge,
1503 iops_mode(cfqd), cfqq->nr_sectors);
1504 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1505 cfqg_stats_set_start_empty_time(cfqg);
1506}
1507
1508/**
1509 * cfq_init_cfqg_base - initialize base part of a cfq_group
1510 * @cfqg: cfq_group to initialize
1511 *
1512 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1513 * is enabled or not.
1514 */
1515static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1516{
1517 struct cfq_rb_root *st;
1518 int i, j;
1519
1520 for_each_cfqg_st(cfqg, i, j, st)
1521 *st = CFQ_RB_ROOT;
1522 RB_CLEAR_NODE(&cfqg->rb_node);
1523
1524 cfqg->ttime.last_end_request = ktime_get_ns();
1525}
1526
1527#ifdef CONFIG_CFQ_GROUP_IOSCHED
1528static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1529 bool on_dfl, bool reset_dev, bool is_leaf_weight);
1530
1531static void cfqg_stats_exit(struct cfqg_stats *stats)
1532{
1533 blkg_rwstat_exit(&stats->merged);
1534 blkg_rwstat_exit(&stats->service_time);
1535 blkg_rwstat_exit(&stats->wait_time);
1536 blkg_rwstat_exit(&stats->queued);
1537 blkg_stat_exit(&stats->time);
1538#ifdef CONFIG_DEBUG_BLK_CGROUP
1539 blkg_stat_exit(&stats->unaccounted_time);
1540 blkg_stat_exit(&stats->avg_queue_size_sum);
1541 blkg_stat_exit(&stats->avg_queue_size_samples);
1542 blkg_stat_exit(&stats->dequeue);
1543 blkg_stat_exit(&stats->group_wait_time);
1544 blkg_stat_exit(&stats->idle_time);
1545 blkg_stat_exit(&stats->empty_time);
1546#endif
1547}
1548
1549static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1550{
1551 if (blkg_rwstat_init(&stats->merged, gfp) ||
1552 blkg_rwstat_init(&stats->service_time, gfp) ||
1553 blkg_rwstat_init(&stats->wait_time, gfp) ||
1554 blkg_rwstat_init(&stats->queued, gfp) ||
1555 blkg_stat_init(&stats->time, gfp))
1556 goto err;
1557
1558#ifdef CONFIG_DEBUG_BLK_CGROUP
1559 if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1560 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1561 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1562 blkg_stat_init(&stats->dequeue, gfp) ||
1563 blkg_stat_init(&stats->group_wait_time, gfp) ||
1564 blkg_stat_init(&stats->idle_time, gfp) ||
1565 blkg_stat_init(&stats->empty_time, gfp))
1566 goto err;
1567#endif
1568 return 0;
1569err:
1570 cfqg_stats_exit(stats);
1571 return -ENOMEM;
1572}
1573
1574static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1575{
1576 struct cfq_group_data *cgd;
1577
1578 cgd = kzalloc(sizeof(*cgd), gfp);
1579 if (!cgd)
1580 return NULL;
1581 return &cgd->cpd;
1582}
1583
1584static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1585{
1586 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1587 unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1588 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1589
1590 if (cpd_to_blkcg(cpd) == &blkcg_root)
1591 weight *= 2;
1592
1593 cgd->weight = weight;
1594 cgd->leaf_weight = weight;
1595}
1596
1597static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1598{
1599 kfree(cpd_to_cfqgd(cpd));
1600}
1601
1602static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1603{
1604 struct blkcg *blkcg = cpd_to_blkcg(cpd);
1605 bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1606 unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1607
1608 if (blkcg == &blkcg_root)
1609 weight *= 2;
1610
1611 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1612 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1613}
1614
1615static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1616{
1617 struct cfq_group *cfqg;
1618
1619 cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1620 if (!cfqg)
1621 return NULL;
1622
1623 cfq_init_cfqg_base(cfqg);
1624 if (cfqg_stats_init(&cfqg->stats, gfp)) {
1625 kfree(cfqg);
1626 return NULL;
1627 }
1628
1629 return &cfqg->pd;
1630}
1631
1632static void cfq_pd_init(struct blkg_policy_data *pd)
1633{
1634 struct cfq_group *cfqg = pd_to_cfqg(pd);
1635 struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1636
1637 cfqg->weight = cgd->weight;
1638 cfqg->leaf_weight = cgd->leaf_weight;
1639}
1640
1641static void cfq_pd_offline(struct blkg_policy_data *pd)
1642{
1643 struct cfq_group *cfqg = pd_to_cfqg(pd);
1644 int i;
1645
1646 for (i = 0; i < IOPRIO_BE_NR; i++) {
1647 if (cfqg->async_cfqq[0][i]) {
1648 cfq_put_queue(cfqg->async_cfqq[0][i]);
1649 cfqg->async_cfqq[0][i] = NULL;
1650 }
1651 if (cfqg->async_cfqq[1][i]) {
1652 cfq_put_queue(cfqg->async_cfqq[1][i]);
1653 cfqg->async_cfqq[1][i] = NULL;
1654 }
1655 }
1656
1657 if (cfqg->async_idle_cfqq) {
1658 cfq_put_queue(cfqg->async_idle_cfqq);
1659 cfqg->async_idle_cfqq = NULL;
1660 }
1661
1662 /*
1663 * @blkg is going offline and will be ignored by
1664 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
1665 * that they don't get lost. If IOs complete after this point, the
1666 * stats for them will be lost. Oh well...
1667 */
1668 cfqg_stats_xfer_dead(cfqg);
1669}
1670
1671static void cfq_pd_free(struct blkg_policy_data *pd)
1672{
1673 struct cfq_group *cfqg = pd_to_cfqg(pd);
1674
1675 cfqg_stats_exit(&cfqg->stats);
1676 return kfree(cfqg);
1677}
1678
1679static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1680{
1681 struct cfq_group *cfqg = pd_to_cfqg(pd);
1682
1683 cfqg_stats_reset(&cfqg->stats);
1684}
1685
1686static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1687 struct blkcg *blkcg)
1688{
1689 struct blkcg_gq *blkg;
1690
1691 blkg = blkg_lookup(blkcg, cfqd->queue);
1692 if (likely(blkg))
1693 return blkg_to_cfqg(blkg);
1694 return NULL;
1695}
1696
1697static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1698{
1699 cfqq->cfqg = cfqg;
1700 /* cfqq reference on cfqg */
1701 cfqg_get(cfqg);
1702}
1703
1704static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1705 struct blkg_policy_data *pd, int off)
1706{
1707 struct cfq_group *cfqg = pd_to_cfqg(pd);
1708
1709 if (!cfqg->dev_weight)
1710 return 0;
1711 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1712}
1713
1714static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1715{
1716 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1717 cfqg_prfill_weight_device, &blkcg_policy_cfq,
1718 0, false);
1719 return 0;
1720}
1721
1722static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1723 struct blkg_policy_data *pd, int off)
1724{
1725 struct cfq_group *cfqg = pd_to_cfqg(pd);
1726
1727 if (!cfqg->dev_leaf_weight)
1728 return 0;
1729 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1730}
1731
1732static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1733{
1734 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1735 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1736 0, false);
1737 return 0;
1738}
1739
1740static int cfq_print_weight(struct seq_file *sf, void *v)
1741{
1742 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1743 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1744 unsigned int val = 0;
1745
1746 if (cgd)
1747 val = cgd->weight;
1748
1749 seq_printf(sf, "%u\n", val);
1750 return 0;
1751}
1752
1753static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1754{
1755 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1756 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1757 unsigned int val = 0;
1758
1759 if (cgd)
1760 val = cgd->leaf_weight;
1761
1762 seq_printf(sf, "%u\n", val);
1763 return 0;
1764}
1765
1766static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1767 char *buf, size_t nbytes, loff_t off,
1768 bool on_dfl, bool is_leaf_weight)
1769{
1770 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1771 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1772 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1773 struct blkg_conf_ctx ctx;
1774 struct cfq_group *cfqg;
1775 struct cfq_group_data *cfqgd;
1776 int ret;
1777 u64 v;
1778
1779 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1780 if (ret)
1781 return ret;
1782
1783 if (sscanf(ctx.body, "%llu", &v) == 1) {
1784 /* require "default" on dfl */
1785 ret = -ERANGE;
1786 if (!v && on_dfl)
1787 goto out_finish;
1788 } else if (!strcmp(strim(ctx.body), "default")) {
1789 v = 0;
1790 } else {
1791 ret = -EINVAL;
1792 goto out_finish;
1793 }
1794
1795 cfqg = blkg_to_cfqg(ctx.blkg);
1796 cfqgd = blkcg_to_cfqgd(blkcg);
1797
1798 ret = -ERANGE;
1799 if (!v || (v >= min && v <= max)) {
1800 if (!is_leaf_weight) {
1801 cfqg->dev_weight = v;
1802 cfqg->new_weight = v ?: cfqgd->weight;
1803 } else {
1804 cfqg->dev_leaf_weight = v;
1805 cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1806 }
1807 ret = 0;
1808 }
1809out_finish:
1810 blkg_conf_finish(&ctx);
1811 return ret ?: nbytes;
1812}
1813
1814static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1815 char *buf, size_t nbytes, loff_t off)
1816{
1817 return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1818}
1819
1820static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1821 char *buf, size_t nbytes, loff_t off)
1822{
1823 return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1824}
1825
1826static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1827 bool on_dfl, bool reset_dev, bool is_leaf_weight)
1828{
1829 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1830 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1831 struct blkcg *blkcg = css_to_blkcg(css);
1832 struct blkcg_gq *blkg;
1833 struct cfq_group_data *cfqgd;
1834 int ret = 0;
1835
1836 if (val < min || val > max)
1837 return -ERANGE;
1838
1839 spin_lock_irq(&blkcg->lock);
1840 cfqgd = blkcg_to_cfqgd(blkcg);
1841 if (!cfqgd) {
1842 ret = -EINVAL;
1843 goto out;
1844 }
1845
1846 if (!is_leaf_weight)
1847 cfqgd->weight = val;
1848 else
1849 cfqgd->leaf_weight = val;
1850
1851 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1852 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1853
1854 if (!cfqg)
1855 continue;
1856
1857 if (!is_leaf_weight) {
1858 if (reset_dev)
1859 cfqg->dev_weight = 0;
1860 if (!cfqg->dev_weight)
1861 cfqg->new_weight = cfqgd->weight;
1862 } else {
1863 if (reset_dev)
1864 cfqg->dev_leaf_weight = 0;
1865 if (!cfqg->dev_leaf_weight)
1866 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1867 }
1868 }
1869
1870out:
1871 spin_unlock_irq(&blkcg->lock);
1872 return ret;
1873}
1874
1875static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1876 u64 val)
1877{
1878 return __cfq_set_weight(css, val, false, false, false);
1879}
1880
1881static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1882 struct cftype *cft, u64 val)
1883{
1884 return __cfq_set_weight(css, val, false, false, true);
1885}
1886
1887static int cfqg_print_stat(struct seq_file *sf, void *v)
1888{
1889 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1890 &blkcg_policy_cfq, seq_cft(sf)->private, false);
1891 return 0;
1892}
1893
1894static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1895{
1896 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1897 &blkcg_policy_cfq, seq_cft(sf)->private, true);
1898 return 0;
1899}
1900
1901static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1902 struct blkg_policy_data *pd, int off)
1903{
1904 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1905 &blkcg_policy_cfq, off);
1906 return __blkg_prfill_u64(sf, pd, sum);
1907}
1908
1909static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1910 struct blkg_policy_data *pd, int off)
1911{
1912 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1913 &blkcg_policy_cfq, off);
1914 return __blkg_prfill_rwstat(sf, pd, &sum);
1915}
1916
1917static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1918{
1919 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1920 cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1921 seq_cft(sf)->private, false);
1922 return 0;
1923}
1924
1925static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1926{
1927 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1928 cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1929 seq_cft(sf)->private, true);
1930 return 0;
1931}
1932
1933static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1934 int off)
1935{
1936 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1937
1938 return __blkg_prfill_u64(sf, pd, sum >> 9);
1939}
1940
1941static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1942{
1943 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1944 cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1945 return 0;
1946}
1947
1948static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1949 struct blkg_policy_data *pd, int off)
1950{
1951 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1952 offsetof(struct blkcg_gq, stat_bytes));
1953 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1954 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1955
1956 return __blkg_prfill_u64(sf, pd, sum >> 9);
1957}
1958
1959static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1960{
1961 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1962 cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1963 false);
1964 return 0;
1965}
1966
1967#ifdef CONFIG_DEBUG_BLK_CGROUP
1968static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1969 struct blkg_policy_data *pd, int off)
1970{
1971 struct cfq_group *cfqg = pd_to_cfqg(pd);
1972 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1973 u64 v = 0;
1974
1975 if (samples) {
1976 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1977 v = div64_u64(v, samples);
1978 }
1979 __blkg_prfill_u64(sf, pd, v);
1980 return 0;
1981}
1982
1983/* print avg_queue_size */
1984static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1985{
1986 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1987 cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1988 0, false);
1989 return 0;
1990}
1991#endif /* CONFIG_DEBUG_BLK_CGROUP */
1992
1993static struct cftype cfq_blkcg_legacy_files[] = {
1994 /* on root, weight is mapped to leaf_weight */
1995 {
1996 .name = "weight_device",
1997 .flags = CFTYPE_ONLY_ON_ROOT,
1998 .seq_show = cfqg_print_leaf_weight_device,
1999 .write = cfqg_set_leaf_weight_device,
2000 },
2001 {
2002 .name = "weight",
2003 .flags = CFTYPE_ONLY_ON_ROOT,
2004 .seq_show = cfq_print_leaf_weight,
2005 .write_u64 = cfq_set_leaf_weight,
2006 },
2007
2008 /* no such mapping necessary for !roots */
2009 {
2010 .name = "weight_device",
2011 .flags = CFTYPE_NOT_ON_ROOT,
2012 .seq_show = cfqg_print_weight_device,
2013 .write = cfqg_set_weight_device,
2014 },
2015 {
2016 .name = "weight",
2017 .flags = CFTYPE_NOT_ON_ROOT,
2018 .seq_show = cfq_print_weight,
2019 .write_u64 = cfq_set_weight,
2020 },
2021
2022 {
2023 .name = "leaf_weight_device",
2024 .seq_show = cfqg_print_leaf_weight_device,
2025 .write = cfqg_set_leaf_weight_device,
2026 },
2027 {
2028 .name = "leaf_weight",
2029 .seq_show = cfq_print_leaf_weight,
2030 .write_u64 = cfq_set_leaf_weight,
2031 },
2032
2033	/* statistics covering only the tasks in the cfqg */
2034 {
2035 .name = "time",
2036 .private = offsetof(struct cfq_group, stats.time),
2037 .seq_show = cfqg_print_stat,
2038 },
2039 {
2040 .name = "sectors",
2041 .seq_show = cfqg_print_stat_sectors,
2042 },
2043 {
2044 .name = "io_service_bytes",
2045 .private = (unsigned long)&blkcg_policy_cfq,
2046 .seq_show = blkg_print_stat_bytes,
2047 },
2048 {
2049 .name = "io_serviced",
2050 .private = (unsigned long)&blkcg_policy_cfq,
2051 .seq_show = blkg_print_stat_ios,
2052 },
2053 {
2054 .name = "io_service_time",
2055 .private = offsetof(struct cfq_group, stats.service_time),
2056 .seq_show = cfqg_print_rwstat,
2057 },
2058 {
2059 .name = "io_wait_time",
2060 .private = offsetof(struct cfq_group, stats.wait_time),
2061 .seq_show = cfqg_print_rwstat,
2062 },
2063 {
2064 .name = "io_merged",
2065 .private = offsetof(struct cfq_group, stats.merged),
2066 .seq_show = cfqg_print_rwstat,
2067 },
2068 {
2069 .name = "io_queued",
2070 .private = offsetof(struct cfq_group, stats.queued),
2071 .seq_show = cfqg_print_rwstat,
2072 },
2073
2074	/* the same statistics which cover the cfqg and its descendants */
2075 {
2076 .name = "time_recursive",
2077 .private = offsetof(struct cfq_group, stats.time),
2078 .seq_show = cfqg_print_stat_recursive,
2079 },
2080 {
2081 .name = "sectors_recursive",
2082 .seq_show = cfqg_print_stat_sectors_recursive,
2083 },
2084 {
2085 .name = "io_service_bytes_recursive",
2086 .private = (unsigned long)&blkcg_policy_cfq,
2087 .seq_show = blkg_print_stat_bytes_recursive,
2088 },
2089 {
2090 .name = "io_serviced_recursive",
2091 .private = (unsigned long)&blkcg_policy_cfq,
2092 .seq_show = blkg_print_stat_ios_recursive,
2093 },
2094 {
2095 .name = "io_service_time_recursive",
2096 .private = offsetof(struct cfq_group, stats.service_time),
2097 .seq_show = cfqg_print_rwstat_recursive,
2098 },
2099 {
2100 .name = "io_wait_time_recursive",
2101 .private = offsetof(struct cfq_group, stats.wait_time),
2102 .seq_show = cfqg_print_rwstat_recursive,
2103 },
2104 {
2105 .name = "io_merged_recursive",
2106 .private = offsetof(struct cfq_group, stats.merged),
2107 .seq_show = cfqg_print_rwstat_recursive,
2108 },
2109 {
2110 .name = "io_queued_recursive",
2111 .private = offsetof(struct cfq_group, stats.queued),
2112 .seq_show = cfqg_print_rwstat_recursive,
2113 },
2114#ifdef CONFIG_DEBUG_BLK_CGROUP
2115 {
2116 .name = "avg_queue_size",
2117 .seq_show = cfqg_print_avg_queue_size,
2118 },
2119 {
2120 .name = "group_wait_time",
2121 .private = offsetof(struct cfq_group, stats.group_wait_time),
2122 .seq_show = cfqg_print_stat,
2123 },
2124 {
2125 .name = "idle_time",
2126 .private = offsetof(struct cfq_group, stats.idle_time),
2127 .seq_show = cfqg_print_stat,
2128 },
2129 {
2130 .name = "empty_time",
2131 .private = offsetof(struct cfq_group, stats.empty_time),
2132 .seq_show = cfqg_print_stat,
2133 },
2134 {
2135 .name = "dequeue",
2136 .private = offsetof(struct cfq_group, stats.dequeue),
2137 .seq_show = cfqg_print_stat,
2138 },
2139 {
2140 .name = "unaccounted_time",
2141 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2142 .seq_show = cfqg_print_stat,
2143 },
2144#endif /* CONFIG_DEBUG_BLK_CGROUP */
2145 { } /* terminate */
2146};
2147
2148static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2149{
2150 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2151 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2152
2153 seq_printf(sf, "default %u\n", cgd->weight);
2154 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2155 &blkcg_policy_cfq, 0, false);
2156 return 0;
2157}
2158
2159static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2160 char *buf, size_t nbytes, loff_t off)
2161{
2162 char *endp;
2163 int ret;
2164 u64 v;
2165
2166 buf = strim(buf);
2167
2168 /* "WEIGHT" or "default WEIGHT" sets the default weight */
2169 v = simple_strtoull(buf, &endp, 0);
2170 if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2171 ret = __cfq_set_weight(of_css(of), v, true, false, false);
2172 return ret ?: nbytes;
2173 }
2174
2175 /* "MAJ:MIN WEIGHT" */
2176 return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2177}
2178
2179static struct cftype cfq_blkcg_files[] = {
2180 {
2181 .name = "weight",
2182 .flags = CFTYPE_NOT_ON_ROOT,
2183 .seq_show = cfq_print_weight_on_dfl,
2184 .write = cfq_set_weight_on_dfl,
2185 },
2186 { } /* terminate */
2187};
2188
2189#else /* GROUP_IOSCHED */
2190static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2191 struct blkcg *blkcg)
2192{
2193 return cfqd->root_group;
2194}
2195
2196static inline void
2197cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2198 cfqq->cfqg = cfqg;
2199}
2200
2201#endif /* GROUP_IOSCHED */
2202
2203/*
2204 * The cfqd->service_trees holds all pending cfq_queue's that have
2205 * requests waiting to be processed. It is sorted in the order that
2206 * we will service the queues.
2207 */
2208static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2209 bool add_front)
2210{
2211 struct rb_node **p, *parent;
2212 struct cfq_queue *__cfqq;
2213 u64 rb_key;
2214 struct cfq_rb_root *st;
2215 bool leftmost = true;
2216 int new_cfqq = 1;
2217 u64 now = ktime_get_ns();
2218
2219 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2220 if (cfq_class_idle(cfqq)) {
2221 rb_key = CFQ_IDLE_DELAY;
2222 parent = st->rb_rightmost;
2223 if (parent && parent != &cfqq->rb_node) {
2224 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2225 rb_key += __cfqq->rb_key;
2226 } else
2227 rb_key += now;
2228 } else if (!add_front) {
2229 /*
2230 * Get our rb key offset. Subtract any residual slice
2231 * value carried from last service. A negative resid
2232 * count indicates slice overrun, and this should position
2233 * the next service time further away in the tree.
2234 */
2235 rb_key = cfq_slice_offset(cfqd, cfqq) + now;
2236 rb_key -= cfqq->slice_resid;
2237 cfqq->slice_resid = 0;
2238 } else {
2239 rb_key = -NSEC_PER_SEC;
2240 __cfqq = cfq_rb_first(st);
2241 rb_key += __cfqq ? __cfqq->rb_key : now;
2242 }
2243
2244 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2245 new_cfqq = 0;
2246 /*
2247 * same position, nothing more to do
2248 */
2249 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2250 return;
2251
2252 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2253 cfqq->service_tree = NULL;
2254 }
2255
2256 parent = NULL;
2257 cfqq->service_tree = st;
2258 p = &st->rb.rb_root.rb_node;
2259 while (*p) {
2260 parent = *p;
2261 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2262
2263 /*
2264 * sort by key, that represents service time.
2265 */
2266 if (rb_key < __cfqq->rb_key)
2267 p = &parent->rb_left;
2268 else {
2269 p = &parent->rb_right;
2270 leftmost = false;
2271 }
2272 }
2273
2274 cfqq->rb_key = rb_key;
2275 rb_link_node(&cfqq->rb_node, parent, p);
2276 rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
2277 st->count++;
2278 if (add_front || !new_cfqq)
2279 return;
2280 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2281}
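
/*
 * Illustration of the rb_key ordering above (illustrative numbers): a queue
 * re-added after overrunning its slice by 2ms carries slice_resid = -2ms, so
 * its key becomes now + cfq_slice_offset() + 2ms and it sorts further to the
 * right (later service). An add_front insert instead uses the current
 * leftmost key (or now, if the tree is empty) minus NSEC_PER_SEC, so it
 * sorts ahead of every queued sibling.
 */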
2282
2283static struct cfq_queue *
2284cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2285 sector_t sector, struct rb_node **ret_parent,
2286 struct rb_node ***rb_link)
2287{
2288 struct rb_node **p, *parent;
2289 struct cfq_queue *cfqq = NULL;
2290
2291 parent = NULL;
2292 p = &root->rb_node;
2293 while (*p) {
2294 struct rb_node **n;
2295
2296 parent = *p;
2297 cfqq = rb_entry(parent, struct cfq_queue, p_node);
2298
2299 /*
2300 * Sort strictly based on sector. Smallest to the left,
2301 * largest to the right.
2302 */
2303 if (sector > blk_rq_pos(cfqq->next_rq))
2304 n = &(*p)->rb_right;
2305 else if (sector < blk_rq_pos(cfqq->next_rq))
2306 n = &(*p)->rb_left;
2307 else
2308 break;
2309 p = n;
2310 cfqq = NULL;
2311 }
2312
2313 *ret_parent = parent;
2314 if (rb_link)
2315 *rb_link = p;
2316 return cfqq;
2317}
2318
2319static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2320{
2321 struct rb_node **p, *parent;
2322 struct cfq_queue *__cfqq;
2323
2324 if (cfqq->p_root) {
2325 rb_erase(&cfqq->p_node, cfqq->p_root);
2326 cfqq->p_root = NULL;
2327 }
2328
2329 if (cfq_class_idle(cfqq))
2330 return;
2331 if (!cfqq->next_rq)
2332 return;
2333
2334 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2335 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2336 blk_rq_pos(cfqq->next_rq), &parent, &p);
2337 if (!__cfqq) {
2338 rb_link_node(&cfqq->p_node, parent, p);
2339 rb_insert_color(&cfqq->p_node, cfqq->p_root);
2340 } else
2341 cfqq->p_root = NULL;
2342}
2343
2344/*
2345 * Update cfqq's position in the service tree.
2346 */
2347static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2348{
2349 /*
2350 * Resorting requires the cfqq to be on the RR list already.
2351 */
2352 if (cfq_cfqq_on_rr(cfqq)) {
2353 cfq_service_tree_add(cfqd, cfqq, 0);
2354 cfq_prio_tree_add(cfqd, cfqq);
2355 }
2356}
2357
2358/*
2359 * add to busy list of queues for service, trying to be fair in ordering
2360 * the pending list according to last request service
2361 */
2362static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2363{
2364 cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2365 BUG_ON(cfq_cfqq_on_rr(cfqq));
2366 cfq_mark_cfqq_on_rr(cfqq);
2367 cfqd->busy_queues++;
2368 if (cfq_cfqq_sync(cfqq))
2369 cfqd->busy_sync_queues++;
2370
2371 cfq_resort_rr_list(cfqd, cfqq);
2372}
2373
2374/*
2375 * Called when the cfqq no longer has requests pending, remove it from
2376 * the service tree.
2377 */
2378static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2379{
2380 cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2381 BUG_ON(!cfq_cfqq_on_rr(cfqq));
2382 cfq_clear_cfqq_on_rr(cfqq);
2383
2384 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2385 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2386 cfqq->service_tree = NULL;
2387 }
2388 if (cfqq->p_root) {
2389 rb_erase(&cfqq->p_node, cfqq->p_root);
2390 cfqq->p_root = NULL;
2391 }
2392
2393 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2394 BUG_ON(!cfqd->busy_queues);
2395 cfqd->busy_queues--;
2396 if (cfq_cfqq_sync(cfqq))
2397 cfqd->busy_sync_queues--;
2398}
2399
2400/*
2401 * rb tree support functions
2402 */
2403static void cfq_del_rq_rb(struct request *rq)
2404{
2405 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2406 const int sync = rq_is_sync(rq);
2407
2408 BUG_ON(!cfqq->queued[sync]);
2409 cfqq->queued[sync]--;
2410
2411 elv_rb_del(&cfqq->sort_list, rq);
2412
2413 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2414 /*
2415 * Queue will be deleted from service tree when we actually
2416 * expire it later. Right now just remove it from prio tree
2417 * as it is empty.
2418 */
2419 if (cfqq->p_root) {
2420 rb_erase(&cfqq->p_node, cfqq->p_root);
2421 cfqq->p_root = NULL;
2422 }
2423 }
2424}
2425
2426static void cfq_add_rq_rb(struct request *rq)
2427{
2428 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2429 struct cfq_data *cfqd = cfqq->cfqd;
2430 struct request *prev;
2431
2432 cfqq->queued[rq_is_sync(rq)]++;
2433
2434 elv_rb_add(&cfqq->sort_list, rq);
2435
2436 if (!cfq_cfqq_on_rr(cfqq))
2437 cfq_add_cfqq_rr(cfqd, cfqq);
2438
2439 /*
2440 * check if this request is a better next-serve candidate
2441 */
2442 prev = cfqq->next_rq;
2443 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2444
2445 /*
2446 * adjust priority tree position, if ->next_rq changes
2447 */
2448 if (prev != cfqq->next_rq)
2449 cfq_prio_tree_add(cfqd, cfqq);
2450
2451 BUG_ON(!cfqq->next_rq);
2452}
2453
2454static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2455{
2456 elv_rb_del(&cfqq->sort_list, rq);
2457 cfqq->queued[rq_is_sync(rq)]--;
2458 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2459 cfq_add_rq_rb(rq);
2460 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2461 rq->cmd_flags);
2462}
2463
2464static struct request *
2465cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2466{
2467 struct task_struct *tsk = current;
2468 struct cfq_io_cq *cic;
2469 struct cfq_queue *cfqq;
2470
2471 cic = cfq_cic_lookup(cfqd, tsk->io_context);
2472 if (!cic)
2473 return NULL;
2474
2475 cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
2476 if (cfqq)
2477 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2478
2479 return NULL;
2480}
2481
2482static void cfq_activate_request(struct request_queue *q, struct request *rq)
2483{
2484 struct cfq_data *cfqd = q->elevator->elevator_data;
2485
2486 cfqd->rq_in_driver++;
2487 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2488 cfqd->rq_in_driver);
2489
2490 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2491}
2492
2493static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2494{
2495 struct cfq_data *cfqd = q->elevator->elevator_data;
2496
2497 WARN_ON(!cfqd->rq_in_driver);
2498 cfqd->rq_in_driver--;
2499 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2500 cfqd->rq_in_driver);
2501}
2502
2503static void cfq_remove_request(struct request *rq)
2504{
2505 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2506
2507 if (cfqq->next_rq == rq)
2508 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2509
2510 list_del_init(&rq->queuelist);
2511 cfq_del_rq_rb(rq);
2512
2513 cfqq->cfqd->rq_queued--;
2514 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2515 if (rq->cmd_flags & REQ_PRIO) {
2516 WARN_ON(!cfqq->prio_pending);
2517 cfqq->prio_pending--;
2518 }
2519}
2520
2521static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
2522 struct bio *bio)
2523{
2524 struct cfq_data *cfqd = q->elevator->elevator_data;
2525 struct request *__rq;
2526
2527 __rq = cfq_find_rq_fmerge(cfqd, bio);
2528 if (__rq && elv_bio_merge_ok(__rq, bio)) {
2529 *req = __rq;
2530 return ELEVATOR_FRONT_MERGE;
2531 }
2532
2533 return ELEVATOR_NO_MERGE;
2534}
2535
2536static void cfq_merged_request(struct request_queue *q, struct request *req,
2537 enum elv_merge type)
2538{
2539 if (type == ELEVATOR_FRONT_MERGE) {
2540 struct cfq_queue *cfqq = RQ_CFQQ(req);
2541
2542 cfq_reposition_rq_rb(cfqq, req);
2543 }
2544}
2545
2546static void cfq_bio_merged(struct request_queue *q, struct request *req,
2547 struct bio *bio)
2548{
2549 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
2550}
2551
2552static void
2553cfq_merged_requests(struct request_queue *q, struct request *rq,
2554 struct request *next)
2555{
2556 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2557 struct cfq_data *cfqd = q->elevator->elevator_data;
2558
2559 /*
2560 * reposition in fifo if next is older than rq
2561 */
2562 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2563 next->fifo_time < rq->fifo_time &&
2564 cfqq == RQ_CFQQ(next)) {
2565 list_move(&rq->queuelist, &next->queuelist);
2566 rq->fifo_time = next->fifo_time;
2567 }
2568
2569 if (cfqq->next_rq == next)
2570 cfqq->next_rq = rq;
2571 cfq_remove_request(next);
2572 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2573
2574 cfqq = RQ_CFQQ(next);
2575 /*
2576	 * all requests of this queue have been merged into other queues; delete it
2577	 * from the service tree. If it's the active_queue,
2578	 * cfq_dispatch_requests() will choose whether to expire it or to idle
2579 */
2580 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2581 cfqq != cfqd->active_queue)
2582 cfq_del_cfqq_rr(cfqd, cfqq);
2583}
2584
2585static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2586 struct bio *bio)
2587{
2588 struct cfq_data *cfqd = q->elevator->elevator_data;
2589 bool is_sync = op_is_sync(bio->bi_opf);
2590 struct cfq_io_cq *cic;
2591 struct cfq_queue *cfqq;
2592
2593 /*
2594 * Disallow merge of a sync bio into an async request.
2595 */
2596 if (is_sync && !rq_is_sync(rq))
2597 return false;
2598
2599 /*
2600 * Lookup the cfqq that this bio will be queued with and allow
2601 * merge only if rq is queued there.
2602 */
2603 cic = cfq_cic_lookup(cfqd, current->io_context);
2604 if (!cic)
2605 return false;
2606
2607 cfqq = cic_to_cfqq(cic, is_sync);
2608 return cfqq == RQ_CFQQ(rq);
2609}
2610
2611static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
2612 struct request *next)
2613{
2614 return RQ_CFQQ(rq) == RQ_CFQQ(next);
2615}
2616
2617static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2618{
2619 hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2620 cfqg_stats_update_idle_time(cfqq->cfqg);
2621}
2622
2623static void __cfq_set_active_queue(struct cfq_data *cfqd,
2624 struct cfq_queue *cfqq)
2625{
2626 if (cfqq) {
2627 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2628 cfqd->serving_wl_class, cfqd->serving_wl_type);
2629 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2630 cfqq->slice_start = 0;
2631 cfqq->dispatch_start = ktime_get_ns();
2632 cfqq->allocated_slice = 0;
2633 cfqq->slice_end = 0;
2634 cfqq->slice_dispatch = 0;
2635 cfqq->nr_sectors = 0;
2636
2637 cfq_clear_cfqq_wait_request(cfqq);
2638 cfq_clear_cfqq_must_dispatch(cfqq);
2639 cfq_clear_cfqq_must_alloc_slice(cfqq);
2640 cfq_clear_cfqq_fifo_expire(cfqq);
2641 cfq_mark_cfqq_slice_new(cfqq);
2642
2643 cfq_del_timer(cfqd, cfqq);
2644 }
2645
2646 cfqd->active_queue = cfqq;
2647}
2648
2649/*
2650 * current cfqq expired its slice (or was too idle), select new one
2651 */
2652static void
2653__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2654 bool timed_out)
2655{
2656 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2657
2658 if (cfq_cfqq_wait_request(cfqq))
2659 cfq_del_timer(cfqd, cfqq);
2660
2661 cfq_clear_cfqq_wait_request(cfqq);
2662 cfq_clear_cfqq_wait_busy(cfqq);
2663
2664 /*
2665 * If this cfqq is shared between multiple processes, check to
2666 * make sure that those processes are still issuing I/Os within
2667 * the mean seek distance. If not, it may be time to break the
2668 * queues apart again.
2669 */
2670 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2671 cfq_mark_cfqq_split_coop(cfqq);
2672
2673 /*
2674 * store what was left of this slice, if the queue idled/timed out
2675 */
2676 if (timed_out) {
2677 if (cfq_cfqq_slice_new(cfqq))
2678 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2679 else
2680 cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2681 cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2682 }
2683
2684 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2685
2686 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2687 cfq_del_cfqq_rr(cfqd, cfqq);
2688
2689 cfq_resort_rr_list(cfqd, cfqq);
2690
2691 if (cfqq == cfqd->active_queue)
2692 cfqd->active_queue = NULL;
2693
2694 if (cfqd->active_cic) {
2695 put_io_context(cfqd->active_cic->icq.ioc);
2696 cfqd->active_cic = NULL;
2697 }
2698}
2699
2700static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2701{
2702 struct cfq_queue *cfqq = cfqd->active_queue;
2703
2704 if (cfqq)
2705 __cfq_slice_expired(cfqd, cfqq, timed_out);
2706}
2707
2708/*
2709 * Get next queue for service. Unless we have a queue preemption,
2710 * we'll simply select the first cfqq in the service tree.
2711 */
2712static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2713{
2714 struct cfq_rb_root *st = st_for(cfqd->serving_group,
2715 cfqd->serving_wl_class, cfqd->serving_wl_type);
2716
2717 if (!cfqd->rq_queued)
2718 return NULL;
2719
2720 /* There is nothing to dispatch */
2721 if (!st)
2722 return NULL;
2723 if (RB_EMPTY_ROOT(&st->rb.rb_root))
2724 return NULL;
2725 return cfq_rb_first(st);
2726}
2727
2728static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2729{
2730 struct cfq_group *cfqg;
2731 struct cfq_queue *cfqq;
2732 int i, j;
2733 struct cfq_rb_root *st;
2734
2735 if (!cfqd->rq_queued)
2736 return NULL;
2737
2738 cfqg = cfq_get_next_cfqg(cfqd);
2739 if (!cfqg)
2740 return NULL;
2741
2742 for_each_cfqg_st(cfqg, i, j, st) {
2743 cfqq = cfq_rb_first(st);
2744 if (cfqq)
2745 return cfqq;
2746 }
2747 return NULL;
2748}
2749
2750/*
2751 * Get and set a new active queue for service.
2752 */
2753static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2754 struct cfq_queue *cfqq)
2755{
2756 if (!cfqq)
2757 cfqq = cfq_get_next_queue(cfqd);
2758
2759 __cfq_set_active_queue(cfqd, cfqq);
2760 return cfqq;
2761}
2762
2763static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2764 struct request *rq)
2765{
2766 if (blk_rq_pos(rq) >= cfqd->last_position)
2767 return blk_rq_pos(rq) - cfqd->last_position;
2768 else
2769 return cfqd->last_position - blk_rq_pos(rq);
2770}
2771
2772static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2773 struct request *rq)
2774{
2775 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2776}
2777
2778static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2779 struct cfq_queue *cur_cfqq)
2780{
2781 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2782 struct rb_node *parent, *node;
2783 struct cfq_queue *__cfqq;
2784 sector_t sector = cfqd->last_position;
2785
2786 if (RB_EMPTY_ROOT(root))
2787 return NULL;
2788
2789 /*
2790 * First, if we find a request starting at the end of the last
2791 * request, choose it.
2792 */
2793 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2794 if (__cfqq)
2795 return __cfqq;
2796
2797 /*
2798 * If the exact sector wasn't found, the parent of the NULL leaf
2799 * will contain the closest sector.
2800 */
2801 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2802 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2803 return __cfqq;
2804
2805 if (blk_rq_pos(__cfqq->next_rq) < sector)
2806 node = rb_next(&__cfqq->p_node);
2807 else
2808 node = rb_prev(&__cfqq->p_node);
2809 if (!node)
2810 return NULL;
2811
2812 __cfqq = rb_entry(node, struct cfq_queue, p_node);
2813 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2814 return __cfqq;
2815
2816 return NULL;
2817}
2818
2819/*
2820 * cfqd - obvious
2821 * cur_cfqq - passed in so that we don't decide that the current queue is
2822 * closely cooperating with itself.
2823 *
2824	 * So, basically we're assuming that cur_cfqq has dispatched at least
2825 * one request, and that cfqd->last_position reflects a position on the disk
2826 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
2827 * assumption.
2828 */
2829static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2830 struct cfq_queue *cur_cfqq)
2831{
2832 struct cfq_queue *cfqq;
2833
2834 if (cfq_class_idle(cur_cfqq))
2835 return NULL;
2836 if (!cfq_cfqq_sync(cur_cfqq))
2837 return NULL;
2838 if (CFQQ_SEEKY(cur_cfqq))
2839 return NULL;
2840
2841 /*
2842 * Don't search priority tree if it's the only queue in the group.
2843 */
2844 if (cur_cfqq->cfqg->nr_cfqq == 1)
2845 return NULL;
2846
2847 /*
2848	 * We should notice if some of the queues are cooperating, e.g.
2849	 * working closely on the same area of the disk. In that case,
2850	 * we can group them together and not waste time idling.
2851 */
2852 cfqq = cfqq_close(cfqd, cur_cfqq);
2853 if (!cfqq)
2854 return NULL;
2855
2856 /* If new queue belongs to different cfq_group, don't choose it */
2857 if (cur_cfqq->cfqg != cfqq->cfqg)
2858 return NULL;
2859
2860 /*
2861 * It only makes sense to merge sync queues.
2862 */
2863 if (!cfq_cfqq_sync(cfqq))
2864 return NULL;
2865 if (CFQQ_SEEKY(cfqq))
2866 return NULL;
2867
2868 /*
2869 * Do not merge queues of different priority classes
2870 */
2871 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2872 return NULL;
2873
2874 return cfqq;
2875}
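
/*
 * In short, a cooperator is only returned when both queues are sync,
 * neither is seeky, they share the same cfq_group and RT class, and the
 * candidate's next request lies within CFQQ_CLOSE_THR of the last
 * dispatch position (cfqd->last_position).
 */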
2876
2877/*
2878 * Determine whether we should enforce idle window for this queue.
2879 */
2880
2881static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2882{
2883 enum wl_class_t wl_class = cfqq_class(cfqq);
2884 struct cfq_rb_root *st = cfqq->service_tree;
2885
2886 BUG_ON(!st);
2887 BUG_ON(!st->count);
2888
2889 if (!cfqd->cfq_slice_idle)
2890 return false;
2891
2892 /* We never do for idle class queues. */
2893 if (wl_class == IDLE_WORKLOAD)
2894 return false;
2895
2896 /* We do for queues that were marked with idle window flag. */
2897 if (cfq_cfqq_idle_window(cfqq) &&
2898 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2899 return true;
2900
2901 /*
2902 * Otherwise, we do only if they are the last ones
2903 * in their service tree.
2904 */
2905 if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2906 !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2907 return true;
2908 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2909 return false;
2910}
2911
2912static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2913{
2914 struct cfq_queue *cfqq = cfqd->active_queue;
2915 struct cfq_rb_root *st = cfqq->service_tree;
2916 struct cfq_io_cq *cic;
2917 u64 sl, group_idle = 0;
2918 u64 now = ktime_get_ns();
2919
2920 /*
2921 * SSD device without seek penalty, disable idling. But only do so
2922 * for devices that support queuing, otherwise we still have a problem
2923 * with sync vs async workloads.
2924 */
2925 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
2926 !cfqd->cfq_group_idle)
2927 return;
2928
2929 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2930 WARN_ON(cfq_cfqq_slice_new(cfqq));
2931
2932 /*
2933 * idle is disabled, either manually or by past process history
2934 */
2935 if (!cfq_should_idle(cfqd, cfqq)) {
2936 /* no queue idling. Check for group idling */
2937 if (cfqd->cfq_group_idle)
2938 group_idle = cfqd->cfq_group_idle;
2939 else
2940 return;
2941 }
2942
2943 /*
2944 * still active requests from this queue, don't idle
2945 */
2946 if (cfqq->dispatched)
2947 return;
2948
2949 /*
2950 * task has exited, don't wait
2951 */
2952 cic = cfqd->active_cic;
2953 if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2954 return;
2955
2956 /*
2957 * If our average think time is larger than the remaining time
2958 * slice, then don't idle. This avoids overrunning the allotted
2959 * time slice.
2960 */
2961 if (sample_valid(cic->ttime.ttime_samples) &&
2962 (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2963 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
2964 cic->ttime.ttime_mean);
2965 return;
2966 }
2967
2968 /*
2969	 * If there are other queues in the group, or this is the only group and
2970	 * it has too big a thinktime, don't do group idle.
2971 */
2972 if (group_idle &&
2973 (cfqq->cfqg->nr_cfqq > 1 ||
2974 cfq_io_thinktime_big(cfqd, &st->ttime, true)))
2975 return;
2976
2977 cfq_mark_cfqq_wait_request(cfqq);
2978
2979 if (group_idle)
2980 sl = cfqd->cfq_group_idle;
2981 else
2982 sl = cfqd->cfq_slice_idle;
2983
2984 hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2985 HRTIMER_MODE_REL);
2986 cfqg_stats_set_start_idle_time(cfqq->cfqg);
2987 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
2988 group_idle ? 1 : 0);
2989}
2990
2991/*
2992 * Move request from internal lists to the request queue dispatch list.
2993 */
2994static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2995{
2996 struct cfq_data *cfqd = q->elevator->elevator_data;
2997 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2998
2999 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
3000
3001 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
3002 cfq_remove_request(rq);
3003 cfqq->dispatched++;
3004 (RQ_CFQG(rq))->dispatched++;
3005 elv_dispatch_sort(q, rq);
3006
3007 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
3008 cfqq->nr_sectors += blk_rq_sectors(rq);
3009}
3010
3011/*
3012 * return expired entry, or NULL to just start from scratch in rbtree
3013 */
3014static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
3015{
3016 struct request *rq = NULL;
3017
3018 if (cfq_cfqq_fifo_expire(cfqq))
3019 return NULL;
3020
3021 cfq_mark_cfqq_fifo_expire(cfqq);
3022
3023 if (list_empty(&cfqq->fifo))
3024 return NULL;
3025
3026 rq = rq_entry_fifo(cfqq->fifo.next);
3027 if (ktime_get_ns() < rq->fifo_time)
3028 rq = NULL;
3029
3030 return rq;
3031}
3032
3033static inline int
3034cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3035{
3036 const int base_rq = cfqd->cfq_slice_async_rq;
3037
3038 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
3039
3040 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
3041}
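
/*
 * Illustrative numbers: with cfq_slice_async_rq (base_rq) at 2 and an
 * ioprio of 4, the cap is 2 * 2 * (8 - 4) = 16 requests per slice;
 * ioprio 0 would allow 32 and ioprio 7 only 4.
 */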
3042
3043/*
3044 * Must be called with the queue_lock held.
3045 */
3046static int cfqq_process_refs(struct cfq_queue *cfqq)
3047{
3048 int process_refs, io_refs;
3049
3050 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
3051 process_refs = cfqq->ref - io_refs;
3052 BUG_ON(process_refs < 0);
3053 return process_refs;
3054}
3055
3056static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3057{
3058 int process_refs, new_process_refs;
3059 struct cfq_queue *__cfqq;
3060
3061 /*
3062 * If there are no process references on the new_cfqq, then it is
3063 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3064 * chain may have dropped their last reference (not just their
3065 * last process reference).
3066 */
3067 if (!cfqq_process_refs(new_cfqq))
3068 return;
3069
3070 /* Avoid a circular list and skip interim queue merges */
3071 while ((__cfqq = new_cfqq->new_cfqq)) {
3072 if (__cfqq == cfqq)
3073 return;
3074 new_cfqq = __cfqq;
3075 }
3076
3077 process_refs = cfqq_process_refs(cfqq);
3078 new_process_refs = cfqq_process_refs(new_cfqq);
3079 /*
3080 * If the process for the cfqq has gone away, there is no
3081 * sense in merging the queues.
3082 */
3083 if (process_refs == 0 || new_process_refs == 0)
3084 return;
3085
3086 /*
3087 * Merge in the direction of the lesser amount of work.
3088 */
3089 if (new_process_refs >= process_refs) {
3090 cfqq->new_cfqq = new_cfqq;
3091 new_cfqq->ref += process_refs;
3092 } else {
3093 new_cfqq->new_cfqq = cfqq;
3094 cfqq->ref += new_process_refs;
3095 }
3096}
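
/*
 * For example, if cfqq holds 3 process references and new_cfqq only 1,
 * the else branch runs: new_cfqq->new_cfqq is pointed at cfqq and cfqq
 * gains that single reference, so the queue with fewer process
 * references is the one folded into the other.
 */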
3097
3098static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3099 struct cfq_group *cfqg, enum wl_class_t wl_class)
3100{
3101 struct cfq_queue *queue;
3102 int i;
3103 bool key_valid = false;
3104 u64 lowest_key = 0;
3105 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3106
3107 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3108 /* select the one with lowest rb_key */
3109 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3110 if (queue &&
3111 (!key_valid || queue->rb_key < lowest_key)) {
3112 lowest_key = queue->rb_key;
3113 cur_best = i;
3114 key_valid = true;
3115 }
3116 }
3117
3118 return cur_best;
3119}
3120
3121static void
3122choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3123{
3124 u64 slice;
3125 unsigned count;
3126 struct cfq_rb_root *st;
3127 u64 group_slice;
3128 enum wl_class_t original_class = cfqd->serving_wl_class;
3129 u64 now = ktime_get_ns();
3130
3131 /* Choose next priority. RT > BE > IDLE */
3132 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3133 cfqd->serving_wl_class = RT_WORKLOAD;
3134 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3135 cfqd->serving_wl_class = BE_WORKLOAD;
3136 else {
3137 cfqd->serving_wl_class = IDLE_WORKLOAD;
3138 cfqd->workload_expires = now + jiffies_to_nsecs(1);
3139 return;
3140 }
3141
3142 if (original_class != cfqd->serving_wl_class)
3143 goto new_workload;
3144
3145 /*
3146	 * For RT and BE, we also have to choose the type
3147	 * (SYNC, SYNC_NOIDLE, ASYNC) and compute a workload
3148	 * expiration time
3149 */
3150 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3151 count = st->count;
3152
3153 /*
3154 * check workload expiration, and that we still have other queues ready
3155 */
3156 if (count && !(now > cfqd->workload_expires))
3157 return;
3158
3159new_workload:
3160 /* otherwise select new workload type */
3161 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3162 cfqd->serving_wl_class);
3163 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3164 count = st->count;
3165
3166 /*
3167 * the workload slice is computed as a fraction of target latency
3168 * proportional to the number of queues in that workload, over
3169 * all the queues in the same priority class
3170 */
3171 group_slice = cfq_group_slice(cfqd, cfqg);
3172
3173 slice = div_u64(group_slice * count,
3174 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3175 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3176 cfqg)));
3177
3178 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3179 u64 tmp;
3180
3181 /*
3182 * Async queues are currently system wide. Just taking
3183	 * proportion of queues within the same group will lead to a higher
3184	 * async ratio system wide, as generally the root group is going
3185	 * to have a higher weight. A more accurate thing would be to
3186	 * calculate a system wide async/sync ratio.
3187 */
3188 tmp = cfqd->cfq_target_latency *
3189 cfqg_busy_async_queues(cfqd, cfqg);
3190 tmp = div_u64(tmp, cfqd->busy_queues);
3191 slice = min_t(u64, slice, tmp);
3192
3193 /* async workload slice is scaled down according to
3194 * the sync/async slice ratio. */
3195 slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3196 } else
3197 /* sync workload slice is at least 2 * cfq_slice_idle */
3198 slice = max(slice, 2 * cfqd->cfq_slice_idle);
3199
3200 slice = max_t(u64, slice, CFQ_MIN_TT);
3201 cfq_log(cfqd, "workload slice:%llu", slice);
3202 cfqd->workload_expires = now + slice;
3203}
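
/*
 * Worked example with illustrative values: given a 150ms group slice and
 * 3 of the class's 5 busy queues sitting on the chosen service tree, a
 * sync workload gets 150ms * 3 / 5 = 90ms, then is raised to at least
 * 2 * cfq_slice_idle and CFQ_MIN_TT; an async workload is additionally
 * scaled down by the async/sync slice ratio.
 */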
3204
3205static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3206{
3207 struct cfq_rb_root *st = &cfqd->grp_service_tree;
3208 struct cfq_group *cfqg;
3209
3210 if (RB_EMPTY_ROOT(&st->rb.rb_root))
3211 return NULL;
3212 cfqg = cfq_rb_first_group(st);
3213 update_min_vdisktime(st);
3214 return cfqg;
3215}
3216
3217static void cfq_choose_cfqg(struct cfq_data *cfqd)
3218{
3219 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3220 u64 now = ktime_get_ns();
3221
3222 cfqd->serving_group = cfqg;
3223
3224 /* Restore the workload type data */
3225 if (cfqg->saved_wl_slice) {
3226 cfqd->workload_expires = now + cfqg->saved_wl_slice;
3227 cfqd->serving_wl_type = cfqg->saved_wl_type;
3228 cfqd->serving_wl_class = cfqg->saved_wl_class;
3229 } else
3230 cfqd->workload_expires = now - 1;
3231
3232 choose_wl_class_and_type(cfqd, cfqg);
3233}
3234
3235/*
3236 * Select a queue for service. If we have a current active queue,
3237 * check whether to continue servicing it, or retrieve and set a new one.
3238 */
3239static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3240{
3241 struct cfq_queue *cfqq, *new_cfqq = NULL;
3242 u64 now = ktime_get_ns();
3243
3244 cfqq = cfqd->active_queue;
3245 if (!cfqq)
3246 goto new_queue;
3247
3248 if (!cfqd->rq_queued)
3249 return NULL;
3250
3251 /*
3252 * We were waiting for group to get backlogged. Expire the queue
3253 */
3254 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3255 goto expire;
3256
3257 /*
3258 * The active queue has run out of time, expire it and select new.
3259 */
3260 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3261 /*
3262	 * If the slice had not expired at the completion of the last request,
3263	 * we might not have turned on the wait_busy flag. Don't expire
3264	 * the queue yet. Allow the group to get backlogged.
3265	 *
3266	 * The very fact that we have used up the slice means we have
3267	 * been idling all along on this queue, and it should be
3268	 * ok to wait for this request to complete.
3269 */
3270 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3271 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3272 cfqq = NULL;
3273 goto keep_queue;
3274 } else
3275 goto check_group_idle;
3276 }
3277
3278 /*
3279 * The active queue has requests and isn't expired, allow it to
3280 * dispatch.
3281 */
3282 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3283 goto keep_queue;
3284
3285 /*
3286 * If another queue has a request waiting within our mean seek
3287 * distance, let it run. The expire code will check for close
3288 * cooperators and put the close queue at the front of the service
3289 * tree. If possible, merge the expiring queue with the new cfqq.
3290 */
3291 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3292 if (new_cfqq) {
3293 if (!cfqq->new_cfqq)
3294 cfq_setup_merge(cfqq, new_cfqq);
3295 goto expire;
3296 }
3297
3298 /*
3299 * No requests pending. If the active queue still has requests in
3300 * flight or is idling for a new request, allow either of these
3301 * conditions to happen (or time out) before selecting a new queue.
3302 */
3303 if (hrtimer_active(&cfqd->idle_slice_timer)) {
3304 cfqq = NULL;
3305 goto keep_queue;
3306 }
3307
3308 /*
3309	 * This is a deep seeking queue, but the device is much faster than
3310	 * the queue can deliver requests; don't idle.
3311	 */
3312 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3313 (cfq_cfqq_slice_new(cfqq) ||
3314 (cfqq->slice_end - now > now - cfqq->slice_start))) {
3315 cfq_clear_cfqq_deep(cfqq);
3316 cfq_clear_cfqq_idle_window(cfqq);
3317 }
3318
3319 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3320 cfqq = NULL;
3321 goto keep_queue;
3322 }
3323
3324 /*
3325 * If group idle is enabled and there are requests dispatched from
3326 * this group, wait for requests to complete.
3327 */
3328check_group_idle:
3329 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3330 cfqq->cfqg->dispatched &&
3331 !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3332 cfqq = NULL;
3333 goto keep_queue;
3334 }
3335
3336expire:
3337 cfq_slice_expired(cfqd, 0);
3338new_queue:
3339 /*
3340 * Current queue expired. Check if we have to switch to a new
3341 * service tree
3342 */
3343 if (!new_cfqq)
3344 cfq_choose_cfqg(cfqd);
3345
3346 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3347keep_queue:
3348 return cfqq;
3349}
3350
3351static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3352{
3353 int dispatched = 0;
3354
3355 while (cfqq->next_rq) {
3356 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3357 dispatched++;
3358 }
3359
3360 BUG_ON(!list_empty(&cfqq->fifo));
3361
3362 /* By default cfqq is not expired if it is empty. Do it explicitly */
3363 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3364 return dispatched;
3365}
3366
3367/*
3368 * Drain our current requests. Used for barriers and when switching
3369 * io schedulers on-the-fly.
3370 */
3371static int cfq_forced_dispatch(struct cfq_data *cfqd)
3372{
3373 struct cfq_queue *cfqq;
3374 int dispatched = 0;
3375
3376 /* Expire the timeslice of the current active queue first */
3377 cfq_slice_expired(cfqd, 0);
3378 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3379 __cfq_set_active_queue(cfqd, cfqq);
3380 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3381 }
3382
3383 BUG_ON(cfqd->busy_queues);
3384
3385 cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3386 return dispatched;
3387}
3388
3389static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3390 struct cfq_queue *cfqq)
3391{
3392 u64 now = ktime_get_ns();
3393
3394 /* the queue hasn't finished any request, can't estimate */
3395 if (cfq_cfqq_slice_new(cfqq))
3396 return true;
3397 if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
3398 return true;
3399
3400 return false;
3401}
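
/*
 * E.g. with cfq_slice_idle at 8ms and 4 requests already dispatched, the
 * slice is considered "used soon" once less than 8 * 4 = 32ms of it
 * remains, the heuristic being that each outstanding request costs
 * roughly one idle period.
 */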
3402
3403static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3404{
3405 unsigned int max_dispatch;
3406
3407 if (cfq_cfqq_must_dispatch(cfqq))
3408 return true;
3409
3410 /*
3411 * Drain async requests before we start sync IO
3412 */
3413 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3414 return false;
3415
3416 /*
3417 * If this is an async queue and we have sync IO in flight, let it wait
3418 */
3419 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3420 return false;
3421
3422 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3423 if (cfq_class_idle(cfqq))
3424 max_dispatch = 1;
3425
3426 /*
3427 * Does this cfqq already have too much IO in flight?
3428 */
3429 if (cfqq->dispatched >= max_dispatch) {
3430 bool promote_sync = false;
3431 /*
3432	 * an idle queue must only ever have a single IO in flight
3433 */
3434 if (cfq_class_idle(cfqq))
3435 return false;
3436
3437 /*
3438	 * If there is only one sync queue,
3439	 * we can ignore the async queues here and give the sync
3440	 * queue no dispatch limit, because a sync queue can
3441	 * preempt an async queue anyway, so limiting the sync queue
3442	 * doesn't make sense. This is useful for the aiostress test.
3443 */
3444 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3445 promote_sync = true;
3446
3447 /*
3448 * We have other queues, don't allow more IO from this one
3449 */
3450 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3451 !promote_sync)
3452 return false;
3453
3454 /*
3455 * Sole queue user, no limit
3456 */
3457 if (cfqd->busy_queues == 1 || promote_sync)
3458 max_dispatch = -1;
3459 else
3460 /*
3461 * Normally we start throttling cfqq when cfq_quantum/2
3462 * requests have been dispatched. But we can drive
3463	 * deeper queue depths at the beginning of the slice,
3464	 * subject to the upper limit of cfq_quantum.
3465	 */
3466 max_dispatch = cfqd->cfq_quantum;
3467 }
3468
3469 /*
3470 * Async queues must wait a bit before being allowed dispatch.
3471 * We also ramp up the dispatch depth gradually for async IO,
3472 * based on the last sync IO we serviced
3473 */
3474 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3475 u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
3476 unsigned int depth;
3477
3478 depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
3479 if (!depth && !cfqq->dispatched)
3480 depth = 1;
3481 if (depth < max_dispatch)
3482 max_dispatch = depth;
3483 }
3484
3485 /*
3486 * If we're below the current max, allow a dispatch
3487 */
3488 return cfqq->dispatched < max_dispatch;
3489}
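
/*
 * Example of the async ramp-up above: if the last delayed sync request
 * completed, say, 200ms ago and the sync slice (cfq_slice[1]) is 100ms,
 * the computed depth is 200 / 100 = 2, so at most two async requests
 * may be in flight until more sync-free time accumulates.
 */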
3490
3491/*
3492 * Dispatch a request from cfqq, moving them to the request queue
3493 * dispatch list.
3494 */
3495static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3496{
3497 struct request *rq;
3498
3499 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3500
3501 rq = cfq_check_fifo(cfqq);
3502 if (rq)
3503 cfq_mark_cfqq_must_dispatch(cfqq);
3504
3505 if (!cfq_may_dispatch(cfqd, cfqq))
3506 return false;
3507
3508 /*
3509 * follow expired path, else get first next available
3510 */
3511 if (!rq)
3512 rq = cfqq->next_rq;
3513 else
3514 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
3515
3516 /*
3517 * insert request into driver dispatch list
3518 */
3519 cfq_dispatch_insert(cfqd->queue, rq);
3520
3521 if (!cfqd->active_cic) {
3522 struct cfq_io_cq *cic = RQ_CIC(rq);
3523
3524 atomic_long_inc(&cic->icq.ioc->refcount);
3525 cfqd->active_cic = cic;
3526 }
3527
3528 return true;
3529}
3530
3531/*
3532 * Find the cfqq that we need to service and move a request from that to the
3533 * dispatch list
3534 */
3535static int cfq_dispatch_requests(struct request_queue *q, int force)
3536{
3537 struct cfq_data *cfqd = q->elevator->elevator_data;
3538 struct cfq_queue *cfqq;
3539
3540 if (!cfqd->busy_queues)
3541 return 0;
3542
3543 if (unlikely(force))
3544 return cfq_forced_dispatch(cfqd);
3545
3546 cfqq = cfq_select_queue(cfqd);
3547 if (!cfqq)
3548 return 0;
3549
3550 /*
3551 * Dispatch a request from this cfqq, if it is allowed
3552 */
3553 if (!cfq_dispatch_request(cfqd, cfqq))
3554 return 0;
3555
3556 cfqq->slice_dispatch++;
3557 cfq_clear_cfqq_must_dispatch(cfqq);
3558
3559 /*
3560	 * Expire an async queue immediately if it has used up its slice. An idle
3561	 * queue always expires after 1 dispatch round.
3562 */
3563 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3564 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3565 cfq_class_idle(cfqq))) {
3566 cfqq->slice_end = ktime_get_ns() + 1;
3567 cfq_slice_expired(cfqd, 0);
3568 }
3569
3570 cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3571 return 1;
3572}
3573
3574/*
3575 * task holds one reference to the queue, dropped when task exits. each rq
3576 * in-flight on this queue also holds a reference, dropped when rq is freed.
3577 *
3578 * Each cfq queue took a reference on the parent group. Drop it now.
3579 * queue lock must be held here.
3580 */
3581static void cfq_put_queue(struct cfq_queue *cfqq)
3582{
3583 struct cfq_data *cfqd = cfqq->cfqd;
3584 struct cfq_group *cfqg;
3585
3586 BUG_ON(cfqq->ref <= 0);
3587
3588 cfqq->ref--;
3589 if (cfqq->ref)
3590 return;
3591
3592 cfq_log_cfqq(cfqd, cfqq, "put_queue");
3593 BUG_ON(rb_first(&cfqq->sort_list));
3594 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3595 cfqg = cfqq->cfqg;
3596
3597 if (unlikely(cfqd->active_queue == cfqq)) {
3598 __cfq_slice_expired(cfqd, cfqq, 0);
3599 cfq_schedule_dispatch(cfqd);
3600 }
3601
3602 BUG_ON(cfq_cfqq_on_rr(cfqq));
3603 kmem_cache_free(cfq_pool, cfqq);
3604 cfqg_put(cfqg);
3605}
3606
3607static void cfq_put_cooperator(struct cfq_queue *cfqq)
3608{
3609 struct cfq_queue *__cfqq, *next;
3610
3611 /*
3612 * If this queue was scheduled to merge with another queue, be
3613 * sure to drop the reference taken on that queue (and others in
3614 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
3615 */
3616 __cfqq = cfqq->new_cfqq;
3617 while (__cfqq) {
3618 if (__cfqq == cfqq) {
3619 WARN(1, "cfqq->new_cfqq loop detected\n");
3620 break;
3621 }
3622 next = __cfqq->new_cfqq;
3623 cfq_put_queue(__cfqq);
3624 __cfqq = next;
3625 }
3626}
3627
3628static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3629{
3630 if (unlikely(cfqq == cfqd->active_queue)) {
3631 __cfq_slice_expired(cfqd, cfqq, 0);
3632 cfq_schedule_dispatch(cfqd);
3633 }
3634
3635 cfq_put_cooperator(cfqq);
3636
3637 cfq_put_queue(cfqq);
3638}
3639
3640static void cfq_init_icq(struct io_cq *icq)
3641{
3642 struct cfq_io_cq *cic = icq_to_cic(icq);
3643
3644 cic->ttime.last_end_request = ktime_get_ns();
3645}
3646
3647static void cfq_exit_icq(struct io_cq *icq)
3648{
3649 struct cfq_io_cq *cic = icq_to_cic(icq);
3650 struct cfq_data *cfqd = cic_to_cfqd(cic);
3651
3652 if (cic_to_cfqq(cic, false)) {
3653 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3654 cic_set_cfqq(cic, NULL, false);
3655 }
3656
3657 if (cic_to_cfqq(cic, true)) {
3658 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3659 cic_set_cfqq(cic, NULL, true);
3660 }
3661}
3662
3663static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3664{
3665 struct task_struct *tsk = current;
3666 int ioprio_class;
3667
3668 if (!cfq_cfqq_prio_changed(cfqq))
3669 return;
3670
3671 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3672 switch (ioprio_class) {
3673 default:
3674 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3675 /* fall through */
3676 case IOPRIO_CLASS_NONE:
3677 /*
3678 * no prio set, inherit CPU scheduling settings
3679 */
3680 cfqq->ioprio = task_nice_ioprio(tsk);
3681 cfqq->ioprio_class = task_nice_ioclass(tsk);
3682 break;
3683 case IOPRIO_CLASS_RT:
3684 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3685 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3686 break;
3687 case IOPRIO_CLASS_BE:
3688 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3689 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3690 break;
3691 case IOPRIO_CLASS_IDLE:
3692 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3693 cfqq->ioprio = 7;
3694 cfq_clear_cfqq_idle_window(cfqq);
3695 break;
3696 }
3697
3698 /*
3699 * keep track of original prio settings in case we have to temporarily
3700 * elevate the priority of this queue
3701 */
3702 cfqq->org_ioprio = cfqq->ioprio;
3703 cfqq->org_ioprio_class = cfqq->ioprio_class;
3704 cfq_clear_cfqq_prio_changed(cfqq);
3705}
3706
3707static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3708{
3709 int ioprio = cic->icq.ioc->ioprio;
3710 struct cfq_data *cfqd = cic_to_cfqd(cic);
3711 struct cfq_queue *cfqq;
3712
3713 /*
3714 * Check whether ioprio has changed. The condition may trigger
3715 * spuriously on a newly created cic but there's no harm.
3716 */
3717 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3718 return;
3719
3720 cfqq = cic_to_cfqq(cic, false);
3721 if (cfqq) {
3722 cfq_put_queue(cfqq);
3723 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3724 cic_set_cfqq(cic, cfqq, false);
3725 }
3726
3727 cfqq = cic_to_cfqq(cic, true);
3728 if (cfqq)
3729 cfq_mark_cfqq_prio_changed(cfqq);
3730
3731 cic->ioprio = ioprio;
3732}
3733
3734static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3735 pid_t pid, bool is_sync)
3736{
3737 RB_CLEAR_NODE(&cfqq->rb_node);
3738 RB_CLEAR_NODE(&cfqq->p_node);
3739 INIT_LIST_HEAD(&cfqq->fifo);
3740
3741 cfqq->ref = 0;
3742 cfqq->cfqd = cfqd;
3743
3744 cfq_mark_cfqq_prio_changed(cfqq);
3745
3746 if (is_sync) {
3747 if (!cfq_class_idle(cfqq))
3748 cfq_mark_cfqq_idle_window(cfqq);
3749 cfq_mark_cfqq_sync(cfqq);
3750 }
3751 cfqq->pid = pid;
3752}
3753
3754#ifdef CONFIG_CFQ_GROUP_IOSCHED
3755static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3756{
3757 struct cfq_data *cfqd = cic_to_cfqd(cic);
3758 struct cfq_queue *cfqq;
3759 uint64_t serial_nr;
3760
3761 rcu_read_lock();
3762 serial_nr = bio_blkcg(bio)->css.serial_nr;
3763 rcu_read_unlock();
3764
3765 /*
3766 * Check whether blkcg has changed. The condition may trigger
3767 * spuriously on a newly created cic but there's no harm.
3768 */
3769 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3770 return;
3771
3772 /*
3773	 * Drop references to the queues. New queues will be assigned in the
3774	 * new group upon arrival of fresh requests.
3775 */
3776 cfqq = cic_to_cfqq(cic, false);
3777 if (cfqq) {
3778 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3779 cic_set_cfqq(cic, NULL, false);
3780 cfq_put_queue(cfqq);
3781 }
3782
3783 cfqq = cic_to_cfqq(cic, true);
3784 if (cfqq) {
3785 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3786 cic_set_cfqq(cic, NULL, true);
3787 cfq_put_queue(cfqq);
3788 }
3789
3790 cic->blkcg_serial_nr = serial_nr;
3791}
3792#else
3793static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3794{
3795}
3796#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3797
3798static struct cfq_queue **
3799cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3800{
3801 switch (ioprio_class) {
3802 case IOPRIO_CLASS_RT:
3803 return &cfqg->async_cfqq[0][ioprio];
3804 case IOPRIO_CLASS_NONE:
3805 ioprio = IOPRIO_NORM;
3806 /* fall through */
3807 case IOPRIO_CLASS_BE:
3808 return &cfqg->async_cfqq[1][ioprio];
3809 case IOPRIO_CLASS_IDLE:
3810 return &cfqg->async_idle_cfqq;
3811 default:
3812 BUG();
3813 }
3814}
3815
3816static struct cfq_queue *
3817cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3818 struct bio *bio)
3819{
3820 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3821 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3822 struct cfq_queue **async_cfqq = NULL;
3823 struct cfq_queue *cfqq;
3824 struct cfq_group *cfqg;
3825
3826 rcu_read_lock();
3827 cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
3828 if (!cfqg) {
3829 cfqq = &cfqd->oom_cfqq;
3830 goto out;
3831 }
3832
3833 if (!is_sync) {
3834 if (!ioprio_valid(cic->ioprio)) {
3835 struct task_struct *tsk = current;
3836 ioprio = task_nice_ioprio(tsk);
3837 ioprio_class = task_nice_ioclass(tsk);
3838 }
3839 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3840 cfqq = *async_cfqq;
3841 if (cfqq)
3842 goto out;
3843 }
3844
3845 cfqq = kmem_cache_alloc_node(cfq_pool,
3846 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3847 cfqd->queue->node);
3848 if (!cfqq) {
3849 cfqq = &cfqd->oom_cfqq;
3850 goto out;
3851 }
3852
3853 /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
3854 cfqq->ioprio_class = IOPRIO_CLASS_NONE;
3855 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3856 cfq_init_prio_data(cfqq, cic);
3857 cfq_link_cfqq_cfqg(cfqq, cfqg);
3858 cfq_log_cfqq(cfqd, cfqq, "alloced");
3859
3860 if (async_cfqq) {
3861 /* a new async queue is created, pin and remember */
3862 cfqq->ref++;
3863 *async_cfqq = cfqq;
3864 }
3865out:
3866 cfqq->ref++;
3867 rcu_read_unlock();
3868 return cfqq;
3869}
3870
3871static void
3872__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
3873{
3874 u64 elapsed = ktime_get_ns() - ttime->last_end_request;
3875 elapsed = min(elapsed, 2UL * slice_idle);
3876
3877 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3878 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
3879 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3880 ttime->ttime_samples);
3881}
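
/*
 * The update above is an exponentially weighted average with a 7/8 decay
 * and each sample weighted by 256, so ttime_samples converges towards
 * 256. Once it has settled there, a single 12ms gap moves a 4ms
 * ttime_mean to roughly (7 * 4 + 12) / 8 = 5ms.
 */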
3882
3883static void
3884cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3885 struct cfq_io_cq *cic)
3886{
3887 if (cfq_cfqq_sync(cfqq)) {
3888 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3889 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3890 cfqd->cfq_slice_idle);
3891 }
3892#ifdef CONFIG_CFQ_GROUP_IOSCHED
3893 __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3894#endif
3895}
3896
3897static void
3898cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3899 struct request *rq)
3900{
3901 sector_t sdist = 0;
3902 sector_t n_sec = blk_rq_sectors(rq);
3903 if (cfqq->last_request_pos) {
3904 if (cfqq->last_request_pos < blk_rq_pos(rq))
3905 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3906 else
3907 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3908 }
3909
3910 cfqq->seek_history <<= 1;
3911 if (blk_queue_nonrot(cfqd->queue))
3912 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3913 else
3914 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3915}
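
/*
 * seek_history acts as a shift register of recent requests: on rotational
 * storage a bit is set when the seek distance exceeds CFQQ_SEEK_THR, on
 * non-rotational storage when the request is small (below
 * CFQQ_SECT_THR_NONROT). CFQQ_SEEKY() then looks at how many of the
 * recorded bits are set to classify the queue as seeky.
 */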
3916
3917static inline bool req_noidle(struct request *req)
3918{
3919 return req_op(req) == REQ_OP_WRITE &&
3920 (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
3921}
3922
3923/*
3924 * Disable idle window if the process thinks too long or seeks so much that
3925 * it doesn't matter
3926 */
3927static void
3928cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3929 struct cfq_io_cq *cic)
3930{
3931 int old_idle, enable_idle;
3932
3933 /*
3934 * Don't idle for async or idle io prio class
3935 */
3936 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3937 return;
3938
3939 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3940
3941 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3942 cfq_mark_cfqq_deep(cfqq);
3943
3944 if (cfqq->next_rq && req_noidle(cfqq->next_rq))
3945 enable_idle = 0;
3946 else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3947 !cfqd->cfq_slice_idle ||
3948 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3949 enable_idle = 0;
3950 else if (sample_valid(cic->ttime.ttime_samples)) {
3951 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3952 enable_idle = 0;
3953 else
3954 enable_idle = 1;
3955 }
3956
3957 if (old_idle != enable_idle) {
3958 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3959 if (enable_idle)
3960 cfq_mark_cfqq_idle_window(cfqq);
3961 else
3962 cfq_clear_cfqq_idle_window(cfqq);
3963 }
3964}
3965
3966/*
3967 * Check if new_cfqq should preempt the currently active queue. Return false
3968 * if not (or if we aren't sure); returning true will cause a preempt.
3969 */
3970static bool
3971cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3972 struct request *rq)
3973{
3974 struct cfq_queue *cfqq;
3975
3976 cfqq = cfqd->active_queue;
3977 if (!cfqq)
3978 return false;
3979
3980 if (cfq_class_idle(new_cfqq))
3981 return false;
3982
3983 if (cfq_class_idle(cfqq))
3984 return true;
3985
3986 /*
3987 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3988 */
3989 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3990 return false;
3991
3992 /*
3993 * if the new request is sync, but the currently running queue is
3994 * not, let the sync request have priority.
3995 */
3996 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
3997 return true;
3998
3999 /*
4000 * Treat ancestors of current cgroup the same way as current cgroup.
4001 * For anybody else we disallow preemption to guarantee service
4002 * fairness among cgroups.
4003 */
4004 if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
4005 return false;
4006
4007 if (cfq_slice_used(cfqq))
4008 return true;
4009
4010 /*
4011 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
4012 */
4013 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
4014 return true;
4015
4016 WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
4017 /* Allow preemption only if we are idling on sync-noidle tree */
4018 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
4019 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
4020 RB_EMPTY_ROOT(&cfqq->sort_list))
4021 return true;
4022
4023 /*
4024 * So both queues are sync. Let the new request get disk time if
4025 * it's a metadata request and the current queue is doing regular IO.
4026 */
4027 if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
4028 return true;
4029
4030	/* The active queue is empty and we are not going to idle on it */
4031 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
4032 return true;
4033
4034 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
4035 return false;
4036
4037 /*
4038 * if this request is as-good as one we would expect from the
4039 * current cfqq, let it preempt
4040 */
4041 if (cfq_rq_close(cfqd, cfqq, rq))
4042 return true;
4043
4044 return false;
4045}
4046
4047/*
4048 * cfqq preempts the active queue. if we allowed preempt with no slice left,
4049 * let it have half of its nominal slice.
4050 */
4051static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4052{
4053 enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
4054
4055 cfq_log_cfqq(cfqd, cfqq, "preempt");
4056 cfq_slice_expired(cfqd, 1);
4057
4058 /*
4059 * workload type is changed, don't save slice, otherwise preempt
4060 * doesn't happen
4061 */
4062 if (old_type != cfqq_type(cfqq))
4063 cfqq->cfqg->saved_wl_slice = 0;
4064
4065 /*
4066	 * Put the new queue at the front of the current list,
4067 * so we know that it will be selected next.
4068 */
4069 BUG_ON(!cfq_cfqq_on_rr(cfqq));
4070
4071 cfq_service_tree_add(cfqd, cfqq, 1);
4072
4073 cfqq->slice_end = 0;
4074 cfq_mark_cfqq_slice_new(cfqq);
4075}
4076
4077/*
4078 * Called when a new fs request (rq) is added (to cfqq). Check if there's
4079 * something we should do about it
4080 */
4081static void
4082cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
4083 struct request *rq)
4084{
4085 struct cfq_io_cq *cic = RQ_CIC(rq);
4086
4087 cfqd->rq_queued++;
4088 if (rq->cmd_flags & REQ_PRIO)
4089 cfqq->prio_pending++;
4090
4091 cfq_update_io_thinktime(cfqd, cfqq, cic);
4092 cfq_update_io_seektime(cfqd, cfqq, rq);
4093 cfq_update_idle_window(cfqd, cfqq, cic);
4094
4095 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4096
4097 if (cfqq == cfqd->active_queue) {
4098 /*
4099 * Remember that we saw a request from this process, but
4100 * don't start queuing just yet. Otherwise we risk seeing lots
4101 * of tiny requests, because we disrupt the normal plugging
4102 * and merging. If the request is already larger than a single
4103 * page, let it rip immediately. For that case we assume that
4104		 * merging is already done. Ditto for a busy system that
4105		 * has other work pending: don't risk delaying until the
4106		 * idle timer unplugs to continue working.
4107 */
4108 if (cfq_cfqq_wait_request(cfqq)) {
4109 if (blk_rq_bytes(rq) > PAGE_SIZE ||
4110 cfqd->busy_queues > 1) {
4111 cfq_del_timer(cfqd, cfqq);
4112 cfq_clear_cfqq_wait_request(cfqq);
4113 __blk_run_queue(cfqd->queue);
4114 } else {
4115 cfqg_stats_update_idle_time(cfqq->cfqg);
4116 cfq_mark_cfqq_must_dispatch(cfqq);
4117 }
4118 }
4119 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
4120 /*
4121		 * not the active queue - expire the current slice if it is
4122		 * idle and has been idle longer than its mean thinktime, or this
4123		 * new queue has some old slice time left and is of higher
4124		 * priority, or this new queue is RT and the current one is BE
4125 */
4126 cfq_preempt_queue(cfqd, cfqq);
4127 __blk_run_queue(cfqd->queue);
4128 }
4129}
4130
4131static void cfq_insert_request(struct request_queue *q, struct request *rq)
4132{
4133 struct cfq_data *cfqd = q->elevator->elevator_data;
4134 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4135
4136 cfq_log_cfqq(cfqd, cfqq, "insert_request");
4137 cfq_init_prio_data(cfqq, RQ_CIC(rq));
4138
4139 rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
4140 list_add_tail(&rq->queuelist, &cfqq->fifo);
4141 cfq_add_rq_rb(rq);
4142 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
4143 rq->cmd_flags);
4144 cfq_rq_enqueued(cfqd, cfqq, rq);
4145}
4146
4147/*
4148 * Update hw_tag based on peak queue depth over 50 samples under
4149 * sufficient load.
4150 */
4151static void cfq_update_hw_tag(struct cfq_data *cfqd)
4152{
4153 struct cfq_queue *cfqq = cfqd->active_queue;
4154
4155 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
4156 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
4157
4158 if (cfqd->hw_tag == 1)
4159 return;
4160
4161 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
4162 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
4163 return;
4164
4165 /*
4166	 * If the active queue doesn't have enough requests and can idle, cfq
4167	 * might not dispatch sufficient requests to the hardware. Don't zero
4168	 * hw_tag in this case.
4169 */
4170 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
4171 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
4172 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
4173 return;
4174
4175 if (cfqd->hw_tag_samples++ < 50)
4176 return;
4177
4178 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
4179 cfqd->hw_tag = 1;
4180 else
4181 cfqd->hw_tag = 0;
4182}
4183
4184static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4185{
4186 struct cfq_io_cq *cic = cfqd->active_cic;
4187 u64 now = ktime_get_ns();
4188
4189 /* If the queue already has requests, don't wait */
4190 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4191 return false;
4192
4193 /* If there are other queues in the group, don't wait */
4194 if (cfqq->cfqg->nr_cfqq > 1)
4195 return false;
4196
4197 /* the only queue in the group, but think time is big */
4198 if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4199 return false;
4200
4201 if (cfq_slice_used(cfqq))
4202 return true;
4203
4204 /* if slice left is less than think time, wait busy */
4205 if (cic && sample_valid(cic->ttime.ttime_samples)
4206 && (cfqq->slice_end - now < cic->ttime.ttime_mean))
4207 return true;
4208
4209 /*
4210	 * If the think time is less than a jiffy, then ttime_mean=0 and the
4211	 * check above will not be true. It might happen that the slice has not
4212	 * expired yet but will expire soon (4-5 ns) during select_queue(). To
4213	 * cover the case where the think time is less than a jiffy, mark the
4214	 * queue wait busy if only 1 jiffy is left in the slice.
4215 */
4216 if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
4217 return true;
4218
4219 return false;
4220}
4221
4222static void cfq_completed_request(struct request_queue *q, struct request *rq)
4223{
4224 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4225 struct cfq_data *cfqd = cfqq->cfqd;
4226 const int sync = rq_is_sync(rq);
4227 u64 now = ktime_get_ns();
4228
4229 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
4230
4231 cfq_update_hw_tag(cfqd);
4232
4233 WARN_ON(!cfqd->rq_in_driver);
4234 WARN_ON(!cfqq->dispatched);
4235 cfqd->rq_in_driver--;
4236 cfqq->dispatched--;
4237 (RQ_CFQG(rq))->dispatched--;
4238 cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
4239 rq->io_start_time_ns, rq->cmd_flags);
4240
4241 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
4242
4243 if (sync) {
4244 struct cfq_rb_root *st;
4245
4246 RQ_CIC(rq)->ttime.last_end_request = now;
4247
4248 if (cfq_cfqq_on_rr(cfqq))
4249 st = cfqq->service_tree;
4250 else
4251 st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4252 cfqq_type(cfqq));
4253
4254 st->ttime.last_end_request = now;
4255 if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
4256 cfqd->last_delayed_sync = now;
4257 }
4258
4259#ifdef CONFIG_CFQ_GROUP_IOSCHED
4260 cfqq->cfqg->ttime.last_end_request = now;
4261#endif
4262
4263 /*
4264 * If this is the active queue, check if it needs to be expired,
4265 * or if we want to idle in case it has no pending requests.
4266 */
4267 if (cfqd->active_queue == cfqq) {
4268 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4269
4270 if (cfq_cfqq_slice_new(cfqq)) {
4271 cfq_set_prio_slice(cfqd, cfqq);
4272 cfq_clear_cfqq_slice_new(cfqq);
4273 }
4274
4275 /*
4276		 * Should we wait for the next request to come in before we
4277		 * expire the queue?
4278 */
4279 if (cfq_should_wait_busy(cfqd, cfqq)) {
4280 u64 extend_sl = cfqd->cfq_slice_idle;
4281 if (!cfqd->cfq_slice_idle)
4282 extend_sl = cfqd->cfq_group_idle;
4283 cfqq->slice_end = now + extend_sl;
4284 cfq_mark_cfqq_wait_busy(cfqq);
4285 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
4286 }
4287
4288 /*
4289 * Idling is not enabled on:
4290 * - expired queues
4291 * - idle-priority queues
4292 * - async queues
4293 * - queues with still some requests queued
4294 * - when there is a close cooperator
4295 */
4296 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
4297 cfq_slice_expired(cfqd, 1);
4298 else if (sync && cfqq_empty &&
4299 !cfq_close_cooperator(cfqd, cfqq)) {
4300 cfq_arm_slice_timer(cfqd);
4301 }
4302 }
4303
4304 if (!cfqd->rq_in_driver)
4305 cfq_schedule_dispatch(cfqd);
4306}
4307
4308static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
4309{
4310 /*
4311	 * If REQ_PRIO is set, boost the class and prio level if they are below
4312	 * BE/NORM. If REQ_PRIO is not set, restore the potentially boosted
4313	 * class/prio level.
4314 */
4315 if (!(op & REQ_PRIO)) {
4316 cfqq->ioprio_class = cfqq->org_ioprio_class;
4317 cfqq->ioprio = cfqq->org_ioprio;
4318 } else {
4319 if (cfq_class_idle(cfqq))
4320 cfqq->ioprio_class = IOPRIO_CLASS_BE;
4321 if (cfqq->ioprio > IOPRIO_NORM)
4322 cfqq->ioprio = IOPRIO_NORM;
4323 }
4324}
4325
4326static inline int __cfq_may_queue(struct cfq_queue *cfqq)
4327{
4328 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
4329 cfq_mark_cfqq_must_alloc_slice(cfqq);
4330 return ELV_MQUEUE_MUST;
4331 }
4332
4333 return ELV_MQUEUE_MAY;
4334}
4335
4336static int cfq_may_queue(struct request_queue *q, unsigned int op)
4337{
4338 struct cfq_data *cfqd = q->elevator->elevator_data;
4339 struct task_struct *tsk = current;
4340 struct cfq_io_cq *cic;
4341 struct cfq_queue *cfqq;
4342
4343 /*
4344	 * Don't force setup of a queue from here, as a call to may_queue
4345	 * does not necessarily imply that a request actually will be queued.
4346	 * So just look up a possibly existing queue, or return 'may queue'
4347	 * if that fails.
4348 */
4349 cic = cfq_cic_lookup(cfqd, tsk->io_context);
4350 if (!cic)
4351 return ELV_MQUEUE_MAY;
4352
4353 cfqq = cic_to_cfqq(cic, op_is_sync(op));
4354 if (cfqq) {
4355 cfq_init_prio_data(cfqq, cic);
4356 cfqq_boost_on_prio(cfqq, op);
4357
4358 return __cfq_may_queue(cfqq);
4359 }
4360
4361 return ELV_MQUEUE_MAY;
4362}
4363
4364/*
4365 * queue lock held here
4366 */
4367static void cfq_put_request(struct request *rq)
4368{
4369 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4370
4371 if (cfqq) {
4372 const int rw = rq_data_dir(rq);
4373
4374 BUG_ON(!cfqq->allocated[rw]);
4375 cfqq->allocated[rw]--;
4376
4377 /* Put down rq reference on cfqg */
4378 cfqg_put(RQ_CFQG(rq));
4379 rq->elv.priv[0] = NULL;
4380 rq->elv.priv[1] = NULL;
4381
4382 cfq_put_queue(cfqq);
4383 }
4384}
4385
4386static struct cfq_queue *
4387cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4388 struct cfq_queue *cfqq)
4389{
4390 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4391 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4392 cfq_mark_cfqq_coop(cfqq->new_cfqq);
4393 cfq_put_queue(cfqq);
4394 return cic_to_cfqq(cic, 1);
4395}
4396
4397/*
4398 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4399 * was the last process referring to said cfqq.
4400 */
4401static struct cfq_queue *
4402split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4403{
4404 if (cfqq_process_refs(cfqq) == 1) {
4405 cfqq->pid = current->pid;
4406 cfq_clear_cfqq_coop(cfqq);
4407 cfq_clear_cfqq_split_coop(cfqq);
4408 return cfqq;
4409 }
4410
4411 cic_set_cfqq(cic, NULL, 1);
4412
4413 cfq_put_cooperator(cfqq);
4414
4415 cfq_put_queue(cfqq);
4416 return NULL;
4417}
4418/*
4419 * Allocate cfq data structures associated with this request.
4420 */
4421static int
4422cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4423 gfp_t gfp_mask)
4424{
4425 struct cfq_data *cfqd = q->elevator->elevator_data;
4426 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4427 const int rw = rq_data_dir(rq);
4428 const bool is_sync = rq_is_sync(rq);
4429 struct cfq_queue *cfqq;
4430
4431 spin_lock_irq(q->queue_lock);
4432
4433 check_ioprio_changed(cic, bio);
4434 check_blkcg_changed(cic, bio);
4435new_queue:
4436 cfqq = cic_to_cfqq(cic, is_sync);
4437 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4438 if (cfqq)
4439 cfq_put_queue(cfqq);
4440 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
4441 cic_set_cfqq(cic, cfqq, is_sync);
4442 } else {
4443 /*
4444 * If the queue was seeky for too long, break it apart.
4445 */
4446 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4447 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4448 cfqq = split_cfqq(cic, cfqq);
4449 if (!cfqq)
4450 goto new_queue;
4451 }
4452
4453 /*
4454 * Check to see if this queue is scheduled to merge with
4455 * another, closely cooperating queue. The merging of
4456 * queues happens here as it must be done in process context.
4457 * The reference on new_cfqq was taken in merge_cfqqs.
4458 */
4459 if (cfqq->new_cfqq)
4460 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4461 }
4462
4463 cfqq->allocated[rw]++;
4464
4465 cfqq->ref++;
4466 cfqg_get(cfqq->cfqg);
4467 rq->elv.priv[0] = cfqq;
4468 rq->elv.priv[1] = cfqq->cfqg;
4469 spin_unlock_irq(q->queue_lock);
4470
4471 return 0;
4472}
4473
4474static void cfq_kick_queue(struct work_struct *work)
4475{
4476 struct cfq_data *cfqd =
4477 container_of(work, struct cfq_data, unplug_work);
4478 struct request_queue *q = cfqd->queue;
4479
4480 spin_lock_irq(q->queue_lock);
4481 __blk_run_queue(cfqd->queue);
4482 spin_unlock_irq(q->queue_lock);
4483}
4484
4485/*
4486 * Timer running if the active_queue is currently idling inside its time slice
4487 */
4488static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
4489{
4490 struct cfq_data *cfqd = container_of(timer, struct cfq_data,
4491 idle_slice_timer);
4492 struct cfq_queue *cfqq;
4493 unsigned long flags;
4494 int timed_out = 1;
4495
4496 cfq_log(cfqd, "idle timer fired");
4497
4498 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4499
4500 cfqq = cfqd->active_queue;
4501 if (cfqq) {
4502 timed_out = 0;
4503
4504 /*
4505 * We saw a request before the queue expired, let it through
4506 */
4507 if (cfq_cfqq_must_dispatch(cfqq))
4508 goto out_kick;
4509
4510 /*
4511 * expired
4512 */
4513 if (cfq_slice_used(cfqq))
4514 goto expire;
4515
4516 /*
4517		 * only expire and reinvoke the request handler if there are
4518		 * other queues with pending requests
4519 */
4520 if (!cfqd->busy_queues)
4521 goto out_cont;
4522
4523 /*
4524 * not expired and it has a request pending, let it dispatch
4525 */
4526 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4527 goto out_kick;
4528
4529 /*
4530		 * The queue depth flag is reset only when idling didn't succeed
4531 */
4532 cfq_clear_cfqq_deep(cfqq);
4533 }
4534expire:
4535 cfq_slice_expired(cfqd, timed_out);
4536out_kick:
4537 cfq_schedule_dispatch(cfqd);
4538out_cont:
4539 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4540 return HRTIMER_NORESTART;
4541}
4542
4543static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4544{
4545 hrtimer_cancel(&cfqd->idle_slice_timer);
4546 cancel_work_sync(&cfqd->unplug_work);
4547}
4548
4549static void cfq_exit_queue(struct elevator_queue *e)
4550{
4551 struct cfq_data *cfqd = e->elevator_data;
4552 struct request_queue *q = cfqd->queue;
4553
4554 cfq_shutdown_timer_wq(cfqd);
4555
4556 spin_lock_irq(q->queue_lock);
4557
4558 if (cfqd->active_queue)
4559 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4560
4561 spin_unlock_irq(q->queue_lock);
4562
4563 cfq_shutdown_timer_wq(cfqd);
4564
4565#ifdef CONFIG_CFQ_GROUP_IOSCHED
4566 blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4567#else
4568 kfree(cfqd->root_group);
4569#endif
4570 kfree(cfqd);
4571}
4572
4573static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4574{
4575 struct cfq_data *cfqd;
4576 struct blkcg_gq *blkg __maybe_unused;
4577 int i, ret;
4578 struct elevator_queue *eq;
4579
4580 eq = elevator_alloc(q, e);
4581 if (!eq)
4582 return -ENOMEM;
4583
4584 cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
4585 if (!cfqd) {
4586 kobject_put(&eq->kobj);
4587 return -ENOMEM;
4588 }
4589 eq->elevator_data = cfqd;
4590
4591 cfqd->queue = q;
4592 spin_lock_irq(q->queue_lock);
4593 q->elevator = eq;
4594 spin_unlock_irq(q->queue_lock);
4595
4596 /* Init root service tree */
4597 cfqd->grp_service_tree = CFQ_RB_ROOT;
4598
4599 /* Init root group and prefer root group over other groups by default */
4600#ifdef CONFIG_CFQ_GROUP_IOSCHED
4601 ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4602 if (ret)
4603 goto out_free;
4604
4605 cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4606#else
4607 ret = -ENOMEM;
4608 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4609 GFP_KERNEL, cfqd->queue->node);
4610 if (!cfqd->root_group)
4611 goto out_free;
4612
4613 cfq_init_cfqg_base(cfqd->root_group);
4614 cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4615 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4616#endif
4617
4618 /*
4619 * Not strictly needed (since RB_ROOT just clears the node and we
4620 * zeroed cfqd on alloc), but better be safe in case someone decides
4621 * to add magic to the rb code
4622 */
4623 for (i = 0; i < CFQ_PRIO_LISTS; i++)
4624 cfqd->prio_trees[i] = RB_ROOT;
4625
4626 /*
4627 * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
4628 * Grab a permanent reference to it, so that the normal code flow
4629 * will not attempt to free it. oom_cfqq is linked to root_group
4630 * but shouldn't hold a reference as it'll never be unlinked. Lose
4631 * the reference from linking right away.
4632 */
4633 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4634 cfqd->oom_cfqq.ref++;
4635
4636 spin_lock_irq(q->queue_lock);
4637 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4638 cfqg_put(cfqd->root_group);
4639 spin_unlock_irq(q->queue_lock);
4640
4641 hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
4642 HRTIMER_MODE_REL);
4643 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4644
4645 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4646
4647 cfqd->cfq_quantum = cfq_quantum;
4648 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4649 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4650 cfqd->cfq_back_max = cfq_back_max;
4651 cfqd->cfq_back_penalty = cfq_back_penalty;
4652 cfqd->cfq_slice[0] = cfq_slice_async;
4653 cfqd->cfq_slice[1] = cfq_slice_sync;
4654 cfqd->cfq_target_latency = cfq_target_latency;
4655 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4656 cfqd->cfq_slice_idle = cfq_slice_idle;
4657 cfqd->cfq_group_idle = cfq_group_idle;
4658 cfqd->cfq_latency = 1;
4659 cfqd->hw_tag = -1;
4660 /*
4661	 * We optimistically start assuming sync ops weren't delayed in the
4662	 * last second, in order to have a larger depth for async operations.
4663 */
4664 cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
4665 return 0;
4666
4667out_free:
4668 kfree(cfqd);
4669 kobject_put(&eq->kobj);
4670 return ret;
4671}
4672
4673static void cfq_registered_queue(struct request_queue *q)
4674{
4675 struct elevator_queue *e = q->elevator;
4676 struct cfq_data *cfqd = e->elevator_data;
4677
4678 /*
4679 * Default to IOPS mode with no idling for SSDs
4680 */
4681 if (blk_queue_nonrot(q))
4682 cfqd->cfq_slice_idle = 0;
4683 wbt_disable_default(q);
4684}
4685
4686/*
4687 * sysfs parts below -->
4688 */
4689static ssize_t
4690cfq_var_show(unsigned int var, char *page)
4691{
4692 return sprintf(page, "%u\n", var);
4693}
4694
4695static void
4696cfq_var_store(unsigned int *var, const char *page)
4697{
4698 char *p = (char *) page;
4699
4700 *var = simple_strtoul(p, &p, 10);
4701}
4702
4703#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4704static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4705{ \
4706 struct cfq_data *cfqd = e->elevator_data; \
4707 u64 __data = __VAR; \
4708 if (__CONV) \
4709 __data = div_u64(__data, NSEC_PER_MSEC); \
4710 return cfq_var_show(__data, (page)); \
4711}
4712SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4713SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4714SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4715SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4716SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4717SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4718SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4719SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4720SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4721SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4722SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4723SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4724#undef SHOW_FUNCTION
4725
4726#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
4727static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4728{ \
4729 struct cfq_data *cfqd = e->elevator_data; \
4730 u64 __data = __VAR; \
4731 __data = div_u64(__data, NSEC_PER_USEC); \
4732 return cfq_var_show(__data, (page)); \
4733}
4734USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
4735USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
4736USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
4737USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
4738USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4739#undef USEC_SHOW_FUNCTION
4740
4741#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4742static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4743{ \
4744 struct cfq_data *cfqd = e->elevator_data; \
4745 unsigned int __data, __min = (MIN), __max = (MAX); \
4746 \
4747 cfq_var_store(&__data, (page)); \
4748 if (__data < __min) \
4749 __data = __min; \
4750 else if (__data > __max) \
4751 __data = __max; \
4752 if (__CONV) \
4753 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
4754 else \
4755 *(__PTR) = __data; \
4756 return count; \
4757}
4758STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4759STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4760 UINT_MAX, 1);
4761STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4762 UINT_MAX, 1);
4763STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4764STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4765 UINT_MAX, 0);
4766STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4767STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4768STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4769STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4770STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4771 UINT_MAX, 0);
4772STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4773STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4774#undef STORE_FUNCTION
4775
4776#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
4777static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4778{ \
4779 struct cfq_data *cfqd = e->elevator_data; \
4780 unsigned int __data, __min = (MIN), __max = (MAX); \
4781 \
4782 cfq_var_store(&__data, (page)); \
4783 if (__data < __min) \
4784 __data = __min; \
4785 else if (__data > __max) \
4786 __data = __max; \
4787 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
4788 return count; \
4789}
4790USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
4791USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
4792USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
4793USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
4794USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
4795#undef USEC_STORE_FUNCTION
4796
4797#define CFQ_ATTR(name) \
4798 __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
4799
4800static struct elv_fs_entry cfq_attrs[] = {
4801 CFQ_ATTR(quantum),
4802 CFQ_ATTR(fifo_expire_sync),
4803 CFQ_ATTR(fifo_expire_async),
4804 CFQ_ATTR(back_seek_max),
4805 CFQ_ATTR(back_seek_penalty),
4806 CFQ_ATTR(slice_sync),
4807 CFQ_ATTR(slice_sync_us),
4808 CFQ_ATTR(slice_async),
4809 CFQ_ATTR(slice_async_us),
4810 CFQ_ATTR(slice_async_rq),
4811 CFQ_ATTR(slice_idle),
4812 CFQ_ATTR(slice_idle_us),
4813 CFQ_ATTR(group_idle),
4814 CFQ_ATTR(group_idle_us),
4815 CFQ_ATTR(low_latency),
4816 CFQ_ATTR(target_latency),
4817 CFQ_ATTR(target_latency_us),
4818 __ATTR_NULL
4819};
4820
4821static struct elevator_type iosched_cfq = {
4822 .ops.sq = {
4823 .elevator_merge_fn = cfq_merge,
4824 .elevator_merged_fn = cfq_merged_request,
4825 .elevator_merge_req_fn = cfq_merged_requests,
4826 .elevator_allow_bio_merge_fn = cfq_allow_bio_merge,
4827 .elevator_allow_rq_merge_fn = cfq_allow_rq_merge,
4828 .elevator_bio_merged_fn = cfq_bio_merged,
4829 .elevator_dispatch_fn = cfq_dispatch_requests,
4830 .elevator_add_req_fn = cfq_insert_request,
4831 .elevator_activate_req_fn = cfq_activate_request,
4832 .elevator_deactivate_req_fn = cfq_deactivate_request,
4833 .elevator_completed_req_fn = cfq_completed_request,
4834 .elevator_former_req_fn = elv_rb_former_request,
4835 .elevator_latter_req_fn = elv_rb_latter_request,
4836 .elevator_init_icq_fn = cfq_init_icq,
4837 .elevator_exit_icq_fn = cfq_exit_icq,
4838 .elevator_set_req_fn = cfq_set_request,
4839 .elevator_put_req_fn = cfq_put_request,
4840 .elevator_may_queue_fn = cfq_may_queue,
4841 .elevator_init_fn = cfq_init_queue,
4842 .elevator_exit_fn = cfq_exit_queue,
4843 .elevator_registered_fn = cfq_registered_queue,
4844 },
4845 .icq_size = sizeof(struct cfq_io_cq),
4846 .icq_align = __alignof__(struct cfq_io_cq),
4847 .elevator_attrs = cfq_attrs,
4848 .elevator_name = "cfq",
4849 .elevator_owner = THIS_MODULE,
4850};
4851
4852#ifdef CONFIG_CFQ_GROUP_IOSCHED
4853static struct blkcg_policy blkcg_policy_cfq = {
4854 .dfl_cftypes = cfq_blkcg_files,
4855 .legacy_cftypes = cfq_blkcg_legacy_files,
4856
4857 .cpd_alloc_fn = cfq_cpd_alloc,
4858 .cpd_init_fn = cfq_cpd_init,
4859 .cpd_free_fn = cfq_cpd_free,
4860 .cpd_bind_fn = cfq_cpd_bind,
4861
4862 .pd_alloc_fn = cfq_pd_alloc,
4863 .pd_init_fn = cfq_pd_init,
4864 .pd_offline_fn = cfq_pd_offline,
4865 .pd_free_fn = cfq_pd_free,
4866 .pd_reset_stats_fn = cfq_pd_reset_stats,
4867};
4868#endif
4869
4870static int __init cfq_init(void)
4871{
4872 int ret;
4873
4874#ifdef CONFIG_CFQ_GROUP_IOSCHED
4875 ret = blkcg_policy_register(&blkcg_policy_cfq);
4876 if (ret)
4877 return ret;
4878#else
4879 cfq_group_idle = 0;
4880#endif
4881
4882 ret = -ENOMEM;
4883 cfq_pool = KMEM_CACHE(cfq_queue, 0);
4884 if (!cfq_pool)
4885 goto err_pol_unreg;
4886
4887 ret = elv_register(&iosched_cfq);
4888 if (ret)
4889 goto err_free_pool;
4890
4891 return 0;
4892
4893err_free_pool:
4894 kmem_cache_destroy(cfq_pool);
4895err_pol_unreg:
4896#ifdef CONFIG_CFQ_GROUP_IOSCHED
4897 blkcg_policy_unregister(&blkcg_policy_cfq);
4898#endif
4899 return ret;
4900}
4901
4902static void __exit cfq_exit(void)
4903{
4904#ifdef CONFIG_CFQ_GROUP_IOSCHED
4905 blkcg_policy_unregister(&blkcg_policy_cfq);
4906#endif
4907 elv_unregister(&iosched_cfq);
4908 kmem_cache_destroy(cfq_pool);
4909}
4910
4911module_init(cfq_init);
4912module_exit(cfq_exit);
4913
4914MODULE_AUTHOR("Jens Axboe");
4915MODULE_LICENSE("GPL");
4916MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
deleted file mode 100644
index ef2f1f09e9b3..000000000000
--- a/block/deadline-iosched.c
+++ /dev/null
@@ -1,560 +0,0 @@
1/*
2 * Deadline i/o scheduler.
3 *
4 * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
5 */
6#include <linux/kernel.h>
7#include <linux/fs.h>
8#include <linux/blkdev.h>
9#include <linux/elevator.h>
10#include <linux/bio.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/init.h>
14#include <linux/compiler.h>
15#include <linux/rbtree.h>
16
17/*
18 * See Documentation/block/deadline-iosched.txt
19 */
20static const int read_expire = HZ / 2; /* max time before a read is submitted. */
21static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
22static const int writes_starved = 2; /* max times reads can starve a write */
23static const int fifo_batch = 16; /* # of sequential requests treated as one
24 by the above parameters. For throughput. */
25
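Each request is stamped with jiffies + fifo_expire[dir] when it is queued (see deadline_add_request() below), and deadline_check_fifo() later compares that stamp against the clock. A minimal userspace sketch of that arithmetic, with HZ and the current jiffies value assumed purely for illustration:

#include <stdio.h>
#include <stdbool.h>

#define HZ 250	/* assumed tick rate */

/* Wrap-safe comparison in the spirit of the kernel's time_after_eq(). */
static bool time_after_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;
}

int main(void)
{
	unsigned long jiffies = 100000;			/* pretend current time */
	unsigned long read_expire = HZ / 2;		/* 500 ms, as above */
	unsigned long fifo_time = jiffies + read_expire;/* stamped at enqueue */

	jiffies += HZ;					/* one second elapses */
	printf("read request expired: %s\n",
	       time_after_eq(jiffies, fifo_time) ? "yes" : "no");
	return 0;
}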
26struct deadline_data {
27 /*
28 * run time data
29 */
30
31 /*
32	 * requests (deadline_rqs) are present on both the sort_list and fifo_list
33 */
34 struct rb_root sort_list[2];
35 struct list_head fifo_list[2];
36
37 /*
38	 * next in sort order; read, write, or both may be NULL
39 */
40 struct request *next_rq[2];
41 unsigned int batching; /* number of sequential requests made */
42 unsigned int starved; /* times reads have starved writes */
43
44 /*
45 * settings that change how the i/o scheduler behaves
46 */
47 int fifo_expire[2];
48 int fifo_batch;
49 int writes_starved;
50 int front_merges;
51};
52
53static inline struct rb_root *
54deadline_rb_root(struct deadline_data *dd, struct request *rq)
55{
56 return &dd->sort_list[rq_data_dir(rq)];
57}
58
59/*
60 * get the request after `rq' in sector-sorted order
61 */
62static inline struct request *
63deadline_latter_request(struct request *rq)
64{
65 struct rb_node *node = rb_next(&rq->rb_node);
66
67 if (node)
68 return rb_entry_rq(node);
69
70 return NULL;
71}
72
73static void
74deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
75{
76 struct rb_root *root = deadline_rb_root(dd, rq);
77
78 elv_rb_add(root, rq);
79}
80
81static inline void
82deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
83{
84 const int data_dir = rq_data_dir(rq);
85
86 if (dd->next_rq[data_dir] == rq)
87 dd->next_rq[data_dir] = deadline_latter_request(rq);
88
89 elv_rb_del(deadline_rb_root(dd, rq), rq);
90}
91
92/*
93 * add rq to rbtree and fifo
94 */
95static void
96deadline_add_request(struct request_queue *q, struct request *rq)
97{
98 struct deadline_data *dd = q->elevator->elevator_data;
99 const int data_dir = rq_data_dir(rq);
100
101 /*
102 * This may be a requeue of a write request that has locked its
103	 * target zone. If that is the case, this releases the zone lock.
104 */
105 blk_req_zone_write_unlock(rq);
106
107 deadline_add_rq_rb(dd, rq);
108
109 /*
110 * set expire time and add to fifo list
111 */
112 rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
113 list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
114}
115
116/*
117 * remove rq from rbtree and fifo.
118 */
119static void deadline_remove_request(struct request_queue *q, struct request *rq)
120{
121 struct deadline_data *dd = q->elevator->elevator_data;
122
123 rq_fifo_clear(rq);
124 deadline_del_rq_rb(dd, rq);
125}
126
127static enum elv_merge
128deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
129{
130 struct deadline_data *dd = q->elevator->elevator_data;
131 struct request *__rq;
132
133 /*
134 * check for front merge
135 */
136 if (dd->front_merges) {
137 sector_t sector = bio_end_sector(bio);
138
139 __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
140 if (__rq) {
141 BUG_ON(sector != blk_rq_pos(__rq));
142
143 if (elv_bio_merge_ok(__rq, bio)) {
144 *req = __rq;
145 return ELEVATOR_FRONT_MERGE;
146 }
147 }
148 }
149
150 return ELEVATOR_NO_MERGE;
151}
152
153static void deadline_merged_request(struct request_queue *q,
154 struct request *req, enum elv_merge type)
155{
156 struct deadline_data *dd = q->elevator->elevator_data;
157
158 /*
159 * if the merge was a front merge, we need to reposition request
160 */
161 if (type == ELEVATOR_FRONT_MERGE) {
162 elv_rb_del(deadline_rb_root(dd, req), req);
163 deadline_add_rq_rb(dd, req);
164 }
165}
166
167static void
168deadline_merged_requests(struct request_queue *q, struct request *req,
169 struct request *next)
170{
171 /*
172 * if next expires before rq, assign its expire time to rq
173	 * and move rq into next's position (next will be deleted) in the fifo
174 */
175 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
176 if (time_before((unsigned long)next->fifo_time,
177 (unsigned long)req->fifo_time)) {
178 list_move(&req->queuelist, &next->queuelist);
179 req->fifo_time = next->fifo_time;
180 }
181 }
182
183 /*
184 * kill knowledge of next, this one is a goner
185 */
186 deadline_remove_request(q, next);
187}
188
189/*
190 * move request from sort list to dispatch queue.
191 */
192static inline void
193deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
194{
195 struct request_queue *q = rq->q;
196
197 /*
198 * For a zoned block device, write requests must write lock their
199 * target zone.
200 */
201 blk_req_zone_write_lock(rq);
202
203 deadline_remove_request(q, rq);
204 elv_dispatch_add_tail(q, rq);
205}
206
207/*
208 * move an entry to dispatch queue
209 */
210static void
211deadline_move_request(struct deadline_data *dd, struct request *rq)
212{
213 const int data_dir = rq_data_dir(rq);
214
215 dd->next_rq[READ] = NULL;
216 dd->next_rq[WRITE] = NULL;
217 dd->next_rq[data_dir] = deadline_latter_request(rq);
218
219 /*
220 * take it off the sort and fifo list, move
221 * to dispatch queue
222 */
223 deadline_move_to_dispatch(dd, rq);
224}
225
226/*
227 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
228 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
229 */
230static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
231{
232 struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
233
234 /*
235 * rq is expired!
236 */
237 if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
238 return 1;
239
240 return 0;
241}
242
243/*
244 * For the specified data direction, return the next request to dispatch using
245 * arrival ordered lists.
246 */
247static struct request *
248deadline_fifo_request(struct deadline_data *dd, int data_dir)
249{
250 struct request *rq;
251
252 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
253 return NULL;
254
255 if (list_empty(&dd->fifo_list[data_dir]))
256 return NULL;
257
258 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
259 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
260 return rq;
261
262 /*
263 * Look for a write request that can be dispatched, that is one with
264 * an unlocked target zone.
265 */
266 list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
267 if (blk_req_can_dispatch_to_zone(rq))
268 return rq;
269 }
270
271 return NULL;
272}
273
274/*
275 * For the specified data direction, return the next request to dispatch using
276 * sector position sorted lists.
277 */
278static struct request *
279deadline_next_request(struct deadline_data *dd, int data_dir)
280{
281 struct request *rq;
282
283 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
284 return NULL;
285
286 rq = dd->next_rq[data_dir];
287 if (!rq)
288 return NULL;
289
290 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
291 return rq;
292
293 /*
294 * Look for a write request that can be dispatched, that is one with
295 * an unlocked target zone.
296 */
297 while (rq) {
298 if (blk_req_can_dispatch_to_zone(rq))
299 return rq;
300 rq = deadline_latter_request(rq);
301 }
302
303 return NULL;
304}
305
306/*
307 * deadline_dispatch_requests selects the best request according to
308 * read/write expire, fifo_batch, etc
309 */
310static int deadline_dispatch_requests(struct request_queue *q, int force)
311{
312 struct deadline_data *dd = q->elevator->elevator_data;
313 const int reads = !list_empty(&dd->fifo_list[READ]);
314 const int writes = !list_empty(&dd->fifo_list[WRITE]);
315 struct request *rq, *next_rq;
316 int data_dir;
317
318 /*
319 * batches are currently reads XOR writes
320 */
321 rq = deadline_next_request(dd, WRITE);
322 if (!rq)
323 rq = deadline_next_request(dd, READ);
324
325 if (rq && dd->batching < dd->fifo_batch)
326		/* we have a next request and are still entitled to batch */
327 goto dispatch_request;
328
329 /*
330 * at this point we are not running a batch. select the appropriate
331 * data direction (read / write)
332 */
333
334 if (reads) {
335 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
336
337 if (deadline_fifo_request(dd, WRITE) &&
338 (dd->starved++ >= dd->writes_starved))
339 goto dispatch_writes;
340
341 data_dir = READ;
342
343 goto dispatch_find_request;
344 }
345
346 /*
347	 * either there are no reads, or writes have been starved
348 */
349
350 if (writes) {
351dispatch_writes:
352 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
353
354 dd->starved = 0;
355
356 data_dir = WRITE;
357
358 goto dispatch_find_request;
359 }
360
361 return 0;
362
363dispatch_find_request:
364 /*
365 * we are not running a batch, find best request for selected data_dir
366 */
367 next_rq = deadline_next_request(dd, data_dir);
368 if (deadline_check_fifo(dd, data_dir) || !next_rq) {
369 /*
370 * A deadline has expired, the last request was in the other
371 * direction, or we have run out of higher-sectored requests.
372 * Start again from the request with the earliest expiry time.
373 */
374 rq = deadline_fifo_request(dd, data_dir);
375 } else {
376 /*
377 * The last req was the same dir and we have a next request in
378 * sort order. No expired requests so continue on from here.
379 */
380 rq = next_rq;
381 }
382
383 /*
384 * For a zoned block device, if we only have writes queued and none of
385 * them can be dispatched, rq will be NULL.
386 */
387 if (!rq)
388 return 0;
389
390 dd->batching = 0;
391
392dispatch_request:
393 /*
394 * rq is the selected appropriate request.
395 */
396 dd->batching++;
397 deadline_move_request(dd, rq);
398
399 return 1;
400}
401
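The dispatch logic above prefers reads, but every time a read batch is chosen while writes are waiting the starved counter ticks up, and once it reaches writes_starved the next batch goes to writes. A deliberately simplified sketch of that arbitration (the pending flags and pass count are made up; the real code also honours fifo_batch and FIFO expiry):

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	const int writes_starved = 2;	/* same default as above */
	int starved = 0;

	for (int pass = 1; pass <= 5; pass++) {
		bool reads_pending = true;	/* pretend reads never run out */
		bool writes_pending = true;
		const char *dir;

		if (reads_pending && (!writes_pending || starved < writes_starved)) {
			dir = "READ";
			if (writes_pending)
				starved++;	/* a waiting write was passed over */
		} else {
			dir = "WRITE";
			starved = 0;
		}
		printf("batch %d goes to %s (starved=%d)\n", pass, dir, starved);
	}
	return 0;
}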
402/*
403 * For zoned block devices, write unlock the target zone of completed
404 * write requests.
405 */
406static void
407deadline_completed_request(struct request_queue *q, struct request *rq)
408{
409 blk_req_zone_write_unlock(rq);
410}
411
412static void deadline_exit_queue(struct elevator_queue *e)
413{
414 struct deadline_data *dd = e->elevator_data;
415
416 BUG_ON(!list_empty(&dd->fifo_list[READ]));
417 BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
418
419 kfree(dd);
420}
421
422/*
423 * initialize elevator private data (deadline_data).
424 */
425static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
426{
427 struct deadline_data *dd;
428 struct elevator_queue *eq;
429
430 eq = elevator_alloc(q, e);
431 if (!eq)
432 return -ENOMEM;
433
434 dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
435 if (!dd) {
436 kobject_put(&eq->kobj);
437 return -ENOMEM;
438 }
439 eq->elevator_data = dd;
440
441 INIT_LIST_HEAD(&dd->fifo_list[READ]);
442 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
443 dd->sort_list[READ] = RB_ROOT;
444 dd->sort_list[WRITE] = RB_ROOT;
445 dd->fifo_expire[READ] = read_expire;
446 dd->fifo_expire[WRITE] = write_expire;
447 dd->writes_starved = writes_starved;
448 dd->front_merges = 1;
449 dd->fifo_batch = fifo_batch;
450
451 spin_lock_irq(q->queue_lock);
452 q->elevator = eq;
453 spin_unlock_irq(q->queue_lock);
454 return 0;
455}
456
457/*
458 * sysfs parts below
459 */
460
461static ssize_t
462deadline_var_show(int var, char *page)
463{
464 return sprintf(page, "%d\n", var);
465}
466
467static void
468deadline_var_store(int *var, const char *page)
469{
470 char *p = (char *) page;
471
472 *var = simple_strtol(p, &p, 10);
473}
474
475#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
476static ssize_t __FUNC(struct elevator_queue *e, char *page) \
477{ \
478 struct deadline_data *dd = e->elevator_data; \
479 int __data = __VAR; \
480 if (__CONV) \
481 __data = jiffies_to_msecs(__data); \
482 return deadline_var_show(__data, (page)); \
483}
484SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
485SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
486SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
487SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
488SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
489#undef SHOW_FUNCTION
490
491#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
492static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
493{ \
494 struct deadline_data *dd = e->elevator_data; \
495 int __data; \
496 deadline_var_store(&__data, (page)); \
497 if (__data < (MIN)) \
498 __data = (MIN); \
499 else if (__data > (MAX)) \
500 __data = (MAX); \
501 if (__CONV) \
502 *(__PTR) = msecs_to_jiffies(__data); \
503 else \
504 *(__PTR) = __data; \
505 return count; \
506}
507STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
508STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
509STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
510STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
511STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
512#undef STORE_FUNCTION
513
514#define DD_ATTR(name) \
515 __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
516
517static struct elv_fs_entry deadline_attrs[] = {
518 DD_ATTR(read_expire),
519 DD_ATTR(write_expire),
520 DD_ATTR(writes_starved),
521 DD_ATTR(front_merges),
522 DD_ATTR(fifo_batch),
523 __ATTR_NULL
524};
525
526static struct elevator_type iosched_deadline = {
527 .ops.sq = {
528 .elevator_merge_fn = deadline_merge,
529 .elevator_merged_fn = deadline_merged_request,
530 .elevator_merge_req_fn = deadline_merged_requests,
531 .elevator_dispatch_fn = deadline_dispatch_requests,
532 .elevator_completed_req_fn = deadline_completed_request,
533 .elevator_add_req_fn = deadline_add_request,
534 .elevator_former_req_fn = elv_rb_former_request,
535 .elevator_latter_req_fn = elv_rb_latter_request,
536 .elevator_init_fn = deadline_init_queue,
537 .elevator_exit_fn = deadline_exit_queue,
538 },
539
540 .elevator_attrs = deadline_attrs,
541 .elevator_name = "deadline",
542 .elevator_owner = THIS_MODULE,
543};
544
545static int __init deadline_init(void)
546{
547 return elv_register(&iosched_deadline);
548}
549
550static void __exit deadline_exit(void)
551{
552 elv_unregister(&iosched_deadline);
553}
554
555module_init(deadline_init);
556module_exit(deadline_exit);
557
558MODULE_AUTHOR("Jens Axboe");
559MODULE_LICENSE("GPL");
560MODULE_DESCRIPTION("deadline IO scheduler");
diff --git a/block/elevator.c b/block/elevator.c
index 8fdcd64ae12e..f05e90d4e695 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -61,10 +61,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
61 struct request_queue *q = rq->q; 61 struct request_queue *q = rq->q;
62 struct elevator_queue *e = q->elevator; 62 struct elevator_queue *e = q->elevator;
63 63
64 if (e->uses_mq && e->type->ops.mq.allow_merge) 64 if (e->type->ops.allow_merge)
65 return e->type->ops.mq.allow_merge(q, rq, bio); 65 return e->type->ops.allow_merge(q, rq, bio);
66 else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
67 return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
68 66
69 return 1; 67 return 1;
70} 68}
@@ -95,14 +93,14 @@ static bool elevator_match(const struct elevator_type *e, const char *name)
95} 93}
96 94
97/* 95/*
98 * Return scheduler with name 'name' and with matching 'mq capability 96 * Return scheduler with name 'name'
99 */ 97 */
100static struct elevator_type *elevator_find(const char *name, bool mq) 98static struct elevator_type *elevator_find(const char *name)
101{ 99{
102 struct elevator_type *e; 100 struct elevator_type *e;
103 101
104 list_for_each_entry(e, &elv_list, list) { 102 list_for_each_entry(e, &elv_list, list) {
105 if (elevator_match(e, name) && (mq == e->uses_mq)) 103 if (elevator_match(e, name))
106 return e; 104 return e;
107 } 105 }
108 106
@@ -121,12 +119,12 @@ static struct elevator_type *elevator_get(struct request_queue *q,
121 119
122 spin_lock(&elv_list_lock); 120 spin_lock(&elv_list_lock);
123 121
124 e = elevator_find(name, q->mq_ops != NULL); 122 e = elevator_find(name);
125 if (!e && try_loading) { 123 if (!e && try_loading) {
126 spin_unlock(&elv_list_lock); 124 spin_unlock(&elv_list_lock);
127 request_module("%s-iosched", name); 125 request_module("%s-iosched", name);
128 spin_lock(&elv_list_lock); 126 spin_lock(&elv_list_lock);
129 e = elevator_find(name, q->mq_ops != NULL); 127 e = elevator_find(name);
130 } 128 }
131 129
132 if (e && !try_module_get(e->elevator_owner)) 130 if (e && !try_module_get(e->elevator_owner))
@@ -150,26 +148,6 @@ static int __init elevator_setup(char *str)
150 148
151__setup("elevator=", elevator_setup); 149__setup("elevator=", elevator_setup);
152 150
153/* called during boot to load the elevator chosen by the elevator param */
154void __init load_default_elevator_module(void)
155{
156 struct elevator_type *e;
157
158 if (!chosen_elevator[0])
159 return;
160
161 /*
162 * Boot parameter is deprecated, we haven't supported that for MQ.
163 * Only look for non-mq schedulers from here.
164 */
165 spin_lock(&elv_list_lock);
166 e = elevator_find(chosen_elevator, false);
167 spin_unlock(&elv_list_lock);
168
169 if (!e)
170 request_module("%s-iosched", chosen_elevator);
171}
172
173static struct kobj_type elv_ktype; 151static struct kobj_type elv_ktype;
174 152
175struct elevator_queue *elevator_alloc(struct request_queue *q, 153struct elevator_queue *elevator_alloc(struct request_queue *q,
@@ -185,7 +163,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
185 kobject_init(&eq->kobj, &elv_ktype); 163 kobject_init(&eq->kobj, &elv_ktype);
186 mutex_init(&eq->sysfs_lock); 164 mutex_init(&eq->sysfs_lock);
187 hash_init(eq->hash); 165 hash_init(eq->hash);
188 eq->uses_mq = e->uses_mq;
189 166
190 return eq; 167 return eq;
191} 168}
@@ -200,54 +177,11 @@ static void elevator_release(struct kobject *kobj)
200 kfree(e); 177 kfree(e);
201} 178}
202 179
203/*
204 * Use the default elevator specified by config boot param for non-mq devices,
205 * or by config option. Don't try to load modules as we could be running off
206 * async and request_module() isn't allowed from async.
207 */
208int elevator_init(struct request_queue *q)
209{
210 struct elevator_type *e = NULL;
211 int err = 0;
212
213 /*
214 * q->sysfs_lock must be held to provide mutual exclusion between
215 * elevator_switch() and here.
216 */
217 mutex_lock(&q->sysfs_lock);
218 if (unlikely(q->elevator))
219 goto out_unlock;
220
221 if (*chosen_elevator) {
222 e = elevator_get(q, chosen_elevator, false);
223 if (!e)
224 printk(KERN_ERR "I/O scheduler %s not found\n",
225 chosen_elevator);
226 }
227
228 if (!e)
229 e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
230 if (!e) {
231 printk(KERN_ERR
232 "Default I/O scheduler not found. Using noop.\n");
233 e = elevator_get(q, "noop", false);
234 }
235
236 err = e->ops.sq.elevator_init_fn(q, e);
237 if (err)
238 elevator_put(e);
239out_unlock:
240 mutex_unlock(&q->sysfs_lock);
241 return err;
242}
243
244void elevator_exit(struct request_queue *q, struct elevator_queue *e) 180void elevator_exit(struct request_queue *q, struct elevator_queue *e)
245{ 181{
246 mutex_lock(&e->sysfs_lock); 182 mutex_lock(&e->sysfs_lock);
247 if (e->uses_mq && e->type->ops.mq.exit_sched) 183 if (e->type->ops.exit_sched)
248 blk_mq_exit_sched(q, e); 184 blk_mq_exit_sched(q, e);
249 else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
250 e->type->ops.sq.elevator_exit_fn(e);
251 mutex_unlock(&e->sysfs_lock); 185 mutex_unlock(&e->sysfs_lock);
252 186
253 kobject_put(&e->kobj); 187 kobject_put(&e->kobj);
@@ -356,68 +290,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector)
356} 290}
357EXPORT_SYMBOL(elv_rb_find); 291EXPORT_SYMBOL(elv_rb_find);
358 292
359/*
360 * Insert rq into dispatch queue of q. Queue lock must be held on
361 * entry. rq is sort instead into the dispatch queue. To be used by
362 * specific elevators.
363 */
364void elv_dispatch_sort(struct request_queue *q, struct request *rq)
365{
366 sector_t boundary;
367 struct list_head *entry;
368
369 if (q->last_merge == rq)
370 q->last_merge = NULL;
371
372 elv_rqhash_del(q, rq);
373
374 q->nr_sorted--;
375
376 boundary = q->end_sector;
377 list_for_each_prev(entry, &q->queue_head) {
378 struct request *pos = list_entry_rq(entry);
379
380 if (req_op(rq) != req_op(pos))
381 break;
382 if (rq_data_dir(rq) != rq_data_dir(pos))
383 break;
384 if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
385 break;
386 if (blk_rq_pos(rq) >= boundary) {
387 if (blk_rq_pos(pos) < boundary)
388 continue;
389 } else {
390 if (blk_rq_pos(pos) >= boundary)
391 break;
392 }
393 if (blk_rq_pos(rq) >= blk_rq_pos(pos))
394 break;
395 }
396
397 list_add(&rq->queuelist, entry);
398}
399EXPORT_SYMBOL(elv_dispatch_sort);
400
401/*
402 * Insert rq into dispatch queue of q. Queue lock must be held on
403 * entry. rq is added to the back of the dispatch queue. To be used by
404 * specific elevators.
405 */
406void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
407{
408 if (q->last_merge == rq)
409 q->last_merge = NULL;
410
411 elv_rqhash_del(q, rq);
412
413 q->nr_sorted--;
414
415 q->end_sector = rq_end_sector(rq);
416 q->boundary_rq = rq;
417 list_add_tail(&rq->queuelist, &q->queue_head);
418}
419EXPORT_SYMBOL(elv_dispatch_add_tail);
420
421enum elv_merge elv_merge(struct request_queue *q, struct request **req, 293enum elv_merge elv_merge(struct request_queue *q, struct request **req,
422 struct bio *bio) 294 struct bio *bio)
423{ 295{
@@ -457,10 +329,8 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
457 return ELEVATOR_BACK_MERGE; 329 return ELEVATOR_BACK_MERGE;
458 } 330 }
459 331
460 if (e->uses_mq && e->type->ops.mq.request_merge) 332 if (e->type->ops.request_merge)
461 return e->type->ops.mq.request_merge(q, req, bio); 333 return e->type->ops.request_merge(q, req, bio);
462 else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
463 return e->type->ops.sq.elevator_merge_fn(q, req, bio);
464 334
465 return ELEVATOR_NO_MERGE; 335 return ELEVATOR_NO_MERGE;
466} 336}
@@ -511,10 +381,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq,
511{ 381{
512 struct elevator_queue *e = q->elevator; 382 struct elevator_queue *e = q->elevator;
513 383
514 if (e->uses_mq && e->type->ops.mq.request_merged) 384 if (e->type->ops.request_merged)
515 e->type->ops.mq.request_merged(q, rq, type); 385 e->type->ops.request_merged(q, rq, type);
516 else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
517 e->type->ops.sq.elevator_merged_fn(q, rq, type);
518 386
519 if (type == ELEVATOR_BACK_MERGE) 387 if (type == ELEVATOR_BACK_MERGE)
520 elv_rqhash_reposition(q, rq); 388 elv_rqhash_reposition(q, rq);
@@ -526,176 +394,20 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
526 struct request *next) 394 struct request *next)
527{ 395{
528 struct elevator_queue *e = q->elevator; 396 struct elevator_queue *e = q->elevator;
529 bool next_sorted = false;
530
531 if (e->uses_mq && e->type->ops.mq.requests_merged)
532 e->type->ops.mq.requests_merged(q, rq, next);
533 else if (e->type->ops.sq.elevator_merge_req_fn) {
534 next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
535 if (next_sorted)
536 e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
537 }
538 397
539 elv_rqhash_reposition(q, rq); 398 if (e->type->ops.requests_merged)
540 399 e->type->ops.requests_merged(q, rq, next);
541 if (next_sorted) {
542 elv_rqhash_del(q, next);
543 q->nr_sorted--;
544 }
545 400
401 elv_rqhash_reposition(q, rq);
546 q->last_merge = rq; 402 q->last_merge = rq;
547} 403}
548 404
549void elv_bio_merged(struct request_queue *q, struct request *rq,
550 struct bio *bio)
551{
552 struct elevator_queue *e = q->elevator;
553
554 if (WARN_ON_ONCE(e->uses_mq))
555 return;
556
557 if (e->type->ops.sq.elevator_bio_merged_fn)
558 e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
559}
560
561void elv_requeue_request(struct request_queue *q, struct request *rq)
562{
563 /*
564 * it already went through dequeue, we need to decrement the
565 * in_flight count again
566 */
567 if (blk_account_rq(rq)) {
568 q->in_flight[rq_is_sync(rq)]--;
569 if (rq->rq_flags & RQF_SORTED)
570 elv_deactivate_rq(q, rq);
571 }
572
573 rq->rq_flags &= ~RQF_STARTED;
574
575 blk_pm_requeue_request(rq);
576
577 __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
578}
579
580void elv_drain_elevator(struct request_queue *q)
581{
582 struct elevator_queue *e = q->elevator;
583 static int printed;
584
585 if (WARN_ON_ONCE(e->uses_mq))
586 return;
587
588 lockdep_assert_held(q->queue_lock);
589
590 while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
591 ;
592 if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) {
593 printk(KERN_ERR "%s: forced dispatching is broken "
594 "(nr_sorted=%u), please report this\n",
595 q->elevator->type->elevator_name, q->nr_sorted);
596 }
597}
598
599void __elv_add_request(struct request_queue *q, struct request *rq, int where)
600{
601 trace_block_rq_insert(q, rq);
602
603 blk_pm_add_request(q, rq);
604
605 rq->q = q;
606
607 if (rq->rq_flags & RQF_SOFTBARRIER) {
608 /* barriers are scheduling boundary, update end_sector */
609 if (!blk_rq_is_passthrough(rq)) {
610 q->end_sector = rq_end_sector(rq);
611 q->boundary_rq = rq;
612 }
613 } else if (!(rq->rq_flags & RQF_ELVPRIV) &&
614 (where == ELEVATOR_INSERT_SORT ||
615 where == ELEVATOR_INSERT_SORT_MERGE))
616 where = ELEVATOR_INSERT_BACK;
617
618 switch (where) {
619 case ELEVATOR_INSERT_REQUEUE:
620 case ELEVATOR_INSERT_FRONT:
621 rq->rq_flags |= RQF_SOFTBARRIER;
622 list_add(&rq->queuelist, &q->queue_head);
623 break;
624
625 case ELEVATOR_INSERT_BACK:
626 rq->rq_flags |= RQF_SOFTBARRIER;
627 elv_drain_elevator(q);
628 list_add_tail(&rq->queuelist, &q->queue_head);
629 /*
630 * We kick the queue here for the following reasons.
631 * - The elevator might have returned NULL previously
632 * to delay requests and returned them now. As the
633 * queue wasn't empty before this request, ll_rw_blk
634 * won't run the queue on return, resulting in hang.
635 * - Usually, back inserted requests won't be merged
636 * with anything. There's no point in delaying queue
637 * processing.
638 */
639 __blk_run_queue(q);
640 break;
641
642 case ELEVATOR_INSERT_SORT_MERGE:
643 /*
644 * If we succeed in merging this request with one in the
645 * queue already, we are done - rq has now been freed,
646 * so no need to do anything further.
647 */
648 if (elv_attempt_insert_merge(q, rq))
649 break;
650 /* fall through */
651 case ELEVATOR_INSERT_SORT:
652 BUG_ON(blk_rq_is_passthrough(rq));
653 rq->rq_flags |= RQF_SORTED;
654 q->nr_sorted++;
655 if (rq_mergeable(rq)) {
656 elv_rqhash_add(q, rq);
657 if (!q->last_merge)
658 q->last_merge = rq;
659 }
660
661 /*
662 * Some ioscheds (cfq) run q->request_fn directly, so
663 * rq cannot be accessed after calling
664 * elevator_add_req_fn.
665 */
666 q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
667 break;
668
669 case ELEVATOR_INSERT_FLUSH:
670 rq->rq_flags |= RQF_SOFTBARRIER;
671 blk_insert_flush(rq);
672 break;
673 default:
674 printk(KERN_ERR "%s: bad insertion point %d\n",
675 __func__, where);
676 BUG();
677 }
678}
679EXPORT_SYMBOL(__elv_add_request);
680
681void elv_add_request(struct request_queue *q, struct request *rq, int where)
682{
683 unsigned long flags;
684
685 spin_lock_irqsave(q->queue_lock, flags);
686 __elv_add_request(q, rq, where);
687 spin_unlock_irqrestore(q->queue_lock, flags);
688}
689EXPORT_SYMBOL(elv_add_request);
690
691struct request *elv_latter_request(struct request_queue *q, struct request *rq) 405struct request *elv_latter_request(struct request_queue *q, struct request *rq)
692{ 406{
693 struct elevator_queue *e = q->elevator; 407 struct elevator_queue *e = q->elevator;
694 408
695 if (e->uses_mq && e->type->ops.mq.next_request) 409 if (e->type->ops.next_request)
696 return e->type->ops.mq.next_request(q, rq); 410 return e->type->ops.next_request(q, rq);
697 else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
698 return e->type->ops.sq.elevator_latter_req_fn(q, rq);
699 411
700 return NULL; 412 return NULL;
701} 413}
@@ -704,66 +416,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
704{ 416{
705 struct elevator_queue *e = q->elevator; 417 struct elevator_queue *e = q->elevator;
706 418
707 if (e->uses_mq && e->type->ops.mq.former_request) 419 if (e->type->ops.former_request)
708 return e->type->ops.mq.former_request(q, rq); 420 return e->type->ops.former_request(q, rq);
709 if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
710 return e->type->ops.sq.elevator_former_req_fn(q, rq);
711 return NULL;
712}
713
714int elv_set_request(struct request_queue *q, struct request *rq,
715 struct bio *bio, gfp_t gfp_mask)
716{
717 struct elevator_queue *e = q->elevator;
718
719 if (WARN_ON_ONCE(e->uses_mq))
720 return 0;
721
722 if (e->type->ops.sq.elevator_set_req_fn)
723 return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
724 return 0;
725}
726
727void elv_put_request(struct request_queue *q, struct request *rq)
728{
729 struct elevator_queue *e = q->elevator;
730
731 if (WARN_ON_ONCE(e->uses_mq))
732 return;
733
734 if (e->type->ops.sq.elevator_put_req_fn)
735 e->type->ops.sq.elevator_put_req_fn(rq);
736}
737
738int elv_may_queue(struct request_queue *q, unsigned int op)
739{
740 struct elevator_queue *e = q->elevator;
741
742 if (WARN_ON_ONCE(e->uses_mq))
743 return 0;
744
745 if (e->type->ops.sq.elevator_may_queue_fn)
746 return e->type->ops.sq.elevator_may_queue_fn(q, op);
747
748 return ELV_MQUEUE_MAY;
749}
750
751void elv_completed_request(struct request_queue *q, struct request *rq)
752{
753 struct elevator_queue *e = q->elevator;
754
755 if (WARN_ON_ONCE(e->uses_mq))
756 return;
757 421
758 /* 422 return NULL;
759 * request is released from the driver, io must be done
760 */
761 if (blk_account_rq(rq)) {
762 q->in_flight[rq_is_sync(rq)]--;
763 if ((rq->rq_flags & RQF_SORTED) &&
764 e->type->ops.sq.elevator_completed_req_fn)
765 e->type->ops.sq.elevator_completed_req_fn(q, rq);
766 }
767} 423}
768 424
769#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 425#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
@@ -832,8 +488,6 @@ int elv_register_queue(struct request_queue *q)
832 } 488 }
833 kobject_uevent(&e->kobj, KOBJ_ADD); 489 kobject_uevent(&e->kobj, KOBJ_ADD);
834 e->registered = 1; 490 e->registered = 1;
835 if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
836 e->type->ops.sq.elevator_registered_fn(q);
837 } 491 }
838 return error; 492 return error;
839} 493}
@@ -873,7 +527,7 @@ int elv_register(struct elevator_type *e)
873 527
874 /* register, don't allow duplicate names */ 528 /* register, don't allow duplicate names */
875 spin_lock(&elv_list_lock); 529 spin_lock(&elv_list_lock);
876 if (elevator_find(e->elevator_name, e->uses_mq)) { 530 if (elevator_find(e->elevator_name)) {
877 spin_unlock(&elv_list_lock); 531 spin_unlock(&elv_list_lock);
878 kmem_cache_destroy(e->icq_cache); 532 kmem_cache_destroy(e->icq_cache);
879 return -EBUSY; 533 return -EBUSY;
@@ -881,12 +535,6 @@ int elv_register(struct elevator_type *e)
881 list_add_tail(&e->list, &elv_list); 535 list_add_tail(&e->list, &elv_list);
882 spin_unlock(&elv_list_lock); 536 spin_unlock(&elv_list_lock);
883 537
884 /* print pretty message */
885 if (elevator_match(e, chosen_elevator) ||
886 (!*chosen_elevator &&
887 elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
888 def = " (default)";
889
890 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 538 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
891 def); 539 def);
892 return 0; 540 return 0;
@@ -989,71 +637,17 @@ out_unlock:
989 */ 637 */
990static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 638static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
991{ 639{
992 struct elevator_queue *old = q->elevator;
993 bool old_registered = false;
994 int err; 640 int err;
995 641
996 lockdep_assert_held(&q->sysfs_lock); 642 lockdep_assert_held(&q->sysfs_lock);
997 643
998 if (q->mq_ops) { 644 blk_mq_freeze_queue(q);
999 blk_mq_freeze_queue(q); 645 blk_mq_quiesce_queue(q);
1000 blk_mq_quiesce_queue(q);
1001
1002 err = elevator_switch_mq(q, new_e);
1003
1004 blk_mq_unquiesce_queue(q);
1005 blk_mq_unfreeze_queue(q);
1006
1007 return err;
1008 }
1009
1010 /*
1011 * Turn on BYPASS and drain all requests w/ elevator private data.
1012 * Block layer doesn't call into a quiesced elevator - all requests
1013 * are directly put on the dispatch list without elevator data
1014 * using INSERT_BACK. All requests have SOFTBARRIER set and no
1015 * merge happens either.
1016 */
1017 if (old) {
1018 old_registered = old->registered;
1019
1020 blk_queue_bypass_start(q);
1021
1022 /* unregister and clear all auxiliary data of the old elevator */
1023 if (old_registered)
1024 elv_unregister_queue(q);
1025
1026 ioc_clear_queue(q);
1027 }
1028 646
1029 /* allocate, init and register new elevator */ 647 err = elevator_switch_mq(q, new_e);
1030 err = new_e->ops.sq.elevator_init_fn(q, new_e);
1031 if (err)
1032 goto fail_init;
1033
1034 err = elv_register_queue(q);
1035 if (err)
1036 goto fail_register;
1037
1038 /* done, kill the old one and finish */
1039 if (old) {
1040 elevator_exit(q, old);
1041 blk_queue_bypass_end(q);
1042 }
1043 648
1044 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 649 blk_mq_unquiesce_queue(q);
1045 650 blk_mq_unfreeze_queue(q);
1046 return 0;
1047
1048fail_register:
1049 elevator_exit(q, q->elevator);
1050fail_init:
1051 /* switch failed, restore and re-register old elevator */
1052 if (old) {
1053 q->elevator = old;
1054 elv_register_queue(q);
1055 blk_queue_bypass_end(q);
1056 }
1057 651
1058 return err; 652 return err;
1059} 653}
@@ -1073,7 +667,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
1073 /* 667 /*
1074 * Special case for mq, turn off scheduling 668 * Special case for mq, turn off scheduling
1075 */ 669 */
1076 if (q->mq_ops && !strncmp(name, "none", 4)) 670 if (!strncmp(name, "none", 4))
1077 return elevator_switch(q, NULL); 671 return elevator_switch(q, NULL);
1078 672
1079 strlcpy(elevator_name, name, sizeof(elevator_name)); 673 strlcpy(elevator_name, name, sizeof(elevator_name));
@@ -1091,8 +685,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
1091 685
1092static inline bool elv_support_iosched(struct request_queue *q) 686static inline bool elv_support_iosched(struct request_queue *q)
1093{ 687{
1094 if (q->mq_ops && q->tag_set && (q->tag_set->flags & 688 if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
1095 BLK_MQ_F_NO_SCHED))
1096 return false; 689 return false;
1097 return true; 690 return true;
1098} 691}
@@ -1102,7 +695,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1102{ 695{
1103 int ret; 696 int ret;
1104 697
1105 if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q)) 698 if (!queue_is_mq(q) || !elv_support_iosched(q))
1106 return count; 699 return count;
1107 700
1108 ret = __elevator_change(q, name); 701 ret = __elevator_change(q, name);
@@ -1117,10 +710,9 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1117 struct elevator_queue *e = q->elevator; 710 struct elevator_queue *e = q->elevator;
1118 struct elevator_type *elv = NULL; 711 struct elevator_type *elv = NULL;
1119 struct elevator_type *__e; 712 struct elevator_type *__e;
1120 bool uses_mq = q->mq_ops != NULL;
1121 int len = 0; 713 int len = 0;
1122 714
1123 if (!queue_is_rq_based(q)) 715 if (!queue_is_mq(q))
1124 return sprintf(name, "none\n"); 716 return sprintf(name, "none\n");
1125 717
1126 if (!q->elevator) 718 if (!q->elevator)
@@ -1130,19 +722,16 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1130 722
1131 spin_lock(&elv_list_lock); 723 spin_lock(&elv_list_lock);
1132 list_for_each_entry(__e, &elv_list, list) { 724 list_for_each_entry(__e, &elv_list, list) {
1133 if (elv && elevator_match(elv, __e->elevator_name) && 725 if (elv && elevator_match(elv, __e->elevator_name)) {
1134 (__e->uses_mq == uses_mq)) {
1135 len += sprintf(name+len, "[%s] ", elv->elevator_name); 726 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1136 continue; 727 continue;
1137 } 728 }
1138 if (__e->uses_mq && q->mq_ops && elv_support_iosched(q)) 729 if (elv_support_iosched(q))
1139 len += sprintf(name+len, "%s ", __e->elevator_name);
1140 else if (!__e->uses_mq && !q->mq_ops)
1141 len += sprintf(name+len, "%s ", __e->elevator_name); 730 len += sprintf(name+len, "%s ", __e->elevator_name);
1142 } 731 }
1143 spin_unlock(&elv_list_lock); 732 spin_unlock(&elv_list_lock);
1144 733
1145 if (q->mq_ops && q->elevator) 734 if (q->elevator)
1146 len += sprintf(name+len, "none"); 735 len += sprintf(name+len, "none");
1147 736
1148 len += sprintf(len+name, "\n"); 737 len += sprintf(len+name, "\n");
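
With the legacy request_fn path gone, elevator_switch() no longer needs the bypass/drain/re-register fallback that the removed lines implemented; it only freezes and quiesces the multiqueue queue around elevator_switch_mq(). Reassembled from the hunk above for readability (comments added here, logic unchanged):

/*
 * Sketch of the mq-only switch sequence kept by this patch; the heavy
 * lifting (exit of the old scheduler, init of the new one) stays in
 * elevator_switch_mq().
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	int err;

	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);		/* wait for in-flight requests to drain */
	blk_mq_quiesce_queue(q);	/* stop new dispatches to ->queue_rq() */

	err = elevator_switch_mq(q, new_e);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	return err;
}
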
diff --git a/block/genhd.c b/block/genhd.c
index cff6bdf27226..1dd8fd6613b8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -47,51 +47,64 @@ static void disk_release_events(struct gendisk *disk);
47 47
48void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) 48void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
49{ 49{
50 if (q->mq_ops) 50 if (queue_is_mq(q))
51 return; 51 return;
52 52
53 atomic_inc(&part->in_flight[rw]); 53 part_stat_local_inc(part, in_flight[rw]);
54 if (part->partno) 54 if (part->partno)
55 atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); 55 part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]);
56} 56}
57 57
58void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) 58void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
59{ 59{
60 if (q->mq_ops) 60 if (queue_is_mq(q))
61 return; 61 return;
62 62
63 atomic_dec(&part->in_flight[rw]); 63 part_stat_local_dec(part, in_flight[rw]);
64 if (part->partno) 64 if (part->partno)
65 atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); 65 part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
66} 66}
67 67
68void part_in_flight(struct request_queue *q, struct hd_struct *part, 68unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
69 unsigned int inflight[2])
70{ 69{
71 if (q->mq_ops) { 70 int cpu;
72 blk_mq_in_flight(q, part, inflight); 71 unsigned int inflight;
73 return; 72
73 if (queue_is_mq(q)) {
74 return blk_mq_in_flight(q, part);
74 } 75 }
75 76
76 inflight[0] = atomic_read(&part->in_flight[0]) + 77 inflight = 0;
77 atomic_read(&part->in_flight[1]); 78 for_each_possible_cpu(cpu) {
78 if (part->partno) { 79 inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
79 part = &part_to_disk(part)->part0; 80 part_stat_local_read_cpu(part, in_flight[1], cpu);
80 inflight[1] = atomic_read(&part->in_flight[0]) +
81 atomic_read(&part->in_flight[1]);
82 } 81 }
82 if ((int)inflight < 0)
83 inflight = 0;
84
85 return inflight;
83} 86}
84 87
85void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, 88void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
86 unsigned int inflight[2]) 89 unsigned int inflight[2])
87{ 90{
88 if (q->mq_ops) { 91 int cpu;
92
93 if (queue_is_mq(q)) {
89 blk_mq_in_flight_rw(q, part, inflight); 94 blk_mq_in_flight_rw(q, part, inflight);
90 return; 95 return;
91 } 96 }
92 97
93 inflight[0] = atomic_read(&part->in_flight[0]); 98 inflight[0] = 0;
94 inflight[1] = atomic_read(&part->in_flight[1]); 99 inflight[1] = 0;
100 for_each_possible_cpu(cpu) {
101 inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
102 inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
103 }
104 if ((int)inflight[0] < 0)
105 inflight[0] = 0;
106 if ((int)inflight[1] < 0)
107 inflight[1] = 0;
95} 108}
96 109
97struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) 110struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
@@ -1325,8 +1338,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1325 struct disk_part_iter piter; 1338 struct disk_part_iter piter;
1326 struct hd_struct *hd; 1339 struct hd_struct *hd;
1327 char buf[BDEVNAME_SIZE]; 1340 char buf[BDEVNAME_SIZE];
1328 unsigned int inflight[2]; 1341 unsigned int inflight;
1329 int cpu;
1330 1342
1331 /* 1343 /*
1332 if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) 1344 if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1338,10 +1350,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1338 1350
1339 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); 1351 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1340 while ((hd = disk_part_iter_next(&piter))) { 1352 while ((hd = disk_part_iter_next(&piter))) {
1341 cpu = part_stat_lock(); 1353 inflight = part_in_flight(gp->queue, hd);
1342 part_round_stats(gp->queue, cpu, hd);
1343 part_stat_unlock();
1344 part_in_flight(gp->queue, hd, inflight);
1345 seq_printf(seqf, "%4d %7d %s " 1354 seq_printf(seqf, "%4d %7d %s "
1346 "%lu %lu %lu %u " 1355 "%lu %lu %lu %u "
1347 "%lu %lu %lu %u " 1356 "%lu %lu %lu %u "
@@ -1357,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1357 part_stat_read(hd, merges[STAT_WRITE]), 1366 part_stat_read(hd, merges[STAT_WRITE]),
1358 part_stat_read(hd, sectors[STAT_WRITE]), 1367 part_stat_read(hd, sectors[STAT_WRITE]),
1359 (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), 1368 (unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
1360 inflight[0], 1369 inflight,
1361 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1370 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1362 jiffies_to_msecs(part_stat_read(hd, time_in_queue)), 1371 jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
1363 part_stat_read(hd, ios[STAT_DISCARD]), 1372 part_stat_read(hd, ios[STAT_DISCARD]),
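
The per-partition in_flight counters move from atomics to per-cpu counters here, so reading them means summing every possible CPU's contribution; because an increment and its matching decrement can land on different CPUs, a snapshot of the sum can transiently appear negative and is clamped to zero. A minimal sketch of that summation, using the same part_stat_local_read_cpu() accessor as the hunks above:

/*
 * Sketch of the per-cpu in-flight read: sum both directions over all
 * possible CPUs and clamp, since inc/dec pairs may be split across CPUs.
 */
static unsigned int example_part_in_flight(struct hd_struct *part)
{
	unsigned int inflight = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
			    part_stat_local_read_cpu(part, in_flight[1], cpu);

	return (int)inflight < 0 ? 0 : inflight;
}
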
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index eccac01a10b6..ec6a04e01bc1 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -195,7 +195,7 @@ struct kyber_hctx_data {
195 unsigned int batching; 195 unsigned int batching;
196 struct kyber_ctx_queue *kcqs; 196 struct kyber_ctx_queue *kcqs;
197 struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; 197 struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
198 wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; 198 struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
199 struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; 199 struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
200 atomic_t wait_index[KYBER_NUM_DOMAINS]; 200 atomic_t wait_index[KYBER_NUM_DOMAINS];
201}; 201};
@@ -501,10 +501,11 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
501 501
502 for (i = 0; i < KYBER_NUM_DOMAINS; i++) { 502 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
503 INIT_LIST_HEAD(&khd->rqs[i]); 503 INIT_LIST_HEAD(&khd->rqs[i]);
504 init_waitqueue_func_entry(&khd->domain_wait[i], 504 khd->domain_wait[i].sbq = NULL;
505 init_waitqueue_func_entry(&khd->domain_wait[i].wait,
505 kyber_domain_wake); 506 kyber_domain_wake);
506 khd->domain_wait[i].private = hctx; 507 khd->domain_wait[i].wait.private = hctx;
507 INIT_LIST_HEAD(&khd->domain_wait[i].entry); 508 INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
508 atomic_set(&khd->wait_index[i], 0); 509 atomic_set(&khd->wait_index[i], 0);
509 } 510 }
510 511
@@ -576,7 +577,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
576{ 577{
577 struct kyber_hctx_data *khd = hctx->sched_data; 578 struct kyber_hctx_data *khd = hctx->sched_data;
578 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); 579 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
579 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; 580 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
580 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); 581 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
581 struct list_head *rq_list = &kcq->rq_list[sched_domain]; 582 struct list_head *rq_list = &kcq->rq_list[sched_domain];
582 bool merged; 583 bool merged;
@@ -602,7 +603,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
602 603
603 list_for_each_entry_safe(rq, next, rq_list, queuelist) { 604 list_for_each_entry_safe(rq, next, rq_list, queuelist) {
604 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); 605 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
605 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; 606 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
606 struct list_head *head = &kcq->rq_list[sched_domain]; 607 struct list_head *head = &kcq->rq_list[sched_domain];
607 608
608 spin_lock(&kcq->lock); 609 spin_lock(&kcq->lock);
@@ -611,7 +612,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
611 else 612 else
612 list_move_tail(&rq->queuelist, head); 613 list_move_tail(&rq->queuelist, head);
613 sbitmap_set_bit(&khd->kcq_map[sched_domain], 614 sbitmap_set_bit(&khd->kcq_map[sched_domain],
614 rq->mq_ctx->index_hw); 615 rq->mq_ctx->index_hw[hctx->type]);
615 blk_mq_sched_request_inserted(rq); 616 blk_mq_sched_request_inserted(rq);
616 spin_unlock(&kcq->lock); 617 spin_unlock(&kcq->lock);
617 } 618 }
@@ -698,12 +699,13 @@ static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
698 flush_busy_kcq, &data); 699 flush_busy_kcq, &data);
699} 700}
700 701
701static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, 702static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
702 void *key) 703 void *key)
703{ 704{
704 struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); 705 struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
706 struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);
705 707
706 list_del_init(&wait->entry); 708 sbitmap_del_wait_queue(wait);
707 blk_mq_run_hw_queue(hctx, true); 709 blk_mq_run_hw_queue(hctx, true);
708 return 1; 710 return 1;
709} 711}
@@ -714,7 +716,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
714{ 716{
715 unsigned int sched_domain = khd->cur_domain; 717 unsigned int sched_domain = khd->cur_domain;
716 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; 718 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
717 wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; 719 struct sbq_wait *wait = &khd->domain_wait[sched_domain];
718 struct sbq_wait_state *ws; 720 struct sbq_wait_state *ws;
719 int nr; 721 int nr;
720 722
@@ -725,11 +727,11 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
725 * run when one becomes available. Note that this is serialized on 727 * run when one becomes available. Note that this is serialized on
726 * khd->lock, but we still need to be careful about the waker. 728 * khd->lock, but we still need to be careful about the waker.
727 */ 729 */
728 if (nr < 0 && list_empty_careful(&wait->entry)) { 730 if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
729 ws = sbq_wait_ptr(domain_tokens, 731 ws = sbq_wait_ptr(domain_tokens,
730 &khd->wait_index[sched_domain]); 732 &khd->wait_index[sched_domain]);
731 khd->domain_ws[sched_domain] = ws; 733 khd->domain_ws[sched_domain] = ws;
732 add_wait_queue(&ws->wait, wait); 734 sbitmap_add_wait_queue(domain_tokens, ws, wait);
733 735
734 /* 736 /*
735 * Try again in case a token was freed before we got on the wait 737 * Try again in case a token was freed before we got on the wait
@@ -745,10 +747,10 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
745 * between the !list_empty_careful() check and us grabbing the lock, but 747 * between the !list_empty_careful() check and us grabbing the lock, but
746 * list_del_init() is okay with that. 748 * list_del_init() is okay with that.
747 */ 749 */
748 if (nr >= 0 && !list_empty_careful(&wait->entry)) { 750 if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
749 ws = khd->domain_ws[sched_domain]; 751 ws = khd->domain_ws[sched_domain];
750 spin_lock_irq(&ws->wait.lock); 752 spin_lock_irq(&ws->wait.lock);
751 list_del_init(&wait->entry); 753 sbitmap_del_wait_queue(wait);
752 spin_unlock_irq(&ws->wait.lock); 754 spin_unlock_irq(&ws->wait.lock);
753 } 755 }
754 756
@@ -951,7 +953,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
951{ \ 953{ \
952 struct blk_mq_hw_ctx *hctx = data; \ 954 struct blk_mq_hw_ctx *hctx = data; \
953 struct kyber_hctx_data *khd = hctx->sched_data; \ 955 struct kyber_hctx_data *khd = hctx->sched_data; \
954 wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ 956 wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
955 \ 957 \
956 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ 958 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
957 return 0; \ 959 return 0; \
@@ -1017,7 +1019,7 @@ static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
1017#endif 1019#endif
1018 1020
1019static struct elevator_type kyber_sched = { 1021static struct elevator_type kyber_sched = {
1020 .ops.mq = { 1022 .ops = {
1021 .init_sched = kyber_init_sched, 1023 .init_sched = kyber_init_sched,
1022 .exit_sched = kyber_exit_sched, 1024 .exit_sched = kyber_exit_sched,
1023 .init_hctx = kyber_init_hctx, 1025 .init_hctx = kyber_init_hctx,
@@ -1032,7 +1034,6 @@ static struct elevator_type kyber_sched = {
1032 .dispatch_request = kyber_dispatch_request, 1034 .dispatch_request = kyber_dispatch_request,
1033 .has_work = kyber_has_work, 1035 .has_work = kyber_has_work,
1034 }, 1036 },
1035 .uses_mq = true,
1036#ifdef CONFIG_BLK_DEBUG_FS 1037#ifdef CONFIG_BLK_DEBUG_FS
1037 .queue_debugfs_attrs = kyber_queue_debugfs_attrs, 1038 .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
1038 .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, 1039 .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
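
Kyber's per-domain waiters switch from a bare wait_queue_entry_t to struct sbq_wait, which wraps the wait entry together with a back pointer to the owning sbitmap_queue so that sbitmap can account active waiters. A minimal sketch of the add/remove pairing the hunks rely on (wake-callback and waitqueue locking details are handled as in kyber_get_domain_token() and omitted here):

/*
 * Sketch only: sbitmap_add_wait_queue() puts the embedded wait entry on the
 * wait state and records the owning sbitmap_queue; sbitmap_del_wait_queue()
 * removes the entry and drops that accounting again.
 */
static void example_wait_for_token(struct sbitmap_queue *tokens,
				   struct sbq_wait_state *ws,
				   struct sbq_wait *wait)
{
	if (list_empty_careful(&wait->wait.entry))
		sbitmap_add_wait_queue(tokens, ws, wait);
}

static void example_stop_waiting(struct sbq_wait *wait)
{
	if (!list_empty_careful(&wait->wait.entry))
		sbitmap_del_wait_queue(wait);
}
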
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 099a9e05854c..14288f864e94 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -373,9 +373,16 @@ done:
373 373
374/* 374/*
375 * One confusing aspect here is that we get called for a specific 375 * One confusing aspect here is that we get called for a specific
376 * hardware queue, but we return a request that may not be for a 376 * hardware queue, but we may return a request that is for a
377 * different hardware queue. This is because mq-deadline has shared 377 * different hardware queue. This is because mq-deadline has shared
378 * state for all hardware queues, in terms of sorting, FIFOs, etc. 378 * state for all hardware queues, in terms of sorting, FIFOs, etc.
379 *
380 * For a zoned block device, __dd_dispatch_request() may return NULL
381 * if all the queued write requests are directed at zones that are already
382 * locked due to on-going write requests. In this case, make sure to mark
383 * the queue as needing a restart to ensure that the queue is run again
384 * and the pending writes dispatched once the target zones for the ongoing
385 * write requests are unlocked in dd_finish_request().
379 */ 386 */
380static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) 387static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
381{ 388{
@@ -384,6 +391,9 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
384 391
385 spin_lock(&dd->lock); 392 spin_lock(&dd->lock);
386 rq = __dd_dispatch_request(dd); 393 rq = __dd_dispatch_request(dd);
394 if (!rq && blk_queue_is_zoned(hctx->queue) &&
395 !list_empty(&dd->fifo_list[WRITE]))
396 blk_mq_sched_mark_restart_hctx(hctx);
387 spin_unlock(&dd->lock); 397 spin_unlock(&dd->lock);
388 398
389 return rq; 399 return rq;
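
The restart marking added above relies on the BLK_MQ_S_SCHED_RESTART state bit: once set, a later request completion re-runs the hardware queue, giving the held-back zoned writes another dispatch attempt after dd_finish_request() unlocks their target zone. The helper itself is small; roughly (see block/blk-mq-sched.c in this series):

/*
 * Approximate body of blk_mq_sched_mark_restart_hctx(): flag the hctx so the
 * completion path restarts it instead of leaving queued writes stranded.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
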
@@ -761,7 +771,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
761#endif 771#endif
762 772
763static struct elevator_type mq_deadline = { 773static struct elevator_type mq_deadline = {
764 .ops.mq = { 774 .ops = {
765 .insert_requests = dd_insert_requests, 775 .insert_requests = dd_insert_requests,
766 .dispatch_request = dd_dispatch_request, 776 .dispatch_request = dd_dispatch_request,
767 .prepare_request = dd_prepare_request, 777 .prepare_request = dd_prepare_request,
@@ -777,7 +787,6 @@ static struct elevator_type mq_deadline = {
777 .exit_sched = dd_exit_queue, 787 .exit_sched = dd_exit_queue,
778 }, 788 },
779 789
780 .uses_mq = true,
781#ifdef CONFIG_BLK_DEBUG_FS 790#ifdef CONFIG_BLK_DEBUG_FS
782 .queue_debugfs_attrs = deadline_queue_debugfs_attrs, 791 .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
783#endif 792#endif
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
deleted file mode 100644
index 2d1b15d89b45..000000000000
--- a/block/noop-iosched.c
+++ /dev/null
@@ -1,124 +0,0 @@
1/*
2 * elevator noop
3 */
4#include <linux/blkdev.h>
5#include <linux/elevator.h>
6#include <linux/bio.h>
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/init.h>
10
11struct noop_data {
12 struct list_head queue;
13};
14
15static void noop_merged_requests(struct request_queue *q, struct request *rq,
16 struct request *next)
17{
18 list_del_init(&next->queuelist);
19}
20
21static int noop_dispatch(struct request_queue *q, int force)
22{
23 struct noop_data *nd = q->elevator->elevator_data;
24 struct request *rq;
25
26 rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
27 if (rq) {
28 list_del_init(&rq->queuelist);
29 elv_dispatch_sort(q, rq);
30 return 1;
31 }
32 return 0;
33}
34
35static void noop_add_request(struct request_queue *q, struct request *rq)
36{
37 struct noop_data *nd = q->elevator->elevator_data;
38
39 list_add_tail(&rq->queuelist, &nd->queue);
40}
41
42static struct request *
43noop_former_request(struct request_queue *q, struct request *rq)
44{
45 struct noop_data *nd = q->elevator->elevator_data;
46
47 if (rq->queuelist.prev == &nd->queue)
48 return NULL;
49 return list_prev_entry(rq, queuelist);
50}
51
52static struct request *
53noop_latter_request(struct request_queue *q, struct request *rq)
54{
55 struct noop_data *nd = q->elevator->elevator_data;
56
57 if (rq->queuelist.next == &nd->queue)
58 return NULL;
59 return list_next_entry(rq, queuelist);
60}
61
62static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
63{
64 struct noop_data *nd;
65 struct elevator_queue *eq;
66
67 eq = elevator_alloc(q, e);
68 if (!eq)
69 return -ENOMEM;
70
71 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
72 if (!nd) {
73 kobject_put(&eq->kobj);
74 return -ENOMEM;
75 }
76 eq->elevator_data = nd;
77
78 INIT_LIST_HEAD(&nd->queue);
79
80 spin_lock_irq(q->queue_lock);
81 q->elevator = eq;
82 spin_unlock_irq(q->queue_lock);
83 return 0;
84}
85
86static void noop_exit_queue(struct elevator_queue *e)
87{
88 struct noop_data *nd = e->elevator_data;
89
90 BUG_ON(!list_empty(&nd->queue));
91 kfree(nd);
92}
93
94static struct elevator_type elevator_noop = {
95 .ops.sq = {
96 .elevator_merge_req_fn = noop_merged_requests,
97 .elevator_dispatch_fn = noop_dispatch,
98 .elevator_add_req_fn = noop_add_request,
99 .elevator_former_req_fn = noop_former_request,
100 .elevator_latter_req_fn = noop_latter_request,
101 .elevator_init_fn = noop_init_queue,
102 .elevator_exit_fn = noop_exit_queue,
103 },
104 .elevator_name = "noop",
105 .elevator_owner = THIS_MODULE,
106};
107
108static int __init noop_init(void)
109{
110 return elv_register(&elevator_noop);
111}
112
113static void __exit noop_exit(void)
114{
115 elv_unregister(&elevator_noop);
116}
117
118module_init(noop_init);
119module_exit(noop_exit);
120
121
122MODULE_AUTHOR("Jens Axboe");
123MODULE_LICENSE("GPL");
124MODULE_DESCRIPTION("No-op IO scheduler");
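
The deleted noop scheduler has no direct multiqueue replacement module; blk-mq's built-in "none" mode covers the same pass-through case. For comparison only, a noop-like scheduler written against the interface that remains after this series could look roughly like the following hypothetical sketch (single shared FIFO; all names below are invented for illustration, and the usual sysfs attributes, merge and tracing hooks are left out):

/* Hypothetical sketch, not part of this patch: a FIFO elevator on the
 * blk-mq-only interface (single ops table, no .uses_mq, no ops.sq). */
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/slab.h>

struct fifo_data {
	spinlock_t lock;
	struct list_head queue;
};

static int fifo_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct elevator_queue *eq = elevator_alloc(q, e);
	struct fifo_data *fd;

	if (!eq)
		return -ENOMEM;
	fd = kzalloc_node(sizeof(*fd), GFP_KERNEL, q->node);
	if (!fd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	spin_lock_init(&fd->lock);
	INIT_LIST_HEAD(&fd->queue);
	eq->elevator_data = fd;
	q->elevator = eq;
	return 0;
}

static void fifo_exit_sched(struct elevator_queue *e)
{
	kfree(e->elevator_data);
}

static void fifo_insert_requests(struct blk_mq_hw_ctx *hctx,
				 struct list_head *list, bool at_head)
{
	struct fifo_data *fd = hctx->queue->elevator->elevator_data;

	spin_lock(&fd->lock);
	if (at_head)
		list_splice_init(list, &fd->queue);
	else
		list_splice_tail_init(list, &fd->queue);
	spin_unlock(&fd->lock);
}

static struct request *fifo_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct fifo_data *fd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock(&fd->lock);
	rq = list_first_entry_or_null(&fd->queue, struct request, queuelist);
	if (rq)
		list_del_init(&rq->queuelist);
	spin_unlock(&fd->lock);
	return rq;
}

static bool fifo_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct fifo_data *fd = hctx->queue->elevator->elevator_data;

	return !list_empty_careful(&fd->queue);
}

static struct elevator_type fifo_sched = {
	.ops = {
		.init_sched		= fifo_init_sched,
		.exit_sched		= fifo_exit_sched,
		.insert_requests	= fifo_insert_requests,
		.dispatch_request	= fifo_dispatch_request,
		.has_work		= fifo_has_work,
	},
	.elevator_name = "fifo-example",
	.elevator_owner = THIS_MODULE,
};

static int __init fifo_init(void)
{
	return elv_register(&fifo_sched);
}

static void __exit fifo_exit(void)
{
	elv_unregister(&fifo_sched);
}

module_init(fifo_init);
module_exit(fifo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Hypothetical FIFO example scheduler");
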
diff --git a/block/partition-generic.c b/block/partition-generic.c
index d3d14e81fb12..8e596a8dff32 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -120,13 +120,9 @@ ssize_t part_stat_show(struct device *dev,
120{ 120{
121 struct hd_struct *p = dev_to_part(dev); 121 struct hd_struct *p = dev_to_part(dev);
122 struct request_queue *q = part_to_disk(p)->queue; 122 struct request_queue *q = part_to_disk(p)->queue;
123 unsigned int inflight[2]; 123 unsigned int inflight;
124 int cpu;
125 124
126 cpu = part_stat_lock(); 125 inflight = part_in_flight(q, p);
127 part_round_stats(q, cpu, p);
128 part_stat_unlock();
129 part_in_flight(q, p, inflight);
130 return sprintf(buf, 126 return sprintf(buf,
131 "%8lu %8lu %8llu %8u " 127 "%8lu %8lu %8llu %8u "
132 "%8lu %8lu %8llu %8u " 128 "%8lu %8lu %8llu %8u "
@@ -141,7 +137,7 @@ ssize_t part_stat_show(struct device *dev,
141 part_stat_read(p, merges[STAT_WRITE]), 137 part_stat_read(p, merges[STAT_WRITE]),
142 (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), 138 (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
143 (unsigned int)part_stat_read_msecs(p, STAT_WRITE), 139 (unsigned int)part_stat_read_msecs(p, STAT_WRITE),
144 inflight[0], 140 inflight,
145 jiffies_to_msecs(part_stat_read(p, io_ticks)), 141 jiffies_to_msecs(part_stat_read(p, io_ticks)),
146 jiffies_to_msecs(part_stat_read(p, time_in_queue)), 142 jiffies_to_msecs(part_stat_read(p, time_in_queue)),
147 part_stat_read(p, ios[STAT_DISCARD]), 143 part_stat_read(p, ios[STAT_DISCARD]),
@@ -249,9 +245,10 @@ struct device_type part_type = {
249 .uevent = part_uevent, 245 .uevent = part_uevent,
250}; 246};
251 247
252static void delete_partition_rcu_cb(struct rcu_head *head) 248static void delete_partition_work_fn(struct work_struct *work)
253{ 249{
254 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); 250 struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct,
251 rcu_work);
255 252
256 part->start_sect = 0; 253 part->start_sect = 0;
257 part->nr_sects = 0; 254 part->nr_sects = 0;
@@ -262,7 +259,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
262void __delete_partition(struct percpu_ref *ref) 259void __delete_partition(struct percpu_ref *ref)
263{ 260{
264 struct hd_struct *part = container_of(ref, struct hd_struct, ref); 261 struct hd_struct *part = container_of(ref, struct hd_struct, ref);
265 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 262 INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn);
263 queue_rcu_work(system_wq, &part->rcu_work);
266} 264}
267 265
268/* 266/*