author     Arianna Avanzini <avanzini.arianna@gmail.com>	2017-04-12 12:23:16 -0400
committer  Jens Axboe <axboe@fb.com>	2017-04-19 10:30:26 -0400
commit     36eca894832351feed9072d0f97eb06fc9482ca4 (patch)
tree       765f768c397d9097c83b32d42f62e3f83c96dcd1 /block/bfq-iosched.c
parent     cfd69712a101f528caad1529e64834e31e5dff62 (diff)
block, bfq: add Early Queue Merge (EQM)
A set of processes may happen to perform interleaved reads, i.e., read requests whose union would give rise to a sequential read pattern. There are two typical cases: first, processes reading fixed-size chunks of data at a fixed distance from each other; second, processes reading variable-size chunks at variable distances. The latter case occurs for example with QEMU, which splits the I/O generated by a guest into multiple chunks, and lets these chunks be served by a pool of I/O threads, iteratively assigning the next chunk of I/O to the first available thread. CFQ denotes as 'cooperating' a set of processes that are doing interleaved I/O, and when it detects cooperating processes, it merges their queues to obtain a sequential I/O pattern from the union of their I/O requests, and hence boost the throughput.

Unfortunately, in the following frequent case, the mechanism implemented in CFQ for detecting cooperating processes and merging their queues is not responsive enough to also handle the fluctuating I/O pattern of the second type of processes. Suppose that one process of the second type issues a request close to the next request to serve of another process of the same type. At that time the two processes would be considered cooperating. But, if the request issued by the first process is to be merged with some other already-queued request, then, from the moment at which this request arrives to the moment when CFQ checks whether the two processes are cooperating, the two processes are likely to be already doing I/O in distant zones of the disk surface or device memory.

CFQ does, however, use preemption to obtain a sequential read pattern out of the read requests performed by the second type of processes too. As a consequence, CFQ uses two different mechanisms to achieve the same goal: boosting the throughput with interleaved I/O.

This patch introduces Early Queue Merge (EQM), a unified mechanism to get a sequential read pattern with both types of processes. The main idea is to immediately check whether a newly arrived request lets some pair of processes become cooperating, both in the case of actual request insertion and, to be responsive to the second type of processes, in the case of request merge. Both types of processes are then handled by just merging their queues.

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Mauro Andreolini <mauro.andreolini@unimore.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
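To illustrate the kind of test EQM relies on, here is a minimal, stand-alone C sketch (not part of the patch): it models the check of whether a newly arrived request starts close enough to another queue's next request for the two queues to look cooperating. The names, the user-space types and the 8-sector threshold are assumptions chosen for this example only; in the patch itself the corresponding check is bfq_rq_close_to_sector() with BFQQ_CLOSE_THR.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Assumed closeness threshold, in sectors (illustrative only). */
#define CLOSE_THR 8ULL

/* Return 1 if two request positions are within CLOSE_THR sectors. */
static int close_to(sector_t a, sector_t b)
{
	sector_t dist = a > b ? a - b : b - a;

	return dist <= CLOSE_THR;
}

int main(void)
{
	/* Next request position of one queue, and a newly arrived request. */
	sector_t next_pos_q1 = 1000;
	sector_t new_req_q2 = 1004;

	if (close_to(new_req_q2, next_pos_q1))
		printf("queues look cooperating: schedule an early merge\n");
	else
		printf("positions too far apart: keep the queues separate\n");

	return 0;
}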
Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--	block/bfq-iosched.c	881
1 files changed, 840 insertions, 41 deletions
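A second aspect of the patch worth previewing is the deferred put of an io_context: bfqd->ioc_to_put is released only after bfqd->lock is dropped (see bfq_unlock_put_ioc() in the diff), so that ioc->lock is never taken while the scheduler lock is held. The stand-alone C sketch below shows the same deferred-release pattern with pthreads standing in for the kernel locks; every name in it is made up for illustration and none of it is kernel code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct resource { int refcount; };

static struct resource *deferred_put;	/* analogous to bfqd->ioc_to_put */
static pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;

/* Dropping the last reference may take other locks; never do it under sched_lock. */
static void resource_put(struct resource *r)
{
	if (--r->refcount == 0)
		free(r);
}

/* Release sched_lock first, then put the deferred resource, mirroring bfq_unlock_put_ioc(). */
static void unlock_and_put(void)
{
	struct resource *to_put = deferred_put;

	deferred_put = NULL;
	pthread_mutex_unlock(&sched_lock);

	if (to_put)
		resource_put(to_put);
}

int main(void)
{
	struct resource *r = malloc(sizeof(*r));

	r->refcount = 1;

	pthread_mutex_lock(&sched_lock);
	deferred_put = r;	/* schedule the put instead of doing it under the lock */
	unlock_and_put();

	printf("deferred put done outside the critical section\n");
	return 0;
}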
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index deb1f21c535f..6e7388a1d220 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -281,11 +281,12 @@ struct bfq_ttime {
281 * struct bfq_queue - leaf schedulable entity. 281 * struct bfq_queue - leaf schedulable entity.
282 * 282 *
283 * A bfq_queue is a leaf request queue; it can be associated with an 283 * A bfq_queue is a leaf request queue; it can be associated with an
284 * io_context or more, if it is async. @cgroup holds a reference to 284 * io_context or more, if it is async or shared between cooperating
285 * the cgroup, to be sure that it does not disappear while a bfqq 285 * processes. @cgroup holds a reference to the cgroup, to be sure that it
286 * still references it (mostly to avoid races between request issuing 286 * does not disappear while a bfqq still references it (mostly to avoid
287 * and task migration followed by cgroup destruction). All the fields 287 * races between request issuing and task migration followed by cgroup
288 * are protected by the queue lock of the containing bfqd. 288 * destruction).
289 * All the fields are protected by the queue lock of the containing bfqd.
289 */ 290 */
290struct bfq_queue { 291struct bfq_queue {
291 /* reference counter */ 292 /* reference counter */
@@ -298,6 +299,16 @@ struct bfq_queue {
298 /* next ioprio and ioprio class if a change is in progress */ 299 /* next ioprio and ioprio class if a change is in progress */
299 unsigned short new_ioprio, new_ioprio_class; 300 unsigned short new_ioprio, new_ioprio_class;
300 301
302 /*
303 * Shared bfq_queue if queue is cooperating with one or more
304 * other queues.
305 */
306 struct bfq_queue *new_bfqq;
307 /* request-position tree member (see bfq_group's @rq_pos_tree) */
308 struct rb_node pos_node;
309 /* request-position tree root (see bfq_group's @rq_pos_tree) */
310 struct rb_root *pos_root;
311
301 /* sorted list of pending requests */ 312 /* sorted list of pending requests */
302 struct rb_root sort_list; 313 struct rb_root sort_list;
303 /* if fifo isn't expired, next request to serve */ 314 /* if fifo isn't expired, next request to serve */
@@ -347,6 +358,12 @@ struct bfq_queue {
347 /* pid of the process owning the queue, used for logging purposes */ 358 /* pid of the process owning the queue, used for logging purposes */
348 pid_t pid; 359 pid_t pid;
349 360
361 /*
362 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
363 * if the queue is shared.
364 */
365 struct bfq_io_cq *bic;
366
350 /* current maximum weight-raising time for this queue */ 367 /* current maximum weight-raising time for this queue */
351 unsigned long wr_cur_max_time; 368 unsigned long wr_cur_max_time;
352 /* 369 /*
@@ -375,10 +392,13 @@ struct bfq_queue {
375 * last transition from idle to backlogged. 392 * last transition from idle to backlogged.
376 */ 393 */
377 unsigned long service_from_backlogged; 394 unsigned long service_from_backlogged;
395
378 /* 396 /*
379 * Value of wr start time when switching to soft rt 397 * Value of wr start time when switching to soft rt
380 */ 398 */
381 unsigned long wr_start_at_switch_to_srt; 399 unsigned long wr_start_at_switch_to_srt;
400
401 unsigned long split_time; /* time of last split */
382}; 402};
383 403
384/** 404/**
@@ -394,6 +414,26 @@ struct bfq_io_cq {
394#ifdef CONFIG_BFQ_GROUP_IOSCHED 414#ifdef CONFIG_BFQ_GROUP_IOSCHED
395 uint64_t blkcg_serial_nr; /* the current blkcg serial */ 415 uint64_t blkcg_serial_nr; /* the current blkcg serial */
396#endif 416#endif
417 /*
418 * Snapshot of the idle window before merging; taken to
419 * remember this value while the queue is merged, so as to be
420 * able to restore it in case of split.
421 */
422 bool saved_idle_window;
423 /*
424	 * Same purpose as the previous field, for the I/O-bound
425 * classification of a queue.
426 */
427 bool saved_IO_bound;
428
429 /*
430 * Similar to previous fields: save wr information.
431 */
432 unsigned long saved_wr_coeff;
433 unsigned long saved_last_wr_start_finish;
434 unsigned long saved_wr_start_at_switch_to_srt;
435 unsigned int saved_wr_cur_max_time;
436 struct bfq_ttime saved_ttime;
397}; 437};
398 438
399enum bfq_device_speed { 439enum bfq_device_speed {
@@ -584,6 +624,15 @@ struct bfq_data {
584 struct bfq_io_cq *bio_bic; 624 struct bfq_io_cq *bio_bic;
585 /* bfqq associated with the task issuing current bio for merging */ 625 /* bfqq associated with the task issuing current bio for merging */
586 struct bfq_queue *bio_bfqq; 626 struct bfq_queue *bio_bfqq;
627
628 /*
629 * io context to put right after bfqd->lock is released. This
630	 * field is used to perform put_io_context, when needed,
631	 * after the scheduler lock has been released, and thus
632	 * prevent an ioc->lock from possibly being taken while the
633 * scheduler lock is being held.
634 */
635 struct io_context *ioc_to_put;
587}; 636};
588 637
589enum bfqq_state_flags { 638enum bfqq_state_flags {
@@ -605,6 +654,8 @@ enum bfqq_state_flags {
605 * may need softrt-next-start 654 * may need softrt-next-start
606 * update 655 * update
607 */ 656 */
657 BFQQF_coop, /* bfqq is shared */
658 BFQQF_split_coop /* shared bfqq will be split */
608}; 659};
609 660
610#define BFQ_BFQQ_FNS(name) \ 661#define BFQ_BFQQ_FNS(name) \
@@ -628,6 +679,8 @@ BFQ_BFQQ_FNS(fifo_expire);
628BFQ_BFQQ_FNS(idle_window); 679BFQ_BFQQ_FNS(idle_window);
629BFQ_BFQQ_FNS(sync); 680BFQ_BFQQ_FNS(sync);
630BFQ_BFQQ_FNS(IO_bound); 681BFQ_BFQQ_FNS(IO_bound);
682BFQ_BFQQ_FNS(coop);
683BFQ_BFQQ_FNS(split_coop);
631BFQ_BFQQ_FNS(softrt_update); 684BFQ_BFQQ_FNS(softrt_update);
632#undef BFQ_BFQQ_FNS 685#undef BFQ_BFQQ_FNS
633 686
@@ -738,6 +791,9 @@ struct bfq_group_data {
738 * to avoid too many special cases during group creation/ 791 * to avoid too many special cases during group creation/
739 * migration. 792 * migration.
740 * @stats: stats for this bfqg. 793 * @stats: stats for this bfqg.
794 * @rq_pos_tree: rbtree sorted by next_request position, used when
795 * determining if two or more queues have interleaving
796 * requests (see bfq_find_close_cooperator()).
741 * 797 *
742 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup 798 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
743 * there is a set of bfq_groups, each one collecting the lower-level 799 * there is a set of bfq_groups, each one collecting the lower-level
@@ -762,6 +818,8 @@ struct bfq_group {
762 818
763 struct bfq_entity *my_entity; 819 struct bfq_entity *my_entity;
764 820
821 struct rb_root rq_pos_tree;
822
765 struct bfqg_stats stats; 823 struct bfqg_stats stats;
766}; 824};
767 825
@@ -811,6 +869,27 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
811 return bic->icq.q->elevator->elevator_data; 869 return bic->icq.q->elevator->elevator_data;
812} 870}
813 871
872#ifdef CONFIG_BFQ_GROUP_IOSCHED
873
874static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
875{
876 struct bfq_entity *group_entity = bfqq->entity.parent;
877
878 if (!group_entity)
879 group_entity = &bfqq->bfqd->root_group->entity;
880
881 return container_of(group_entity, struct bfq_group, entity);
882}
883
884#else
885
886static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
887{
888 return bfqq->bfqd->root_group;
889}
890
891#endif
892
814static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); 893static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
815static void bfq_put_queue(struct bfq_queue *bfqq); 894static void bfq_put_queue(struct bfq_queue *bfqq);
816static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, 895static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
@@ -975,6 +1054,34 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd)
975 } 1054 }
976} 1055}
977 1056
1057/*
1058 * Next two functions release bfqd->lock and put the io context
1059	 * pointed to by bfqd->ioc_to_put. This delayed put avoids the risk
1060	 * of taking an ioc->lock while the scheduler lock is being held.
1061 */
1062static void bfq_unlock_put_ioc(struct bfq_data *bfqd)
1063{
1064 struct io_context *ioc_to_put = bfqd->ioc_to_put;
1065
1066 bfqd->ioc_to_put = NULL;
1067 spin_unlock_irq(&bfqd->lock);
1068
1069 if (ioc_to_put)
1070 put_io_context(ioc_to_put);
1071}
1072
1073static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd,
1074 unsigned long flags)
1075{
1076 struct io_context *ioc_to_put = bfqd->ioc_to_put;
1077
1078 bfqd->ioc_to_put = NULL;
1079 spin_unlock_irqrestore(&bfqd->lock, flags);
1080
1081 if (ioc_to_put)
1082 put_io_context(ioc_to_put);
1083}
1084
978/** 1085/**
979 * bfq_gt - compare two timestamps. 1086 * bfq_gt - compare two timestamps.
980 * @a: first ts. 1087 * @a: first ts.
@@ -2425,7 +2532,14 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
2425 struct bfq_entity *entity = in_serv_entity; 2532 struct bfq_entity *entity = in_serv_entity;
2426 2533
2427 if (bfqd->in_service_bic) { 2534 if (bfqd->in_service_bic) {
2428 put_io_context(bfqd->in_service_bic->icq.ioc); 2535 /*
2536 * Schedule the release of a reference to
2537 * bfqd->in_service_bic->icq.ioc to right after the
2538 * scheduler lock is released. This ioc is not
2539	 * released immediately, so as not to risk taking
2540 * an ioc->lock while holding the scheduler lock.
2541 */
2542 bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc;
2429 bfqd->in_service_bic = NULL; 2543 bfqd->in_service_bic = NULL;
2430 } 2544 }
2431 2545
@@ -2914,6 +3028,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
2914 * in bfq_init_queue() 3028 * in bfq_init_queue()
2915 */ 3029 */
2916 bfqg->bfqd = bfqd; 3030 bfqg->bfqd = bfqd;
3031 bfqg->rq_pos_tree = RB_ROOT;
2917} 3032}
2918 3033
2919static void bfq_pd_free(struct blkg_policy_data *pd) 3034static void bfq_pd_free(struct blkg_policy_data *pd)
@@ -2982,6 +3097,8 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
2982 return bfqg; 3097 return bfqg;
2983} 3098}
2984 3099
3100static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
3101 struct bfq_queue *bfqq);
2985static void bfq_bfqq_expire(struct bfq_data *bfqd, 3102static void bfq_bfqq_expire(struct bfq_data *bfqd,
2986 struct bfq_queue *bfqq, 3103 struct bfq_queue *bfqq,
2987 bool compensate, 3104 bool compensate,
@@ -3030,8 +3147,10 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3030 entity->sched_data = &bfqg->sched_data; 3147 entity->sched_data = &bfqg->sched_data;
3031 bfqg_get(bfqg); 3148 bfqg_get(bfqg);
3032 3149
3033 if (bfq_bfqq_busy(bfqq)) 3150 if (bfq_bfqq_busy(bfqq)) {
3151 bfq_pos_tree_add_move(bfqd, bfqq);
3034 bfq_activate_bfqq(bfqd, bfqq); 3152 bfq_activate_bfqq(bfqd, bfqq);
3153 }
3035 3154
3036 if (!bfqd->in_service_queue && !bfqd->rq_in_driver) 3155 if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
3037 bfq_schedule_dispatch(bfqd); 3156 bfq_schedule_dispatch(bfqd);
@@ -3071,8 +3190,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
3071 bic_set_bfqq(bic, NULL, 0); 3190 bic_set_bfqq(bic, NULL, 0);
3072 bfq_log_bfqq(bfqd, async_bfqq, 3191 bfq_log_bfqq(bfqd, async_bfqq,
3073 "bic_change_group: %p %d", 3192 "bic_change_group: %p %d",
3074 async_bfqq, 3193 async_bfqq, async_bfqq->ref);
3075 async_bfqq->ref);
3076 bfq_put_queue(async_bfqq); 3194 bfq_put_queue(async_bfqq);
3077 } 3195 }
3078 } 3196 }
@@ -3214,7 +3332,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
3214 __bfq_deactivate_entity(entity, false); 3332 __bfq_deactivate_entity(entity, false);
3215 bfq_put_async_queues(bfqd, bfqg); 3333 bfq_put_async_queues(bfqd, bfqg);
3216 3334
3217 spin_unlock_irqrestore(&bfqd->lock, flags); 3335 bfq_unlock_put_ioc_restore(bfqd, flags);
3218 /* 3336 /*
3219 * @blkg is going offline and will be ignored by 3337 * @blkg is going offline and will be ignored by
3220 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so 3338 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
@@ -3731,6 +3849,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
3731 } 3849 }
3732} 3850}
3733 3851
3852static struct bfq_queue *
3853bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
3854 sector_t sector, struct rb_node **ret_parent,
3855 struct rb_node ***rb_link)
3856{
3857 struct rb_node **p, *parent;
3858 struct bfq_queue *bfqq = NULL;
3859
3860 parent = NULL;
3861 p = &root->rb_node;
3862 while (*p) {
3863 struct rb_node **n;
3864
3865 parent = *p;
3866 bfqq = rb_entry(parent, struct bfq_queue, pos_node);
3867
3868 /*
3869 * Sort strictly based on sector. Smallest to the left,
3870 * largest to the right.
3871 */
3872 if (sector > blk_rq_pos(bfqq->next_rq))
3873 n = &(*p)->rb_right;
3874 else if (sector < blk_rq_pos(bfqq->next_rq))
3875 n = &(*p)->rb_left;
3876 else
3877 break;
3878 p = n;
3879 bfqq = NULL;
3880 }
3881
3882 *ret_parent = parent;
3883 if (rb_link)
3884 *rb_link = p;
3885
3886 bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
3887 (unsigned long long)sector,
3888 bfqq ? bfqq->pid : 0);
3889
3890 return bfqq;
3891}
3892
3893static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3894{
3895 struct rb_node **p, *parent;
3896 struct bfq_queue *__bfqq;
3897
3898 if (bfqq->pos_root) {
3899 rb_erase(&bfqq->pos_node, bfqq->pos_root);
3900 bfqq->pos_root = NULL;
3901 }
3902
3903 if (bfq_class_idle(bfqq))
3904 return;
3905 if (!bfqq->next_rq)
3906 return;
3907
3908 bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
3909 __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
3910 blk_rq_pos(bfqq->next_rq), &parent, &p);
3911 if (!__bfqq) {
3912 rb_link_node(&bfqq->pos_node, parent, p);
3913 rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
3914 } else
3915 bfqq->pos_root = NULL;
3916}
3917
3734/* 3918/*
3735 * Return expired entry, or NULL to just start from scratch in rbtree. 3919 * Return expired entry, or NULL to just start from scratch in rbtree.
3736 */ 3920 */
@@ -3837,6 +4021,43 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
3837 } 4021 }
3838} 4022}
3839 4023
4024static void
4025bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
4026{
4027 if (bic->saved_idle_window)
4028 bfq_mark_bfqq_idle_window(bfqq);
4029 else
4030 bfq_clear_bfqq_idle_window(bfqq);
4031
4032 if (bic->saved_IO_bound)
4033 bfq_mark_bfqq_IO_bound(bfqq);
4034 else
4035 bfq_clear_bfqq_IO_bound(bfqq);
4036
4037 bfqq->ttime = bic->saved_ttime;
4038 bfqq->wr_coeff = bic->saved_wr_coeff;
4039 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
4040 bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
4041 bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
4042
4043 if (bfqq->wr_coeff > 1 &&
4044 time_is_before_jiffies(bfqq->last_wr_start_finish +
4045 bfqq->wr_cur_max_time)) {
4046 bfq_log_bfqq(bfqq->bfqd, bfqq,
4047 "resume state: switching off wr");
4048
4049 bfqq->wr_coeff = 1;
4050 }
4051
4052 /* make sure weight will be updated, however we got here */
4053 bfqq->entity.prio_changed = 1;
4054}
4055
4056static int bfqq_process_refs(struct bfq_queue *bfqq)
4057{
4058 return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
4059}
4060
3840static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) 4061static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
3841{ 4062{
3842 struct bfq_entity *entity = &bfqq->entity; 4063 struct bfq_entity *entity = &bfqq->entity;
@@ -4157,14 +4378,16 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
4157 /* 4378 /*
4158 * bfqq deserves to be weight-raised if: 4379 * bfqq deserves to be weight-raised if:
4159 * - it is sync, 4380 * - it is sync,
4160 * - it has been idle for enough time or is soft real-time. 4381 * - it has been idle for enough time or is soft real-time,
4382 * - is linked to a bfq_io_cq (it is not shared in any sense).
4161 */ 4383 */
4162 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && 4384 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
4163 time_is_before_jiffies(bfqq->soft_rt_next_start); 4385 time_is_before_jiffies(bfqq->soft_rt_next_start);
4164 *interactive = idle_for_long_time; 4386 *interactive = idle_for_long_time;
4165 wr_or_deserves_wr = bfqd->low_latency && 4387 wr_or_deserves_wr = bfqd->low_latency &&
4166 (bfqq->wr_coeff > 1 || 4388 (bfqq->wr_coeff > 1 ||
4167 (bfq_bfqq_sync(bfqq) && (*interactive || soft_rt))); 4389 (bfq_bfqq_sync(bfqq) &&
4390 bfqq->bic && (*interactive || soft_rt)));
4168 4391
4169 /* 4392 /*
4170 * Using the last flag, update budget and check whether bfqq 4393 * Using the last flag, update budget and check whether bfqq
@@ -4186,14 +4409,22 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
4186 } 4409 }
4187 4410
4188 if (bfqd->low_latency) { 4411 if (bfqd->low_latency) {
4189 bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, 4412 if (unlikely(time_is_after_jiffies(bfqq->split_time)))
4190 old_wr_coeff, 4413 /* wraparound */
4191 wr_or_deserves_wr, 4414 bfqq->split_time =
4192 *interactive, 4415 jiffies - bfqd->bfq_wr_min_idle_time - 1;
4193 soft_rt); 4416
4194 4417 if (time_is_before_jiffies(bfqq->split_time +
4195 if (old_wr_coeff != bfqq->wr_coeff) 4418 bfqd->bfq_wr_min_idle_time)) {
4196 bfqq->entity.prio_changed = 1; 4419 bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
4420 old_wr_coeff,
4421 wr_or_deserves_wr,
4422 *interactive,
4423 soft_rt);
4424
4425 if (old_wr_coeff != bfqq->wr_coeff)
4426 bfqq->entity.prio_changed = 1;
4427 }
4197 } 4428 }
4198 4429
4199 bfqq->last_idle_bklogged = jiffies; 4430 bfqq->last_idle_bklogged = jiffies;
@@ -4240,6 +4471,12 @@ static void bfq_add_request(struct request *rq)
4240 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); 4471 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
4241 bfqq->next_rq = next_rq; 4472 bfqq->next_rq = next_rq;
4242 4473
4474 /*
4475 * Adjust priority tree position, if next_rq changes.
4476 */
4477 if (prev != bfqq->next_rq)
4478 bfq_pos_tree_add_move(bfqd, bfqq);
4479
4243 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ 4480 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
4244 bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, 4481 bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
4245 rq, &interactive); 4482 rq, &interactive);
@@ -4368,6 +4605,14 @@ static void bfq_remove_request(struct request_queue *q,
4368 */ 4605 */
4369 bfqq->entity.budget = bfqq->entity.service = 0; 4606 bfqq->entity.budget = bfqq->entity.service = 0;
4370 } 4607 }
4608
4609 /*
4610 * Remove queue from request-position tree as it is empty.
4611 */
4612 if (bfqq->pos_root) {
4613 rb_erase(&bfqq->pos_node, bfqq->pos_root);
4614 bfqq->pos_root = NULL;
4615 }
4371 } 4616 }
4372 4617
4373 if (rq->cmd_flags & REQ_META) 4618 if (rq->cmd_flags & REQ_META)
@@ -4445,11 +4690,14 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
4445 bfqd->last_position); 4690 bfqd->last_position);
4446 bfqq->next_rq = next_rq; 4691 bfqq->next_rq = next_rq;
4447 /* 4692 /*
4448 * If next_rq changes, update the queue's budget to fit 4693 * If next_rq changes, update both the queue's budget to
4449 * the new request. 4694 * fit the new request and the queue's position in its
4695 * rq_pos_tree.
4450 */ 4696 */
4451 if (prev != bfqq->next_rq) 4697 if (prev != bfqq->next_rq) {
4452 bfq_updated_next_req(bfqd, bfqq); 4698 bfq_updated_next_req(bfqd, bfqq);
4699 bfq_pos_tree_add_move(bfqd, bfqq);
4700 }
4453 } 4701 }
4454} 4702}
4455 4703
@@ -4532,12 +4780,364 @@ static void bfq_end_wr(struct bfq_data *bfqd)
4532 spin_unlock_irq(&bfqd->lock); 4780 spin_unlock_irq(&bfqd->lock);
4533} 4781}
4534 4782
4783static sector_t bfq_io_struct_pos(void *io_struct, bool request)
4784{
4785 if (request)
4786 return blk_rq_pos(io_struct);
4787 else
4788 return ((struct bio *)io_struct)->bi_iter.bi_sector;
4789}
4790
4791static int bfq_rq_close_to_sector(void *io_struct, bool request,
4792 sector_t sector)
4793{
4794 return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
4795 BFQQ_CLOSE_THR;
4796}
4797
4798static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
4799 struct bfq_queue *bfqq,
4800 sector_t sector)
4801{
4802 struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
4803 struct rb_node *parent, *node;
4804 struct bfq_queue *__bfqq;
4805
4806 if (RB_EMPTY_ROOT(root))
4807 return NULL;
4808
4809 /*
4810 * First, if we find a request starting at the end of the last
4811 * request, choose it.
4812 */
4813 __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
4814 if (__bfqq)
4815 return __bfqq;
4816
4817 /*
4818 * If the exact sector wasn't found, the parent of the NULL leaf
4819 * will contain the closest sector (rq_pos_tree sorted by
4820 * next_request position).
4821 */
4822 __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
4823 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
4824 return __bfqq;
4825
4826 if (blk_rq_pos(__bfqq->next_rq) < sector)
4827 node = rb_next(&__bfqq->pos_node);
4828 else
4829 node = rb_prev(&__bfqq->pos_node);
4830 if (!node)
4831 return NULL;
4832
4833 __bfqq = rb_entry(node, struct bfq_queue, pos_node);
4834 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
4835 return __bfqq;
4836
4837 return NULL;
4838}
4839
4840static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
4841 struct bfq_queue *cur_bfqq,
4842 sector_t sector)
4843{
4844 struct bfq_queue *bfqq;
4845
4846 /*
4847 * We shall notice if some of the queues are cooperating,
4848 * e.g., working closely on the same area of the device. In
4849 * that case, we can group them together and: 1) don't waste
4850 * time idling, and 2) serve the union of their requests in
4851 * the best possible order for throughput.
4852 */
4853 bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
4854 if (!bfqq || bfqq == cur_bfqq)
4855 return NULL;
4856
4857 return bfqq;
4858}
4859
4860static struct bfq_queue *
4861bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
4862{
4863 int process_refs, new_process_refs;
4864 struct bfq_queue *__bfqq;
4865
4866 /*
4867 * If there are no process references on the new_bfqq, then it is
4868 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
4869 * may have dropped their last reference (not just their last process
4870 * reference).
4871 */
4872 if (!bfqq_process_refs(new_bfqq))
4873 return NULL;
4874
4875 /* Avoid a circular list and skip interim queue merges. */
4876 while ((__bfqq = new_bfqq->new_bfqq)) {
4877 if (__bfqq == bfqq)
4878 return NULL;
4879 new_bfqq = __bfqq;
4880 }
4881
4882 process_refs = bfqq_process_refs(bfqq);
4883 new_process_refs = bfqq_process_refs(new_bfqq);
4884 /*
4885 * If the process for the bfqq has gone away, there is no
4886 * sense in merging the queues.
4887 */
4888 if (process_refs == 0 || new_process_refs == 0)
4889 return NULL;
4890
4891 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
4892 new_bfqq->pid);
4893
4894 /*
4895 * Merging is just a redirection: the requests of the process
4896 * owning one of the two queues are redirected to the other queue.
4897 * The latter queue, in its turn, is set as shared if this is the
4898 * first time that the requests of some process are redirected to
4899 * it.
4900 *
4901 * We redirect bfqq to new_bfqq and not the opposite, because we
4902 * are in the context of the process owning bfqq, hence we have
4903 * the io_cq of this process. So we can immediately configure this
4904 * io_cq to redirect the requests of the process to new_bfqq.
4905 *
4906 * NOTE, even if new_bfqq coincides with the in-service queue, the
4907 * io_cq of new_bfqq is not available, because, if the in-service
4908 * queue is shared, bfqd->in_service_bic may not point to the
4909 * io_cq of the in-service queue.
4910 * Redirecting the requests of the process owning bfqq to the
4911 * currently in-service queue is in any case the best option, as
4912 * we feed the in-service queue with new requests close to the
4913 * last request served and, by doing so, hopefully increase the
4914 * throughput.
4915 */
4916 bfqq->new_bfqq = new_bfqq;
4917 new_bfqq->ref += process_refs;
4918 return new_bfqq;
4919}
4920
4921static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
4922 struct bfq_queue *new_bfqq)
4923{
4924 if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
4925 (bfqq->ioprio_class != new_bfqq->ioprio_class))
4926 return false;
4927
4928 /*
4929 * If either of the queues has already been detected as seeky,
4930 * then merging it with the other queue is unlikely to lead to
4931 * sequential I/O.
4932 */
4933 if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
4934 return false;
4935
4936 /*
4937 * Interleaved I/O is known to be done by (some) applications
4938 * only for reads, so it does not make sense to merge async
4939 * queues.
4940 */
4941 if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
4942 return false;
4943
4944 return true;
4945}
4946
4947/*
4948 * If this function returns true, then bfqq cannot be merged. The idea
4949 * is that true cooperation happens very early after processes start
4950 * to do I/O. Usually, late cooperations are just accidental false
4951 * positives. In case bfqq is weight-raised, such false positives
4952 * would evidently degrade latency guarantees for bfqq.
4953 */
4954static bool wr_from_too_long(struct bfq_queue *bfqq)
4955{
4956 return bfqq->wr_coeff > 1 &&
4957 time_is_before_jiffies(bfqq->last_wr_start_finish +
4958 msecs_to_jiffies(100));
4959}
4960
4961/*
4962 * Attempt to schedule a merge of bfqq with the currently in-service
4963 * queue or with a close queue among the scheduled queues. Return
4964 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
4965 * structure otherwise.
4966 *
4967	 * The OOM queue is not allowed to participate in cooperation: in fact, since
4968 * the requests temporarily redirected to the OOM queue could be redirected
4969 * again to dedicated queues at any time, the state needed to correctly
4970 * handle merging with the OOM queue would be quite complex and expensive
4971	 * to maintain. Besides, in a critical condition such as out of memory,
4972	 * the benefits of queue merging may be of little relevance, or even negligible.
4973 *
4974 * Weight-raised queues can be merged only if their weight-raising
4975 * period has just started. In fact cooperating processes are usually
4976 * started together. Thus, with this filter we avoid false positives
4977 * that would jeopardize low-latency guarantees.
4978 *
4979 * WARNING: queue merging may impair fairness among non-weight raised
4980 * queues, for at least two reasons: 1) the original weight of a
4981	 * merged queue may change during the merged state, 2) even if the
4982	 * weight stays the same, a merged queue may be bloated with many more
4983 * requests than the ones produced by its originally-associated
4984 * process.
4985 */
4986static struct bfq_queue *
4987bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4988 void *io_struct, bool request)
4989{
4990 struct bfq_queue *in_service_bfqq, *new_bfqq;
4991
4992 if (bfqq->new_bfqq)
4993 return bfqq->new_bfqq;
4994
4995 if (!io_struct ||
4996 wr_from_too_long(bfqq) ||
4997 unlikely(bfqq == &bfqd->oom_bfqq))
4998 return NULL;
4999
5000 /* If there is only one backlogged queue, don't search. */
5001 if (bfqd->busy_queues == 1)
5002 return NULL;
5003
5004 in_service_bfqq = bfqd->in_service_queue;
5005
5006 if (!in_service_bfqq || in_service_bfqq == bfqq ||
5007 !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
5008 unlikely(in_service_bfqq == &bfqd->oom_bfqq))
5009 goto check_scheduled;
5010
5011 if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
5012 bfqq->entity.parent == in_service_bfqq->entity.parent &&
5013 bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
5014 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
5015 if (new_bfqq)
5016 return new_bfqq;
5017 }
5018 /*
5019 * Check whether there is a cooperator among currently scheduled
5020 * queues. The only thing we need is that the bio/request is not
5021 * NULL, as we need it to establish whether a cooperator exists.
5022 */
5023check_scheduled:
5024 new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
5025 bfq_io_struct_pos(io_struct, request));
5026
5027 if (new_bfqq && !wr_from_too_long(new_bfqq) &&
5028 likely(new_bfqq != &bfqd->oom_bfqq) &&
5029 bfq_may_be_close_cooperator(bfqq, new_bfqq))
5030 return bfq_setup_merge(bfqq, new_bfqq);
5031
5032 return NULL;
5033}
5034
5035static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
5036{
5037 struct bfq_io_cq *bic = bfqq->bic;
5038
5039 /*
5040 * If !bfqq->bic, the queue is already shared or its requests
5041 * have already been redirected to a shared queue; both idle window
5042 * and weight raising state have already been saved. Do nothing.
5043 */
5044 if (!bic)
5045 return;
5046
5047 bic->saved_ttime = bfqq->ttime;
5048 bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
5049 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
5050 bic->saved_wr_coeff = bfqq->wr_coeff;
5051 bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
5052 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
5053 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
5054}
5055
5056static void bfq_get_bic_reference(struct bfq_queue *bfqq)
5057{
5058 /*
5059 * If bfqq->bic has a non-NULL value, the bic to which it belongs
5060 * is about to begin using a shared bfq_queue.
5061 */
5062 if (bfqq->bic)
5063 atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
5064}
5065
5066static void
5067bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
5068 struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
5069{
5070 bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
5071 (unsigned long)new_bfqq->pid);
5072 /* Save weight raising and idle window of the merged queues */
5073 bfq_bfqq_save_state(bfqq);
5074 bfq_bfqq_save_state(new_bfqq);
5075 if (bfq_bfqq_IO_bound(bfqq))
5076 bfq_mark_bfqq_IO_bound(new_bfqq);
5077 bfq_clear_bfqq_IO_bound(bfqq);
5078
5079 /*
5080 * If bfqq is weight-raised, then let new_bfqq inherit
5081 * weight-raising. To reduce false positives, neglect the case
5082 * where bfqq has just been created, but has not yet made it
5083 * to be weight-raised (which may happen because EQM may merge
5084 * bfqq even before bfq_add_request is executed for the first
5085 * time for bfqq).
5086 */
5087 if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
5088 new_bfqq->wr_coeff = bfqq->wr_coeff;
5089 new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
5090 new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
5091 new_bfqq->wr_start_at_switch_to_srt =
5092 bfqq->wr_start_at_switch_to_srt;
5093 if (bfq_bfqq_busy(new_bfqq))
5094 bfqd->wr_busy_queues++;
5095 new_bfqq->entity.prio_changed = 1;
5096 }
5097
5098 if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
5099 bfqq->wr_coeff = 1;
5100 bfqq->entity.prio_changed = 1;
5101 if (bfq_bfqq_busy(bfqq))
5102 bfqd->wr_busy_queues--;
5103 }
5104
5105 bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
5106 bfqd->wr_busy_queues);
5107
5108 /*
5109 * Grab a reference to the bic, to prevent it from being destroyed
5110 * before being possibly touched by a bfq_split_bfqq().
5111 */
5112 bfq_get_bic_reference(bfqq);
5113 bfq_get_bic_reference(new_bfqq);
5114 /*
5115 * Merge queues (that is, let bic redirect its requests to new_bfqq)
5116 */
5117 bic_set_bfqq(bic, new_bfqq, 1);
5118 bfq_mark_bfqq_coop(new_bfqq);
5119 /*
5120 * new_bfqq now belongs to at least two bics (it is a shared queue):
5121 * set new_bfqq->bic to NULL. bfqq either:
5122 * - does not belong to any bic any more, and hence bfqq->bic must
5123 * be set to NULL, or
5124 * - is a queue whose owning bics have already been redirected to a
5125 * different queue, hence the queue is destined to not belong to
5126 * any bic soon and bfqq->bic is already NULL (therefore the next
5127 * assignment causes no harm).
5128 */
5129 new_bfqq->bic = NULL;
5130 bfqq->bic = NULL;
5131 /* release process reference to bfqq */
5132 bfq_put_queue(bfqq);
5133}
5134
4535static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, 5135static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
4536 struct bio *bio) 5136 struct bio *bio)
4537{ 5137{
4538 struct bfq_data *bfqd = q->elevator->elevator_data; 5138 struct bfq_data *bfqd = q->elevator->elevator_data;
4539 bool is_sync = op_is_sync(bio->bi_opf); 5139 bool is_sync = op_is_sync(bio->bi_opf);
4540 struct bfq_queue *bfqq = bfqd->bio_bfqq; 5140 struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;
4541 5141
4542 /* 5142 /*
4543 * Disallow merge of a sync bio into an async request. 5143 * Disallow merge of a sync bio into an async request.
@@ -4552,6 +5152,37 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
4552 if (!bfqq) 5152 if (!bfqq)
4553 return false; 5153 return false;
4554 5154
5155 /*
5156 * We take advantage of this function to perform an early merge
5157 * of the queues of possible cooperating processes.
5158 */
5159 new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
5160 if (new_bfqq) {
5161 /*
5162	 * If bic still points to bfqq, it has not yet been
5163	 * redirected to some other bfq_queue, and a queue
5164	 * merge between bfqq and new_bfqq can be safely
5165	 * fulfilled, i.e., bic can be redirected to new_bfqq
5166 * and bfqq can be put.
5167 */
5168 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
5169 new_bfqq);
5170 /*
5171 * If we get here, bio will be queued into new_queue,
5172 * so use new_bfqq to decide whether bio and rq can be
5173 * merged.
5174 */
5175 bfqq = new_bfqq;
5176
5177 /*
5178	 * Also change bfqd->bio_bfqq, as
5179 * bfqd->bio_bic now points to new_bfqq, and
5180 * this function may be invoked again (and then may
5181	 * use bfqd->bio_bfqq again).
5182 */
5183 bfqd->bio_bfqq = bfqq;
5184 }
5185
4555 return bfqq == RQ_BFQQ(rq); 5186 return bfqq == RQ_BFQQ(rq);
4556} 5187}
4557 5188
@@ -4959,6 +5590,15 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
4959 5590
4960static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) 5591static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4961{ 5592{
5593 /*
5594 * If this bfqq is shared between multiple processes, check
5595 * to make sure that those processes are still issuing I/Os
5596 * within the mean seek distance. If not, it may be time to
5597 * break the queues apart again.
5598 */
5599 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
5600 bfq_mark_bfqq_split_coop(bfqq);
5601
4962 if (RB_EMPTY_ROOT(&bfqq->sort_list)) { 5602 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
4963 if (bfqq->dispatched == 0) 5603 if (bfqq->dispatched == 0)
4964 /* 5604 /*
@@ -4970,8 +5610,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4970 bfqq->budget_timeout = jiffies; 5610 bfqq->budget_timeout = jiffies;
4971 5611
4972 bfq_del_bfqq_busy(bfqd, bfqq, true); 5612 bfq_del_bfqq_busy(bfqd, bfqq, true);
4973 } else 5613 } else {
4974 bfq_requeue_bfqq(bfqd, bfqq); 5614 bfq_requeue_bfqq(bfqd, bfqq);
5615 /*
5616 * Resort priority tree of potential close cooperators.
5617 */
5618 bfq_pos_tree_add_move(bfqd, bfqq);
5619 }
4975 5620
4976 /* 5621 /*
4977 * All in-service entities must have been properly deactivated 5622 * All in-service entities must have been properly deactivated
@@ -5792,8 +6437,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5792 6437
5793 /* 6438 /*
5794 * If too much time has elapsed from the beginning of 6439 * If too much time has elapsed from the beginning of
5795 * this weight-raising period, then end weight 6440 * this weight-raising period, then end weight raising.
5796 * raising.
5797 */ 6441 */
5798 if (time_is_before_jiffies(bfqq->last_wr_start_finish + 6442 if (time_is_before_jiffies(bfqq->last_wr_start_finish +
5799 bfqq->wr_cur_max_time)) { 6443 bfqq->wr_cur_max_time)) {
@@ -5969,8 +6613,9 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
5969 struct request *rq; 6613 struct request *rq;
5970 6614
5971 spin_lock_irq(&bfqd->lock); 6615 spin_lock_irq(&bfqd->lock);
6616
5972 rq = __bfq_dispatch_request(hctx); 6617 rq = __bfq_dispatch_request(hctx);
5973 spin_unlock_irq(&bfqd->lock); 6618 bfq_unlock_put_ioc(bfqd);
5974 6619
5975 return rq; 6620 return rq;
5976} 6621}
@@ -6004,6 +6649,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
6004#endif 6649#endif
6005} 6650}
6006 6651
6652static void bfq_put_cooperator(struct bfq_queue *bfqq)
6653{
6654 struct bfq_queue *__bfqq, *next;
6655
6656 /*
6657 * If this queue was scheduled to merge with another queue, be
6658 * sure to drop the reference taken on that queue (and others in
6659 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
6660 */
6661 __bfqq = bfqq->new_bfqq;
6662 while (__bfqq) {
6663 if (__bfqq == bfqq)
6664 break;
6665 next = __bfqq->new_bfqq;
6666 bfq_put_queue(__bfqq);
6667 __bfqq = next;
6668 }
6669}
6670
6007static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) 6671static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6008{ 6672{
6009 if (bfqq == bfqd->in_service_queue) { 6673 if (bfqq == bfqd->in_service_queue) {
@@ -6013,6 +6677,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6013 6677
6014 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); 6678 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
6015 6679
6680 bfq_put_cooperator(bfqq);
6681
6016 bfq_put_queue(bfqq); /* release process reference */ 6682 bfq_put_queue(bfqq); /* release process reference */
6017} 6683}
6018 6684
@@ -6028,9 +6694,20 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
6028 unsigned long flags; 6694 unsigned long flags;
6029 6695
6030 spin_lock_irqsave(&bfqd->lock, flags); 6696 spin_lock_irqsave(&bfqd->lock, flags);
6697 /*
6698 * If the bic is using a shared queue, put the
6699 * reference taken on the io_context when the bic
6700 * started using a shared bfq_queue. This put cannot
6701	 * make ioc->ref_count reach 0, so no ioc->lock
6702	 * risks being taken (leading to possible deadlock
6703 * scenarios).
6704 */
6705 if (is_sync && bfq_bfqq_coop(bfqq))
6706 put_io_context(bic->icq.ioc);
6707
6031 bfq_exit_bfqq(bfqd, bfqq); 6708 bfq_exit_bfqq(bfqd, bfqq);
6032 bic_set_bfqq(bic, NULL, is_sync); 6709 bic_set_bfqq(bic, NULL, is_sync);
6033 spin_unlock_irq(&bfqd->lock); 6710 bfq_unlock_put_ioc_restore(bfqd, flags);
6034 } 6711 }
6035} 6712}
6036 6713
@@ -6152,8 +6829,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6152 bfqq->budget_timeout = bfq_smallest_from_now(); 6829 bfqq->budget_timeout = bfq_smallest_from_now();
6153 6830
6154 bfqq->wr_coeff = 1; 6831 bfqq->wr_coeff = 1;
6155 bfqq->last_wr_start_finish = bfq_smallest_from_now(); 6832 bfqq->last_wr_start_finish = jiffies;
6156 bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); 6833 bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
6834 bfqq->split_time = bfq_smallest_from_now();
6157 6835
6158 /* 6836 /*
6159 * Set to the value for which bfqq will not be deemed as 6837 * Set to the value for which bfqq will not be deemed as
@@ -6288,6 +6966,11 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
6288 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) 6966 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
6289 return; 6967 return;
6290 6968
6969 /* Idle window just restored, statistics are meaningless. */
6970 if (time_is_after_eq_jiffies(bfqq->split_time +
6971 bfqd->bfq_wr_min_idle_time))
6972 return;
6973
6291 enable_idle = bfq_bfqq_idle_window(bfqq); 6974 enable_idle = bfq_bfqq_idle_window(bfqq);
6292 6975
6293 if (atomic_read(&bic->icq.ioc->active_ref) == 0 || 6976 if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
@@ -6383,7 +7066,38 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6383 7066
6384static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) 7067static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
6385{ 7068{
6386 struct bfq_queue *bfqq = RQ_BFQQ(rq); 7069 struct bfq_queue *bfqq = RQ_BFQQ(rq),
7070 *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
7071
7072 if (new_bfqq) {
7073 if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
7074 new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
7075 /*
7076 * Release the request's reference to the old bfqq
7077 * and make sure one is taken to the shared queue.
7078 */
7079 new_bfqq->allocated++;
7080 bfqq->allocated--;
7081 new_bfqq->ref++;
7082 /*
7083 * If the bic associated with the process
7084 * issuing this request still points to bfqq
7085 * (and thus has not been already redirected
7086 * to new_bfqq or even some other bfq_queue),
7087 * then complete the merge and redirect it to
7088 * new_bfqq.
7089 */
7090 if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
7091 bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
7092 bfqq, new_bfqq);
7093 /*
7094 * rq is about to be enqueued into new_bfqq,
7095 * release rq reference on bfqq
7096 */
7097 bfq_put_queue(bfqq);
7098 rq->elv.priv[1] = new_bfqq;
7099 bfqq = new_bfqq;
7100 }
6387 7101
6388 bfq_add_request(rq); 7102 bfq_add_request(rq);
6389 7103
@@ -6425,7 +7139,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
6425 } 7139 }
6426 } 7140 }
6427 7141
6428 spin_unlock_irq(&bfqd->lock); 7142 bfq_unlock_put_ioc(bfqd);
6429} 7143}
6430 7144
6431static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, 7145static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
@@ -6576,7 +7290,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
6576 bfq_completed_request(bfqq, bfqd); 7290 bfq_completed_request(bfqq, bfqd);
6577 bfq_put_rq_priv_body(bfqq); 7291 bfq_put_rq_priv_body(bfqq);
6578 7292
6579 spin_unlock_irqrestore(&bfqd->lock, flags); 7293 bfq_unlock_put_ioc_restore(bfqd, flags);
6580 } else { 7294 } else {
6581 /* 7295 /*
6582 * Request rq may be still/already in the scheduler, 7296 * Request rq may be still/already in the scheduler,
@@ -6600,6 +7314,55 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
6600} 7314}
6601 7315
6602/* 7316/*
7317 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
7318 * was the last process referring to that bfqq.
7319 */
7320static struct bfq_queue *
7321bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
7322{
7323 bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
7324
7325 if (bfqq_process_refs(bfqq) == 1) {
7326 bfqq->pid = current->pid;
7327 bfq_clear_bfqq_coop(bfqq);
7328 bfq_clear_bfqq_split_coop(bfqq);
7329 return bfqq;
7330 }
7331
7332 bic_set_bfqq(bic, NULL, 1);
7333
7334 bfq_put_cooperator(bfqq);
7335
7336 bfq_put_queue(bfqq);
7337 return NULL;
7338}
7339
7340static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
7341 struct bfq_io_cq *bic,
7342 struct bio *bio,
7343 bool split, bool is_sync,
7344 bool *new_queue)
7345{
7346 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
7347
7348 if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
7349 return bfqq;
7350
7351 if (new_queue)
7352 *new_queue = true;
7353
7354 if (bfqq)
7355 bfq_put_queue(bfqq);
7356 bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
7357
7358 bic_set_bfqq(bic, bfqq, is_sync);
7359 if (split && is_sync)
7360 bfqq->split_time = jiffies;
7361
7362 return bfqq;
7363}
7364
7365/*
6603 * Allocate bfq data structures associated with this request. 7366 * Allocate bfq data structures associated with this request.
6604 */ 7367 */
6605static int bfq_get_rq_private(struct request_queue *q, struct request *rq, 7368static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
@@ -6609,6 +7372,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
6609 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); 7372 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
6610 const int is_sync = rq_is_sync(rq); 7373 const int is_sync = rq_is_sync(rq);
6611 struct bfq_queue *bfqq; 7374 struct bfq_queue *bfqq;
7375 bool new_queue = false;
6612 7376
6613 spin_lock_irq(&bfqd->lock); 7377 spin_lock_irq(&bfqd->lock);
6614 7378
@@ -6619,12 +7383,28 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
6619 7383
6620 bfq_bic_update_cgroup(bic, bio); 7384 bfq_bic_update_cgroup(bic, bio);
6621 7385
6622 bfqq = bic_to_bfqq(bic, is_sync); 7386 bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
6623 if (!bfqq || bfqq == &bfqd->oom_bfqq) { 7387 &new_queue);
6624 if (bfqq) 7388
6625 bfq_put_queue(bfqq); 7389 if (likely(!new_queue)) {
6626 bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); 7390 /* If the queue was seeky for too long, break it apart. */
6627 bic_set_bfqq(bic, bfqq, is_sync); 7391 if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
7392 bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
7393 bfqq = bfq_split_bfqq(bic, bfqq);
7394 /*
7395 * A reference to bic->icq.ioc needs to be
7396 * released after a queue split. Do not do it
7397	 * immediately, so as not to risk taking
7398 * an ioc->lock while holding the scheduler
7399 * lock.
7400 */
7401 bfqd->ioc_to_put = bic->icq.ioc;
7402
7403 if (!bfqq)
7404 bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
7405 true, is_sync,
7406 NULL);
7407 }
6628 } 7408 }
6629 7409
6630 bfqq->allocated++; 7410 bfqq->allocated++;
@@ -6635,7 +7415,25 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
6635 rq->elv.priv[0] = bic; 7415 rq->elv.priv[0] = bic;
6636 rq->elv.priv[1] = bfqq; 7416 rq->elv.priv[1] = bfqq;
6637 7417
6638 spin_unlock_irq(&bfqd->lock); 7418 /*
7419 * If a bfq_queue has only one process reference, it is owned
7420	 * by only this bic: we can then set bfqq->bic = bic. In
7421 * addition, if the queue has also just been split, we have to
7422 * resume its state.
7423 */
7424 if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
7425 bfqq->bic = bic;
7426 if (bfqd->ioc_to_put) { /* if true, there has been a split */
7427 /*
7428 * The queue has just been split from a shared
7429 * queue: restore the idle window and the
7430 * possible weight raising period.
7431 */
7432 bfq_bfqq_resume_state(bfqq, bic);
7433 }
7434 }
7435
7436 bfq_unlock_put_ioc(bfqd);
6639 7437
6640 return 0; 7438 return 0;
6641 7439
@@ -6680,7 +7478,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
6680 bfq_bfqq_expire(bfqd, bfqq, true, reason); 7478 bfq_bfqq_expire(bfqd, bfqq, true, reason);
6681 7479
6682schedule_dispatch: 7480schedule_dispatch:
6683 spin_unlock_irqrestore(&bfqd->lock, flags); 7481 bfq_unlock_put_ioc_restore(bfqd, flags);
6684 bfq_schedule_dispatch(bfqd); 7482 bfq_schedule_dispatch(bfqd);
6685} 7483}
6686 7484
@@ -6777,6 +7575,7 @@ static void bfq_init_root_group(struct bfq_group *root_group,
6777 root_group->my_entity = NULL; 7575 root_group->my_entity = NULL;
6778 root_group->bfqd = bfqd; 7576 root_group->bfqd = bfqd;
6779#endif 7577#endif
7578 root_group->rq_pos_tree = RB_ROOT;
6780 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) 7579 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6781 root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; 7580 root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6782 root_group->sched_data.bfq_class_idle_last_service = jiffies; 7581 root_group->sched_data.bfq_class_idle_last_service = jiffies;