author    Arianna Avanzini <avanzini.arianna@gmail.com>    2017-04-12 12:23:08 -0400
committer Jens Axboe <axboe@fb.com>    2017-04-19 10:30:26 -0400
commit    e21b7a0b988772e82e7147e1c659a5afe2ae003c (patch)
tree      a3958b4ce872269fcbdabf4f20c40b1c7c181de5 /block/bfq-iosched.c
parent    aee69d78dec0ffdf82e35d57c626e80dddc314d5 (diff)
block, bfq: add full hierarchical scheduling and cgroups support
Add complete support for full hierarchical scheduling, with a cgroups interface. Full hierarchical scheduling is implemented through the 'entity' abstraction: both bfq_queues, i.e., the internal BFQ queues associated with processes, and groups are represented in general by entities.

Given the bfq_queues associated with the processes belonging to a given group, the entities representing these queues are sons of the entity representing the group. At higher levels, if a group, say G, contains other groups, then the entity representing G is the parent entity of the entities representing the groups in G.

Hierarchical scheduling is performed as follows: if the timestamps of a leaf entity (i.e., of a bfq_queue) change, and such a change lets the entity become the next-to-serve entity for its parent entity, then the timestamps of the parent entity are recomputed as a function of the budget of its new next-to-serve leaf entity. If the parent entity belongs, in its turn, to a group, and its new timestamps let it become the next-to-serve for its parent entity, then the timestamps of the latter parent entity are recomputed as well, and so on. When a new bfq_queue must be set in service, the reverse path is followed: the next-to-serve highest-level entity is chosen, then its next-to-serve child entity, and so on, until the next-to-serve leaf entity is reached, and the bfq_queue that this entity represents is set in service.

Writeback is accounted for on a per-group basis, i.e., for each group, the async I/O requests of the processes of the group are enqueued in a distinct bfq_queue, and the entity associated with this queue is a child of the entity associated with the group.

Weights can be assigned explicitly to groups and processes through the cgroups interface, differently from what happens, for single processes, if the cgroups interface is not used (as explained in the description of the previous patch). In particular, since each node has a full scheduler, each group can be assigned its own weight.

Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
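As a rough, stand-alone illustration of the scheme described above (not the kernel code: toy_entity, toy_update_parent, toy_propagate_activation and toy_pick_leaf are invented names for this sketch, and the real implementation additionally tracks per-class service trees, eligibility and virtual times), the following C snippet models the two directions of the hierarchy walk: upward propagation when a leaf's timestamps change, and downward descent when a new queue must be set in service.

#include <stdbool.h>

/* Toy model of the entity hierarchy: leaves stand for bfq_queues,
 * inner nodes for groups; parent is NULL for the root group. */
struct toy_entity {
	unsigned long long start, finish;	/* virtual timestamps */
	int budget;
	struct toy_entity *parent;
	struct toy_entity *next_in_service;	/* child to serve next (inner nodes only) */
};

/* If child becomes the next-to-serve child of parent, recompute the
 * parent's timestamps from the child's budget and report the change. */
static bool toy_update_parent(struct toy_entity *parent, struct toy_entity *child)
{
	if (parent->next_in_service == child)
		return false;

	parent->next_in_service = child;
	parent->budget = child->budget;
	parent->finish = parent->start + parent->budget;
	return true;
}

/* Upward pass after a leaf's timestamps change: stop at the first
 * level whose next-to-serve entity does not change. */
static void toy_propagate_activation(struct toy_entity *leaf)
{
	struct toy_entity *e;

	for (e = leaf; e->parent; e = e->parent)
		if (!toy_update_parent(e->parent, e))
			break;
}

/* Reverse path when a new queue must be set in service: descend
 * through the cached next-to-serve children down to a leaf. */
static struct toy_entity *toy_pick_leaf(struct toy_entity *root)
{
	struct toy_entity *e = root;

	while (e->next_in_service)
		e = e->next_in_service;
	return e;
}

In the patch itself, this caching of the next-to-serve child is what bfq_update_next_in_service() maintains, so that a hierarchical-schedule update rarely needs a full lookup in the active trees.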
Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--  block/bfq-iosched.c | 2424
1 file changed, 2119 insertions, 305 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index c4e7d8db796a..af1740a1d453 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -90,6 +90,7 @@
90#include <linux/module.h> 90#include <linux/module.h>
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/blkdev.h> 92#include <linux/blkdev.h>
93#include <linux/cgroup.h>
93#include <linux/elevator.h> 94#include <linux/elevator.h>
94#include <linux/ktime.h> 95#include <linux/ktime.h>
95#include <linux/rbtree.h> 96#include <linux/rbtree.h>
@@ -114,7 +115,7 @@
114 115
115#define BFQ_DEFAULT_QUEUE_IOPRIO 4 116#define BFQ_DEFAULT_QUEUE_IOPRIO 4
116 117
117#define BFQ_DEFAULT_GRP_WEIGHT 10 118#define BFQ_WEIGHT_LEGACY_DFL 100
118#define BFQ_DEFAULT_GRP_IOPRIO 0 119#define BFQ_DEFAULT_GRP_IOPRIO 0
119#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE 120#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
120 121
@@ -149,10 +150,11 @@ struct bfq_service_tree {
149 * struct bfq_sched_data - multi-class scheduler. 150 * struct bfq_sched_data - multi-class scheduler.
150 * 151 *
151 * bfq_sched_data is the basic scheduler queue. It supports three 152 * bfq_sched_data is the basic scheduler queue. It supports three
152 * ioprio_classes, and can be used either as a toplevel queue or as 153 * ioprio_classes, and can be used either as a toplevel queue or as an
153 * an intermediate queue on a hierarchical setup. 154 * intermediate queue on a hierarchical setup. @next_in_service
154 * @next_in_service points to the active entity of the sched_data 155 * points to the active entity of the sched_data service trees that
155 * service trees that will be scheduled next. 156 * will be scheduled next. It is used to reduce the number of steps
157 * needed for each hierarchical-schedule update.
156 * 158 *
157 * The supported ioprio_classes are the same as in CFQ, in descending 159 * The supported ioprio_classes are the same as in CFQ, in descending
158 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. 160 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
@@ -164,19 +166,23 @@ struct bfq_service_tree {
164struct bfq_sched_data { 166struct bfq_sched_data {
165 /* entity in service */ 167 /* entity in service */
166 struct bfq_entity *in_service_entity; 168 struct bfq_entity *in_service_entity;
167 /* head-of-the-line entity in the scheduler */ 169 /* head-of-line entity (see comments above) */
168 struct bfq_entity *next_in_service; 170 struct bfq_entity *next_in_service;
169 /* array of service trees, one per ioprio_class */ 171 /* array of service trees, one per ioprio_class */
170 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; 172 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
173 /* last time CLASS_IDLE was served */
174 unsigned long bfq_class_idle_last_service;
175
171}; 176};
172 177
173/** 178/**
174 * struct bfq_entity - schedulable entity. 179 * struct bfq_entity - schedulable entity.
175 * 180 *
176 * A bfq_entity is used to represent a bfq_queue (leaf node in the upper 181 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
177 * level scheduler). Each entity belongs to the sched_data of the parent 182 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
178 * group hierarchy. Non-leaf entities have also their own sched_data, 183 * entity belongs to the sched_data of the parent group in the cgroup
179 * stored in @my_sched_data. 184 * hierarchy. Non-leaf entities have also their own sched_data, stored
185 * in @my_sched_data.
180 * 186 *
181 * Each entity stores independently its priority values; this would 187 * Each entity stores independently its priority values; this would
182 * allow different weights on different devices, but this 188 * allow different weights on different devices, but this
@@ -187,23 +193,24 @@ struct bfq_sched_data {
187 * update to take place the effective and the requested priority 193 * update to take place the effective and the requested priority
188 * values are synchronized. 194 * values are synchronized.
189 * 195 *
190 * The weight value is calculated from the ioprio to export the same 196 * Unless cgroups are used, the weight value is calculated from the
191 * interface as CFQ. When dealing with ``well-behaved'' queues (i.e., 197 * ioprio to export the same interface as CFQ. When dealing with
192 * queues that do not spend too much time to consume their budget 198 * ``well-behaved'' queues (i.e., queues that do not spend too much
193 * and have true sequential behavior, and when there are no external 199 * time to consume their budget and have true sequential behavior, and
194 * factors breaking anticipation) the relative weights at each level 200 * when there are no external factors breaking anticipation) the
195 * of the hierarchy should be guaranteed. All the fields are 201 * relative weights at each level of the cgroups hierarchy should be
196 * protected by the queue lock of the containing bfqd. 202 * guaranteed. All the fields are protected by the queue lock of the
203 * containing bfqd.
197 */ 204 */
198struct bfq_entity { 205struct bfq_entity {
199 /* service_tree member */ 206 /* service_tree member */
200 struct rb_node rb_node; 207 struct rb_node rb_node;
201 208
202 /* 209 /*
203 * flag, true if the entity is on a tree (either the active or 210 * Flag, true if the entity is on a tree (either the active or
204 * the idle one of its service_tree). 211 * the idle one of its service_tree) or is in service.
205 */ 212 */
206 int on_st; 213 bool on_st;
207 214
208 /* B-WF2Q+ start and finish timestamps [sectors/weight] */ 215 /* B-WF2Q+ start and finish timestamps [sectors/weight] */
209 u64 start, finish; 216 u64 start, finish;
@@ -246,6 +253,8 @@ struct bfq_entity {
246 int prio_changed; 253 int prio_changed;
247}; 254};
248 255
256struct bfq_group;
257
249/** 258/**
250 * struct bfq_ttime - per process thinktime stats. 259 * struct bfq_ttime - per process thinktime stats.
251 */ 260 */
@@ -265,7 +274,11 @@ struct bfq_ttime {
265 * struct bfq_queue - leaf schedulable entity. 274 * struct bfq_queue - leaf schedulable entity.
266 * 275 *
267 * A bfq_queue is a leaf request queue; it can be associated with an 276 * A bfq_queue is a leaf request queue; it can be associated with an
268 * io_context or more, if it is async. 277 * io_context or more, if it is async. @cgroup holds a reference to
278 * the cgroup, to be sure that it does not disappear while a bfqq
279 * still references it (mostly to avoid races between request issuing
280 * and task migration followed by cgroup destruction). All the fields
281 * are protected by the queue lock of the containing bfqd.
269 */ 282 */
270struct bfq_queue { 283struct bfq_queue {
271 /* reference counter */ 284 /* reference counter */
@@ -338,6 +351,9 @@ struct bfq_io_cq {
338 struct bfq_queue *bfqq[2]; 351 struct bfq_queue *bfqq[2];
339 /* per (request_queue, blkcg) ioprio */ 352 /* per (request_queue, blkcg) ioprio */
340 int ioprio; 353 int ioprio;
354#ifdef CONFIG_BFQ_GROUP_IOSCHED
355 uint64_t blkcg_serial_nr; /* the current blkcg serial */
356#endif
341}; 357};
342 358
343/** 359/**
@@ -351,8 +367,8 @@ struct bfq_data {
351 /* dispatch queue */ 367 /* dispatch queue */
352 struct list_head dispatch; 368 struct list_head dispatch;
353 369
354 /* root @bfq_sched_data for the device */ 370 /* root bfq_group for the device */
355 struct bfq_sched_data sched_data; 371 struct bfq_group *root_group;
356 372
357 /* 373 /*
358 * Number of bfq_queues containing requests (including the 374 * Number of bfq_queues containing requests (including the
@@ -423,8 +439,6 @@ struct bfq_data {
423 unsigned int bfq_back_max; 439 unsigned int bfq_back_max;
424 /* maximum idling time */ 440 /* maximum idling time */
425 u32 bfq_slice_idle; 441 u32 bfq_slice_idle;
426 /* last time CLASS_IDLE was served */
427 u64 bfq_class_idle_last_service;
428 442
429 /* user-configured max budget value (0 for auto-tuning) */ 443 /* user-configured max budget value (0 for auto-tuning) */
430 int bfq_user_max_budget; 444 int bfq_user_max_budget;
@@ -516,8 +530,35 @@ BFQ_BFQQ_FNS(IO_bound);
516#undef BFQ_BFQQ_FNS 530#undef BFQ_BFQQ_FNS
517 531
518/* Logging facilities. */ 532/* Logging facilities. */
519#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ 533#ifdef CONFIG_BFQ_GROUP_IOSCHED
520 blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) 534static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
535static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
536
537#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
538 char __pbuf[128]; \
539 \
540 blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
541 blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
542 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
543 __pbuf, ##args); \
544} while (0)
545
546#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
547 char __pbuf[128]; \
548 \
549 blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
550 blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
551} while (0)
552
553#else /* CONFIG_BFQ_GROUP_IOSCHED */
554
555#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
556 blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
557 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
558 ##args)
559#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
560
561#endif /* CONFIG_BFQ_GROUP_IOSCHED */
521 562
522#define bfq_log(bfqd, fmt, args...) \ 563#define bfq_log(bfqd, fmt, args...) \
523 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) 564 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
@@ -534,15 +575,120 @@ enum bfqq_expiration {
534 BFQQE_PREEMPTED /* preemption in progress */ 575 BFQQE_PREEMPTED /* preemption in progress */
535}; 576};
536 577
578struct bfqg_stats {
579#ifdef CONFIG_BFQ_GROUP_IOSCHED
580 /* number of ios merged */
581 struct blkg_rwstat merged;
582 /* total time spent on device in ns, may not be accurate w/ queueing */
583 struct blkg_rwstat service_time;
584 /* total time spent waiting in scheduler queue in ns */
585 struct blkg_rwstat wait_time;
586 /* number of IOs queued up */
587 struct blkg_rwstat queued;
588 /* total disk time and nr sectors dispatched by this group */
589 struct blkg_stat time;
590 /* sum of number of ios queued across all samples */
591 struct blkg_stat avg_queue_size_sum;
592 /* count of samples taken for average */
593 struct blkg_stat avg_queue_size_samples;
594 /* how many times this group has been removed from service tree */
595 struct blkg_stat dequeue;
596 /* total time spent waiting for it to be assigned a timeslice. */
597 struct blkg_stat group_wait_time;
598 /* time spent idling for this blkcg_gq */
599 struct blkg_stat idle_time;
600 /* total time with empty current active q with other requests queued */
601 struct blkg_stat empty_time;
602 /* fields after this shouldn't be cleared on stat reset */
603 uint64_t start_group_wait_time;
604 uint64_t start_idle_time;
605 uint64_t start_empty_time;
606 uint16_t flags;
607#endif /* CONFIG_BFQ_GROUP_IOSCHED */
608};
609
610#ifdef CONFIG_BFQ_GROUP_IOSCHED
611
612/*
613 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
614 *
615 * @ps: @blkcg_policy_storage that this structure inherits
616 * @weight: weight of the bfq_group
617 */
618struct bfq_group_data {
619 /* must be the first member */
620 struct blkcg_policy_data pd;
621
622 unsigned short weight;
623};
624
625/**
626 * struct bfq_group - per (device, cgroup) data structure.
627 * @entity: schedulable entity to insert into the parent group sched_data.
628 * @sched_data: own sched_data, to contain child entities (they may be
629 * both bfq_queues and bfq_groups).
630 * @bfqd: the bfq_data for the device this group acts upon.
631 * @async_bfqq: array of async queues for all the tasks belonging to
632 * the group, one queue per ioprio value per ioprio_class,
633 * except for the idle class that has only one queue.
634 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
635 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
636 * to avoid too many special cases during group creation/
637 * migration.
638 * @stats: stats for this bfqg.
639 *
640 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
641 * there is a set of bfq_groups, each one collecting the lower-level
642 * entities belonging to the group that are acting on the same device.
643 *
644 * Locking works as follows:
645 * o @bfqd is protected by the queue lock, RCU is used to access it
646 * from the readers.
647 * o All the other fields are protected by the @bfqd queue lock.
648 */
649struct bfq_group {
650 /* must be the first member */
651 struct blkg_policy_data pd;
652
653 struct bfq_entity entity;
654 struct bfq_sched_data sched_data;
655
656 void *bfqd;
657
658 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
659 struct bfq_queue *async_idle_bfqq;
660
661 struct bfq_entity *my_entity;
662
663 struct bfqg_stats stats;
664};
665
666#else
667struct bfq_group {
668 struct bfq_sched_data sched_data;
669
670 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
671 struct bfq_queue *async_idle_bfqq;
672
673 struct rb_root rq_pos_tree;
674};
675#endif
676
537static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); 677static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
538 678
679static unsigned int bfq_class_idx(struct bfq_entity *entity)
680{
681 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
682
683 return bfqq ? bfqq->ioprio_class - 1 :
684 BFQ_DEFAULT_GRP_CLASS - 1;
685}
686
539static struct bfq_service_tree * 687static struct bfq_service_tree *
540bfq_entity_service_tree(struct bfq_entity *entity) 688bfq_entity_service_tree(struct bfq_entity *entity)
541{ 689{
542 struct bfq_sched_data *sched_data = entity->sched_data; 690 struct bfq_sched_data *sched_data = entity->sched_data;
543 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 691 unsigned int idx = bfq_class_idx(entity);
544 unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
545 BFQ_DEFAULT_GRP_CLASS - 1;
546 692
547 return sched_data->service_tree + idx; 693 return sched_data->service_tree + idx;
548} 694}
@@ -568,16 +714,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq);
568static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, 714static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
569 struct bio *bio, bool is_sync, 715 struct bio *bio, bool is_sync,
570 struct bfq_io_cq *bic); 716 struct bfq_io_cq *bic);
717static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
571static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); 718static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
572 719
573/*
574 * Array of async queues for all the processes, one queue
575 * per ioprio value per ioprio_class.
576 */
577struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
578/* Async queue for the idle class (ioprio is ignored) */
579struct bfq_queue *async_idle_bfqq;
580
581/* Expiration time of sync (0) and async (1) requests, in ns. */ 720/* Expiration time of sync (0) and async (1) requests, in ns. */
582static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; 721static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
583 722
@@ -663,30 +802,222 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
663} 802}
664 803
665/* 804/*
666 * Next two macros are just fake loops for the moment. They will 805 * Scheduler run of queue, if there are requests pending and no one in the
667 * become true loops in the cgroups-enabled variant of the code. Such 806 * driver that will restart queueing.
668 * a variant, in its turn, will be introduced by next commit. 807 */
808static void bfq_schedule_dispatch(struct bfq_data *bfqd)
809{
810 if (bfqd->queued != 0) {
811 bfq_log(bfqd, "schedule dispatch");
812 blk_mq_run_hw_queues(bfqd->queue, true);
813 }
814}
815
816/**
817 * bfq_gt - compare two timestamps.
818 * @a: first ts.
819 * @b: second ts.
820 *
821 * Return @a > @b, dealing with wrapping correctly.
822 */
823static int bfq_gt(u64 a, u64 b)
824{
825 return (s64)(a - b) > 0;
826}
827
828static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
829{
830 struct rb_node *node = tree->rb_node;
831
832 return rb_entry(node, struct bfq_entity, rb_node);
833}
834
835static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
836
837static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
838
839/**
840 * bfq_update_next_in_service - update sd->next_in_service
841 * @sd: sched_data for which to perform the update.
842 * @new_entity: if not NULL, pointer to the entity whose activation,
843 * requeueing or repositioning triggered the invocation of
844 * this function.
845 *
846 * This function is called to update sd->next_in_service, which, in
847 * its turn, may change as a consequence of the insertion or
848 * extraction of an entity into/from one of the active trees of
849 * sd. These insertions/extractions occur as a consequence of
850 * activations/deactivations of entities, with some activations being
851 * 'true' activations, and other activations being requeueings (i.e.,
852 * implementing the second, requeueing phase of the mechanism used to
853 * reposition an entity in its active tree; see comments on
854 * __bfq_activate_entity and __bfq_requeue_entity for details). In
855 * both the last two activation sub-cases, new_entity points to the
856 * just activated or requeued entity.
857 *
858 * Returns true if sd->next_in_service changes in such a way that
859 * entity->parent may become the next_in_service for its parent
860 * entity.
669 */ 861 */
862static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
863 struct bfq_entity *new_entity)
864{
865 struct bfq_entity *next_in_service = sd->next_in_service;
866 bool parent_sched_may_change = false;
867
868 /*
869 * If this update is triggered by the activation, requeueing
870 * or repositioning of an entity that does not coincide with
871 * sd->next_in_service, then a full lookup in the active tree
872 * can be avoided. In fact, it is enough to check whether the
873 * just-modified entity has a higher priority than
874 * sd->next_in_service, or, even if it has the same priority
875 * as sd->next_in_service, is eligible and has a lower virtual
876 * finish time than sd->next_in_service. If this compound
877 * condition holds, then the new entity becomes the new
878 * next_in_service. Otherwise no change is needed.
879 */
880 if (new_entity && new_entity != sd->next_in_service) {
881 /*
882 * Flag used to decide whether to replace
883 * sd->next_in_service with new_entity. Tentatively
884 * set to true, and left as true if
885 * sd->next_in_service is NULL.
886 */
887 bool replace_next = true;
888
889 /*
890 * If there is already a next_in_service candidate
891 * entity, then compare class priorities or timestamps
892 * to decide whether to replace sd->service_tree with
893 * new_entity.
894 */
895 if (next_in_service) {
896 unsigned int new_entity_class_idx =
897 bfq_class_idx(new_entity);
898 struct bfq_service_tree *st =
899 sd->service_tree + new_entity_class_idx;
900
901 /*
902 * For efficiency, evaluate the most likely
903 * sub-condition first.
904 */
905 replace_next =
906 (new_entity_class_idx ==
907 bfq_class_idx(next_in_service)
908 &&
909 !bfq_gt(new_entity->start, st->vtime)
910 &&
911 bfq_gt(next_in_service->finish,
912 new_entity->finish))
913 ||
914 new_entity_class_idx <
915 bfq_class_idx(next_in_service);
916 }
917
918 if (replace_next)
919 next_in_service = new_entity;
920 } else /* invoked because of a deactivation: lookup needed */
921 next_in_service = bfq_lookup_next_entity(sd);
922
923 if (next_in_service) {
924 parent_sched_may_change = !sd->next_in_service ||
925 bfq_update_parent_budget(next_in_service);
926 }
927
928 sd->next_in_service = next_in_service;
929
930 if (!next_in_service)
931 return parent_sched_may_change;
932
933 return parent_sched_may_change;
934}
935
936#ifdef CONFIG_BFQ_GROUP_IOSCHED
937/* both next loops stop at one of the child entities of the root group */
670#define for_each_entity(entity) \ 938#define for_each_entity(entity) \
671 for (; entity ; entity = NULL) 939 for (; entity ; entity = entity->parent)
672 940
941/*
942 * For each iteration, compute parent in advance, so as to be safe if
943 * entity is deallocated during the iteration. Such a deallocation may
944 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
945 * containing entity.
946 */
673#define for_each_entity_safe(entity, parent) \ 947#define for_each_entity_safe(entity, parent) \
674 for (parent = NULL; entity ; entity = parent) 948 for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
675 949
676static int bfq_update_next_in_service(struct bfq_sched_data *sd) 950/*
951 * Returns true if this budget changes may let next_in_service->parent
952 * become the next_in_service entity for its parent entity.
953 */
954static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
677{ 955{
678 return 0; 956 struct bfq_entity *bfqg_entity;
957 struct bfq_group *bfqg;
958 struct bfq_sched_data *group_sd;
959 bool ret = false;
960
961 group_sd = next_in_service->sched_data;
962
963 bfqg = container_of(group_sd, struct bfq_group, sched_data);
964 /*
965 * bfq_group's my_entity field is not NULL only if the group
966 * is not the root group. We must not touch the root entity
967 * as it must never become an in-service entity.
968 */
969 bfqg_entity = bfqg->my_entity;
970 if (bfqg_entity) {
971 if (bfqg_entity->budget > next_in_service->budget)
972 ret = true;
973 bfqg_entity->budget = next_in_service->budget;
974 }
975
976 return ret;
977}
978
979/*
980 * This function tells whether entity stops being a candidate for next
981 * service, according to the following logic.
982 *
983 * This function is invoked for an entity that is about to be set in
984 * service. If such an entity is a queue, then the entity is no longer
985 * a candidate for next service (i.e., a candidate entity to serve
986 * after the in-service entity is expired). The function then returns
987 * true.
988 */
989static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
990{
991 if (bfq_entity_to_bfqq(entity))
992 return true;
993
994 return false;
679} 995}
680 996
681static void bfq_check_next_in_service(struct bfq_sched_data *sd, 997#else /* CONFIG_BFQ_GROUP_IOSCHED */
682 struct bfq_entity *entity) 998/*
999 * Next two macros are fake loops when cgroups support is not
1000 * enabled. In fact, in such a case, there is only one level to go up
1001 * (to reach the root group).
1002 */
1003#define for_each_entity(entity) \
1004 for (; entity ; entity = NULL)
1005
1006#define for_each_entity_safe(entity, parent) \
1007 for (parent = NULL; entity ; entity = parent)
1008
1009static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
683{ 1010{
1011 return false;
684} 1012}
685 1013
686static void bfq_update_budget(struct bfq_entity *next_in_service) 1014static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
687{ 1015{
1016 return true;
688} 1017}
689 1018
1019#endif /* CONFIG_BFQ_GROUP_IOSCHED */
1020
690/* 1021/*
691 * Shift for timestamp calculations. This actually limits the maximum 1022 * Shift for timestamp calculations. This actually limits the maximum
692 * service allowed in one timestamp delta (small shift values increase it), 1023 * service allowed in one timestamp delta (small shift values increase it),
@@ -696,18 +1027,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)
696 */ 1027 */
697#define WFQ_SERVICE_SHIFT 22 1028#define WFQ_SERVICE_SHIFT 22
698 1029
699/**
700 * bfq_gt - compare two timestamps.
701 * @a: first ts.
702 * @b: second ts.
703 *
704 * Return @a > @b, dealing with wrapping correctly.
705 */
706static int bfq_gt(u64 a, u64 b)
707{
708 return (s64)(a - b) > 0;
709}
710
711static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) 1030static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
712{ 1031{
713 struct bfq_queue *bfqq = NULL; 1032 struct bfq_queue *bfqq = NULL;
@@ -926,6 +1245,11 @@ static void bfq_active_insert(struct bfq_service_tree *st,
926{ 1245{
927 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 1246 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
928 struct rb_node *node = &entity->rb_node; 1247 struct rb_node *node = &entity->rb_node;
1248#ifdef CONFIG_BFQ_GROUP_IOSCHED
1249 struct bfq_sched_data *sd = NULL;
1250 struct bfq_group *bfqg = NULL;
1251 struct bfq_data *bfqd = NULL;
1252#endif
929 1253
930 bfq_insert(&st->active, entity); 1254 bfq_insert(&st->active, entity);
931 1255
@@ -936,6 +1260,11 @@ static void bfq_active_insert(struct bfq_service_tree *st,
936 1260
937 bfq_update_active_tree(node); 1261 bfq_update_active_tree(node);
938 1262
1263#ifdef CONFIG_BFQ_GROUP_IOSCHED
1264 sd = entity->sched_data;
1265 bfqg = container_of(sd, struct bfq_group, sched_data);
1266 bfqd = (struct bfq_data *)bfqg->bfqd;
1267#endif
939 if (bfqq) 1268 if (bfqq)
940 list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); 1269 list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
941} 1270}
@@ -1014,6 +1343,11 @@ static void bfq_active_extract(struct bfq_service_tree *st,
1014{ 1343{
1015 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 1344 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1016 struct rb_node *node; 1345 struct rb_node *node;
1346#ifdef CONFIG_BFQ_GROUP_IOSCHED
1347 struct bfq_sched_data *sd = NULL;
1348 struct bfq_group *bfqg = NULL;
1349 struct bfq_data *bfqd = NULL;
1350#endif
1017 1351
1018 node = bfq_find_deepest(&entity->rb_node); 1352 node = bfq_find_deepest(&entity->rb_node);
1019 bfq_extract(&st->active, entity); 1353 bfq_extract(&st->active, entity);
@@ -1021,6 +1355,11 @@ static void bfq_active_extract(struct bfq_service_tree *st,
1021 if (node) 1355 if (node)
1022 bfq_update_active_tree(node); 1356 bfq_update_active_tree(node);
1023 1357
1358#ifdef CONFIG_BFQ_GROUP_IOSCHED
1359 sd = entity->sched_data;
1360 bfqg = container_of(sd, struct bfq_group, sched_data);
1361 bfqd = (struct bfq_data *)bfqg->bfqd;
1362#endif
1024 if (bfqq) 1363 if (bfqq)
1025 list_del(&bfqq->bfqq_list); 1364 list_del(&bfqq->bfqq_list);
1026} 1365}
@@ -1069,7 +1408,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
1069{ 1408{
1070 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 1409 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1071 1410
1072 entity->on_st = 0; 1411 entity->on_st = false;
1073 st->wsum -= entity->weight; 1412 st->wsum -= entity->weight;
1074 if (bfqq && !is_in_service) 1413 if (bfqq && !is_in_service)
1075 bfq_put_queue(bfqq); 1414 bfq_put_queue(bfqq);
@@ -1115,7 +1454,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st)
1115 1454
1116static struct bfq_service_tree * 1455static struct bfq_service_tree *
1117__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, 1456__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1118 struct bfq_entity *entity) 1457 struct bfq_entity *entity)
1119{ 1458{
1120 struct bfq_service_tree *new_st = old_st; 1459 struct bfq_service_tree *new_st = old_st;
1121 1460
@@ -1123,9 +1462,20 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1123 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 1462 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1124 unsigned short prev_weight, new_weight; 1463 unsigned short prev_weight, new_weight;
1125 struct bfq_data *bfqd = NULL; 1464 struct bfq_data *bfqd = NULL;
1465#ifdef CONFIG_BFQ_GROUP_IOSCHED
1466 struct bfq_sched_data *sd;
1467 struct bfq_group *bfqg;
1468#endif
1126 1469
1127 if (bfqq) 1470 if (bfqq)
1128 bfqd = bfqq->bfqd; 1471 bfqd = bfqq->bfqd;
1472#ifdef CONFIG_BFQ_GROUP_IOSCHED
1473 else {
1474 sd = entity->my_sched_data;
1475 bfqg = container_of(sd, struct bfq_group, sched_data);
1476 bfqd = (struct bfq_data *)bfqg->bfqd;
1477 }
1478#endif
1129 1479
1130 old_st->wsum -= entity->weight; 1480 old_st->wsum -= entity->weight;
1131 1481
@@ -1171,6 +1521,9 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1171 return new_st; 1521 return new_st;
1172} 1522}
1173 1523
1524static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
1525static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
1526
1174/** 1527/**
1175 * bfq_bfqq_served - update the scheduler status after selection for 1528 * bfq_bfqq_served - update the scheduler status after selection for
1176 * service. 1529 * service.
@@ -1194,6 +1547,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
1194 st->vtime += bfq_delta(served, st->wsum); 1547 st->vtime += bfq_delta(served, st->wsum);
1195 bfq_forget_idle(st); 1548 bfq_forget_idle(st);
1196 } 1549 }
1550 bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
1197 bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); 1551 bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
1198} 1552}
1199 1553
@@ -1216,78 +1570,10 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
1216 bfq_bfqq_served(bfqq, entity->budget - entity->service); 1570 bfq_bfqq_served(bfqq, entity->budget - entity->service);
1217} 1571}
1218 1572
1219/** 1573static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
1220 * __bfq_activate_entity - activate an entity. 1574 struct bfq_service_tree *st,
1221 * @entity: the entity being activated. 1575 bool backshifted)
1222 * @non_blocking_wait_rq: true if this entity was waiting for a request
1223 *
1224 * Called whenever an entity is activated, i.e., it is not active and one
1225 * of its children receives a new request, or has to be reactivated due to
1226 * budget exhaustion. It uses the current budget of the entity (and the
1227 * service received if @entity is active) of the queue to calculate its
1228 * timestamps.
1229 */
1230static void __bfq_activate_entity(struct bfq_entity *entity,
1231 bool non_blocking_wait_rq)
1232{ 1576{
1233 struct bfq_sched_data *sd = entity->sched_data;
1234 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1235 bool backshifted = false;
1236
1237 if (entity == sd->in_service_entity) {
1238 /*
1239 * If we are requeueing the current entity we have
1240 * to take care of not charging to it service it has
1241 * not received.
1242 */
1243 bfq_calc_finish(entity, entity->service);
1244 entity->start = entity->finish;
1245 sd->in_service_entity = NULL;
1246 } else if (entity->tree == &st->active) {
1247 /*
1248 * Requeueing an entity due to a change of some
1249 * next_in_service entity below it. We reuse the
1250 * old start time.
1251 */
1252 bfq_active_extract(st, entity);
1253 } else {
1254 unsigned long long min_vstart;
1255
1256 /* See comments on bfq_fqq_update_budg_for_activation */
1257 if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
1258 backshifted = true;
1259 min_vstart = entity->finish;
1260 } else
1261 min_vstart = st->vtime;
1262
1263 if (entity->tree == &st->idle) {
1264 /*
1265 * Must be on the idle tree, bfq_idle_extract() will
1266 * check for that.
1267 */
1268 bfq_idle_extract(st, entity);
1269 entity->start = bfq_gt(min_vstart, entity->finish) ?
1270 min_vstart : entity->finish;
1271 } else {
1272 /*
1273 * The finish time of the entity may be invalid, and
1274 * it is in the past for sure, otherwise the queue
1275 * would have been on the idle tree.
1276 */
1277 entity->start = min_vstart;
1278 st->wsum += entity->weight;
1279 /*
1280 * entity is about to be inserted into a service tree,
1281 * and then set in service: get a reference to make
1282 * sure entity does not disappear until it is no
1283 * longer in service or scheduled for service.
1284 */
1285 bfq_get_entity(entity);
1286
1287 entity->on_st = 1;
1288 }
1289 }
1290
1291 st = __bfq_entity_update_weight_prio(st, entity); 1577 st = __bfq_entity_update_weight_prio(st, entity);
1292 bfq_calc_finish(entity, entity->budget); 1578 bfq_calc_finish(entity, entity->budget);
1293 1579
@@ -1329,27 +1615,185 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
1329} 1615}
1330 1616
1331/** 1617/**
1332 * bfq_activate_entity - activate an entity and its ancestors if necessary. 1618 * __bfq_activate_entity - handle activation of entity.
1619 * @entity: the entity being activated.
1620 * @non_blocking_wait_rq: true if entity was waiting for a request
1621 *
1622 * Called for a 'true' activation, i.e., if entity is not active and
1623 * one of its children receives a new request.
1624 *
1625 * Basically, this function updates the timestamps of entity and
1626 * inserts entity into its active tree, after possibly extracting it
1627 * from its idle tree.
1628 */
1629static void __bfq_activate_entity(struct bfq_entity *entity,
1630 bool non_blocking_wait_rq)
1631{
1632 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1633 bool backshifted = false;
1634 unsigned long long min_vstart;
1635
1636 /* See comments on bfq_bfqq_update_budg_for_activation */
1637 if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
1638 backshifted = true;
1639 min_vstart = entity->finish;
1640 } else
1641 min_vstart = st->vtime;
1642
1643 if (entity->tree == &st->idle) {
1644 /*
1645 * Must be on the idle tree, bfq_idle_extract() will
1646 * check for that.
1647 */
1648 bfq_idle_extract(st, entity);
1649 entity->start = bfq_gt(min_vstart, entity->finish) ?
1650 min_vstart : entity->finish;
1651 } else {
1652 /*
1653 * The finish time of the entity may be invalid, and
1654 * it is in the past for sure, otherwise the queue
1655 * would have been on the idle tree.
1656 */
1657 entity->start = min_vstart;
1658 st->wsum += entity->weight;
1659 /*
1660 * entity is about to be inserted into a service tree,
1661 * and then set in service: get a reference to make
1662 * sure entity does not disappear until it is no
1663 * longer in service or scheduled for service.
1664 */
1665 bfq_get_entity(entity);
1666
1667 entity->on_st = true;
1668 }
1669
1670 bfq_update_fin_time_enqueue(entity, st, backshifted);
1671}
1672
1673/**
1674 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
1675 * @entity: the entity being requeued or repositioned.
1676 *
1677 * Requeueing is needed if this entity stops being served, which
1678 * happens if a leaf descendant entity has expired. On the other hand,
1679 * repositioning is needed if the next_in_service entity for the child
1680 * entity has changed. See the comments inside the function for
1681 * details.
1682 *
1683 * Basically, this function: 1) removes entity from its active tree if
1684 * present there, 2) updates the timestamps of entity and 3) inserts
1685 * entity back into its active tree (in the new, right position for
1686 * the new values of the timestamps).
1687 */
1688static void __bfq_requeue_entity(struct bfq_entity *entity)
1689{
1690 struct bfq_sched_data *sd = entity->sched_data;
1691 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1692
1693 if (entity == sd->in_service_entity) {
1694 /*
1695 * We are requeueing the current in-service entity,
1696 * which may have to be done for one of the following
1697 * reasons:
1698 * - entity represents the in-service queue, and the
1699 * in-service queue is being requeued after an
1700 * expiration;
1701 * - entity represents a group, and its budget has
1702 * changed because one of its child entities has
1703 * just been either activated or requeued for some
1704 * reason; the timestamps of the entity need then to
1705 * be updated, and the entity needs to be enqueued
1706 * or repositioned accordingly.
1707 *
1708 * In particular, before requeueing, the start time of
1709 * the entity must be moved forward to account for the
1710 * service that the entity has received while in
1711 * service. This is done by the next instructions. The
1712 * finish time will then be updated according to this
1713 * new value of the start time, and to the budget of
1714 * the entity.
1715 */
1716 bfq_calc_finish(entity, entity->service);
1717 entity->start = entity->finish;
1718 /*
1719 * In addition, if the entity had more than one child
1720 * when set in service, then was not extracted from
1721 * the active tree. This implies that the position of
1722 * the entity in the active tree may need to be
1723 * changed now, because we have just updated the start
1724 * time of the entity, and we will update its finish
1725 * time in a moment (the requeueing is then, more
1726 * precisely, a repositioning in this case). To
1727 * implement this repositioning, we: 1) dequeue the
1728 * entity here, 2) update the finish time and
1729 * requeue the entity according to the new
1730 * timestamps below.
1731 */
1732 if (entity->tree)
1733 bfq_active_extract(st, entity);
1734 } else { /* The entity is already active, and not in service */
1735 /*
1736 * In this case, this function gets called only if the
1737 * next_in_service entity below this entity has
1738 * changed, and this change has caused the budget of
1739 * this entity to change, which, finally implies that
1740 * the finish time of this entity must be
1741 * updated. Such an update may cause the scheduling,
1742 * i.e., the position in the active tree, of this
1743 * entity to change. We handle this change by: 1)
1744 * dequeueing the entity here, 2) updating the finish
1745 * time and requeueing the entity according to the new
1746 * timestamps below. This is the same approach as the
1747 * non-extracted-entity sub-case above.
1748 */
1749 bfq_active_extract(st, entity);
1750 }
1751
1752 bfq_update_fin_time_enqueue(entity, st, false);
1753}
1754
1755static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
1756 struct bfq_sched_data *sd,
1757 bool non_blocking_wait_rq)
1758{
1759 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1760
1761 if (sd->in_service_entity == entity || entity->tree == &st->active)
1762 /*
1763 * in service or already queued on the active tree,
1764 * requeue or reposition
1765 */
1766 __bfq_requeue_entity(entity);
1767 else
1768 /*
1769 * Not in service and not queued on its active tree:
1770 * the entity is idle and this is a true activation.
1771 */
1772 __bfq_activate_entity(entity, non_blocking_wait_rq);
1773}
1774
1775
1776/**
1777 * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
1778 * and activate, requeue or reposition all ancestors
1779 * for which such an update becomes necessary.
1333 * @entity: the entity to activate. 1780 * @entity: the entity to activate.
1334 * @non_blocking_wait_rq: true if this entity was waiting for a request 1781 * @non_blocking_wait_rq: true if this entity was waiting for a request
1335 * 1782 * @requeue: true if this is a requeue, which implies that bfqq is
1336 * Activate @entity and all the entities on the path from it to the root. 1783 * being expired; thus ALL its ancestors stop being served and must
1784 * therefore be requeued
1337 */ 1785 */
1338static void bfq_activate_entity(struct bfq_entity *entity, 1786static void bfq_activate_requeue_entity(struct bfq_entity *entity,
1339 bool non_blocking_wait_rq) 1787 bool non_blocking_wait_rq,
1788 bool requeue)
1340{ 1789{
1341 struct bfq_sched_data *sd; 1790 struct bfq_sched_data *sd;
1342 1791
1343 for_each_entity(entity) { 1792 for_each_entity(entity) {
1344 __bfq_activate_entity(entity, non_blocking_wait_rq);
1345
1346 sd = entity->sched_data; 1793 sd = entity->sched_data;
1347 if (!bfq_update_next_in_service(sd)) 1794 __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
1348 /* 1795
1349 * No need to propagate the activation to the 1796 if (!bfq_update_next_in_service(sd, entity) && !requeue)
1350 * upper entities, as they will be updated when
1351 * the in-service entity is rescheduled.
1352 */
1353 break; 1797 break;
1354 } 1798 }
1355} 1799}
@@ -1357,52 +1801,48 @@ static void bfq_activate_entity(struct bfq_entity *entity,
1357/** 1801/**
1358 * __bfq_deactivate_entity - deactivate an entity from its service tree. 1802 * __bfq_deactivate_entity - deactivate an entity from its service tree.
1359 * @entity: the entity to deactivate. 1803 * @entity: the entity to deactivate.
1360 * @requeue: if false, the entity will not be put into the idle tree. 1804 * @ins_into_idle_tree: if false, the entity will not be put into the
1805 * idle tree.
1361 * 1806 *
1362 * Deactivate an entity, independently from its previous state. If the 1807 * Deactivates an entity, independently from its previous state. Must
1363 * entity was not on a service tree just return, otherwise if it is on 1808 * be invoked only if entity is on a service tree. Extracts the entity
1364 * any scheduler tree, extract it from that tree, and if necessary 1809 * from that tree, and if necessary and allowed, puts it on the idle
1365 * and if the caller did not specify @requeue, put it on the idle tree. 1810 * tree.
1366 *
1367 * Return %1 if the caller should update the entity hierarchy, i.e.,
1368 * if the entity was in service or if it was the next_in_service for
1369 * its sched_data; return %0 otherwise.
1370 */ 1811 */
1371static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) 1812static bool __bfq_deactivate_entity(struct bfq_entity *entity,
1813 bool ins_into_idle_tree)
1372{ 1814{
1373 struct bfq_sched_data *sd = entity->sched_data; 1815 struct bfq_sched_data *sd = entity->sched_data;
1374 struct bfq_service_tree *st = bfq_entity_service_tree(entity); 1816 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1375 int is_in_service = entity == sd->in_service_entity; 1817 int is_in_service = entity == sd->in_service_entity;
1376 int ret = 0;
1377 1818
1378 if (!entity->on_st) 1819 if (!entity->on_st) /* entity never activated, or already inactive */
1379 return 0; 1820 return false;
1380 1821
1381 if (is_in_service) { 1822 if (is_in_service)
1382 bfq_calc_finish(entity, entity->service); 1823 bfq_calc_finish(entity, entity->service);
1383 sd->in_service_entity = NULL; 1824
1384 } else if (entity->tree == &st->active) 1825 if (entity->tree == &st->active)
1385 bfq_active_extract(st, entity); 1826 bfq_active_extract(st, entity);
1386 else if (entity->tree == &st->idle) 1827 else if (!is_in_service && entity->tree == &st->idle)
1387 bfq_idle_extract(st, entity); 1828 bfq_idle_extract(st, entity);
1388 1829
1389 if (is_in_service || sd->next_in_service == entity) 1830 if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
1390 ret = bfq_update_next_in_service(sd);
1391
1392 if (!requeue || !bfq_gt(entity->finish, st->vtime))
1393 bfq_forget_entity(st, entity, is_in_service); 1831 bfq_forget_entity(st, entity, is_in_service);
1394 else 1832 else
1395 bfq_idle_insert(st, entity); 1833 bfq_idle_insert(st, entity);
1396 1834
1397 return ret; 1835 return true;
1398} 1836}
1399 1837
1400/** 1838/**
1401 * bfq_deactivate_entity - deactivate an entity. 1839 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
1402 * @entity: the entity to deactivate. 1840 * @entity: the entity to deactivate.
1403 * @requeue: true if the entity can be put on the idle tree 1841 * @ins_into_idle_tree: true if the entity can be put on the idle tree
1404 */ 1842 */
1405static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) 1843static void bfq_deactivate_entity(struct bfq_entity *entity,
1844 bool ins_into_idle_tree,
1845 bool expiration)
1406{ 1846{
1407 struct bfq_sched_data *sd; 1847 struct bfq_sched_data *sd;
1408 struct bfq_entity *parent = NULL; 1848 struct bfq_entity *parent = NULL;
@@ -1410,63 +1850,102 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
1410 for_each_entity_safe(entity, parent) { 1850 for_each_entity_safe(entity, parent) {
1411 sd = entity->sched_data; 1851 sd = entity->sched_data;
1412 1852
1413 if (!__bfq_deactivate_entity(entity, requeue)) 1853 if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
1414 /* 1854 /*
1415 * The parent entity is still backlogged, and 1855 * entity is not in any tree any more, so
1416 * we don't need to update it as it is still 1856 * this deactivation is a no-op, and there is
1417 * in service. 1857 * nothing to change for upper-level entities
1858 * (in case of expiration, this can never
1859 * happen).
1418 */ 1860 */
1419 break; 1861 return;
1862 }
1863
1864 if (sd->next_in_service == entity)
1865 /*
1866 * entity was the next_in_service entity,
1867 * then, since entity has just been
1868 * deactivated, a new one must be found.
1869 */
1870 bfq_update_next_in_service(sd, NULL);
1420 1871
1421 if (sd->next_in_service) 1872 if (sd->next_in_service)
1422 /* 1873 /*
1423 * The parent entity is still backlogged and 1874 * The parent entity is still backlogged,
1424 * the budgets on the path towards the root 1875 * because next_in_service is not NULL. So, no
1425 * need to be updated. 1876 * further upwards deactivation must be
1877 * performed. Yet, next_in_service has
1878 * changed. Then the schedule does need to be
1879 * updated upwards.
1426 */ 1880 */
1427 goto update; 1881 break;
1428 1882
1429 /* 1883 /*
1430 * If we get here, then the parent is no more backlogged and 1884 * If we get here, then the parent is no more
1431 * we want to propagate the deactivation upwards. 1885 * backlogged and we need to propagate the
1886 * deactivation upwards. Thus let the loop go on.
1432 */ 1887 */
1433 requeue = 1;
1434 }
1435 1888
1436 return; 1889 /*
1890 * Also let parent be queued into the idle tree on
1891 * deactivation, to preserve service guarantees, and
1892 * assuming that who invoked this function does not
1893 * need parent entities too to be removed completely.
1894 */
1895 ins_into_idle_tree = true;
1896 }
1437 1897
1438update: 1898 /*
1899 * If the deactivation loop is fully executed, then there are
1900 * no more entities to touch and next loop is not executed at
1901 * all. Otherwise, requeue remaining entities if they are
1902 * about to stop receiving service, or reposition them if this
1903 * is not the case.
1904 */
1439 entity = parent; 1905 entity = parent;
1440 for_each_entity(entity) { 1906 for_each_entity(entity) {
1441 __bfq_activate_entity(entity, false); 1907 /*
1908 * Invoke __bfq_requeue_entity on entity, even if
1909 * already active, to requeue/reposition it in the
1910 * active tree (because sd->next_in_service has
1911 * changed)
1912 */
1913 __bfq_requeue_entity(entity);
1442 1914
1443 sd = entity->sched_data; 1915 sd = entity->sched_data;
1444 if (!bfq_update_next_in_service(sd)) 1916 if (!bfq_update_next_in_service(sd, entity) &&
1917 !expiration)
1918 /*
1919 * next_in_service unchanged or not causing
1920 * any change in entity->parent->sd, and no
1921 * requeueing needed for expiration: stop
1922 * here.
1923 */
1445 break; 1924 break;
1446 } 1925 }
1447} 1926}
1448 1927
1449/** 1928/**
1450 * bfq_update_vtime - update vtime if necessary. 1929 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
1930 * if needed, to have at least one entity eligible.
1451 * @st: the service tree to act upon. 1931 * @st: the service tree to act upon.
1452 * 1932 *
1453 * If necessary update the service tree vtime to have at least one 1933 * Assumes that st is not empty.
1454 * eligible entity, skipping to its start time. Assumes that the
1455 * active tree of the device is not empty.
1456 *
1457 * NOTE: this hierarchical implementation updates vtimes quite often,
1458 * we may end up with reactivated processes getting timestamps after a
1459 * vtime skip done because we needed a ->first_active entity on some
1460 * intermediate node.
1461 */ 1934 */
1462static void bfq_update_vtime(struct bfq_service_tree *st) 1935static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
1463{ 1936{
1464 struct bfq_entity *entry; 1937 struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
1465 struct rb_node *node = st->active.rb_node; 1938
1939 if (bfq_gt(root_entity->min_start, st->vtime))
1940 return root_entity->min_start;
1941
1942 return st->vtime;
1943}
1466 1944
1467 entry = rb_entry(node, struct bfq_entity, rb_node); 1945static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
1468 if (bfq_gt(entry->min_start, st->vtime)) { 1946{
1469 st->vtime = entry->min_start; 1947 if (new_value > st->vtime) {
1948 st->vtime = new_value;
1470 bfq_forget_idle(st); 1949 bfq_forget_idle(st);
1471 } 1950 }
1472} 1951}
@@ -1475,6 +1954,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st)
1475 * bfq_first_active_entity - find the eligible entity with 1954 * bfq_first_active_entity - find the eligible entity with
1476 * the smallest finish time 1955 * the smallest finish time
1477 * @st: the service tree to select from. 1956 * @st: the service tree to select from.
1957 * @vtime: the system virtual time to use as a reference for eligibility
1478 * 1958 *
1479 * This function searches the first schedulable entity, starting from the 1959 * This function searches the first schedulable entity, starting from the
1480 * root of the tree and going on the left every time on this side there is 1960 * root of the tree and going on the left every time on this side there is
@@ -1482,7 +1962,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st)
1482 * the right is followed only if a) the left subtree contains no eligible 1962 * the right is followed only if a) the left subtree contains no eligible
1483 * entities and b) no eligible entity has been found yet. 1963 * entities and b) no eligible entity has been found yet.
1484 */ 1964 */
1485static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) 1965static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
1966 u64 vtime)
1486{ 1967{
1487 struct bfq_entity *entry, *first = NULL; 1968 struct bfq_entity *entry, *first = NULL;
1488 struct rb_node *node = st->active.rb_node; 1969 struct rb_node *node = st->active.rb_node;
@@ -1490,13 +1971,13 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
1490 while (node) { 1971 while (node) {
1491 entry = rb_entry(node, struct bfq_entity, rb_node); 1972 entry = rb_entry(node, struct bfq_entity, rb_node);
1492left: 1973left:
1493 if (!bfq_gt(entry->start, st->vtime)) 1974 if (!bfq_gt(entry->start, vtime))
1494 first = entry; 1975 first = entry;
1495 1976
1496 if (node->rb_left) { 1977 if (node->rb_left) {
1497 entry = rb_entry(node->rb_left, 1978 entry = rb_entry(node->rb_left,
1498 struct bfq_entity, rb_node); 1979 struct bfq_entity, rb_node);
1499 if (!bfq_gt(entry->min_start, st->vtime)) { 1980 if (!bfq_gt(entry->min_start, vtime)) {
1500 node = node->rb_left; 1981 node = node->rb_left;
1501 goto left; 1982 goto left;
1502 } 1983 }
@@ -1513,30 +1994,53 @@ left:
1513 * __bfq_lookup_next_entity - return the first eligible entity in @st. 1994 * __bfq_lookup_next_entity - return the first eligible entity in @st.
1514 * @st: the service tree. 1995 * @st: the service tree.
1515 * 1996 *
1516 * Update the virtual time in @st and return the first eligible entity 1997 * If there is no in-service entity for the sched_data st belongs to,
1517 * it contains. 1998 * then return the entity that will be set in service if:
1999 * 1) the parent entity this st belongs to is set in service;
2000 * 2) no entity belonging to such parent entity undergoes a state change
2001 * that would influence the timestamps of the entity (e.g., becomes idle,
2002 * becomes backlogged, changes its budget, ...).
2003 *
2004 * In this first case, update the virtual time in @st too (see the
2005 * comments on this update inside the function).
2006 *
2007 * In contrast, if there is an in-service entity, then return the
2008 * entity that would be set in service if not only the above
2009 * conditions, but also the next one held true: the currently
2010 * in-service entity, on expiration,
2011 * 1) gets a finish time equal to the current one, or
2012 * 2) is not eligible any more, or
2013 * 3) is idle.
1518 */ 2014 */
1519static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, 2015static struct bfq_entity *
1520 bool force) 2016__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
1521{ 2017{
1522 struct bfq_entity *entity, *new_next_in_service = NULL; 2018 struct bfq_entity *entity;
2019 u64 new_vtime;
1523 2020
1524 if (RB_EMPTY_ROOT(&st->active)) 2021 if (RB_EMPTY_ROOT(&st->active))
1525 return NULL; 2022 return NULL;
1526 2023
1527 bfq_update_vtime(st); 2024 /*
1528 entity = bfq_first_active_entity(st); 2025 * Get the value of the system virtual time for which at
2026 * least one entity is eligible.
2027 */
2028 new_vtime = bfq_calc_vtime_jump(st);
1529 2029
1530 /* 2030 /*
1531 * If the chosen entity does not match with the sched_data's 2031 * If there is no in-service entity for the sched_data this
1532 * next_in_service and we are forcedly serving the IDLE priority 2032 * active tree belongs to, then push the system virtual time
1533 * class tree, bubble up budget update. 2033 * up to the value that guarantees that at least one entity is
2034 * eligible. If, instead, there is an in-service entity, then
2035 * do not make any such update, because there is already an
2036 * eligible entity, namely the in-service one (even if the
2037 * entity is not on st, because it was extracted when set in
2038 * service).
1534 */ 2039 */
1535 if (unlikely(force && entity != entity->sched_data->next_in_service)) { 2040 if (!in_service)
1536 new_next_in_service = entity; 2041 bfq_update_vtime(st, new_vtime);
1537 for_each_entity(new_next_in_service) 2042
1538 bfq_update_budget(new_next_in_service); 2043 entity = bfq_first_active_entity(st, new_vtime);
1539 }
1540 2044
1541 return entity; 2045 return entity;
1542} 2046}
@@ -1544,63 +2048,58 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
1544/** 2048/**
1545 * bfq_lookup_next_entity - return the first eligible entity in @sd. 2049 * bfq_lookup_next_entity - return the first eligible entity in @sd.
1546 * @sd: the sched_data. 2050 * @sd: the sched_data.
1547 * @extract: if true the returned entity will be also extracted from @sd.
1548 * 2051 *
1549 * NOTE: since we cache the next_in_service entity at each level of the 2052 * This function is invoked when there has been a change in the trees
 1550 * hierarchy, the complexity of the lookup can be decreased with 2053 * for sd, and we need to know what the new next entity is after this
1551 * absolutely no effort just returning the cached next_in_service value; 2054 * change.
1552 * we prefer to do full lookups to test the consistency of the data
1553 * structures.
1554 */ 2055 */
1555static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, 2056static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
1556 int extract,
1557 struct bfq_data *bfqd)
1558{ 2057{
1559 struct bfq_service_tree *st = sd->service_tree; 2058 struct bfq_service_tree *st = sd->service_tree;
1560 struct bfq_entity *entity; 2059 struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
1561 int i = 0; 2060 struct bfq_entity *entity = NULL;
2061 int class_idx = 0;
1562 2062
1563 /* 2063 /*
1564 * Choose from idle class, if needed to guarantee a minimum 2064 * Choose from idle class, if needed to guarantee a minimum
1565 * bandwidth to this class. This should also mitigate 2065 * bandwidth to this class (and if there is some active entity
2066 * in idle class). This should also mitigate
1566 * priority-inversion problems in case a low priority task is 2067 * priority-inversion problems in case a low priority task is
1567 * holding file system resources. 2068 * holding file system resources.
1568 */ 2069 */
1569 if (bfqd && 2070 if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
1570 jiffies - bfqd->bfq_class_idle_last_service > 2071 BFQ_CL_IDLE_TIMEOUT)) {
1571 BFQ_CL_IDLE_TIMEOUT) { 2072 if (!RB_EMPTY_ROOT(&idle_class_st->active))
1572 entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, 2073 class_idx = BFQ_IOPRIO_CLASSES - 1;
1573 true); 2074 /* About to be served if backlogged, or not yet backlogged */
1574 if (entity) { 2075 sd->bfq_class_idle_last_service = jiffies;
1575 i = BFQ_IOPRIO_CLASSES - 1;
1576 bfqd->bfq_class_idle_last_service = jiffies;
1577 sd->next_in_service = entity;
1578 }
1579 } 2076 }
1580 for (; i < BFQ_IOPRIO_CLASSES; i++) { 2077
1581 entity = __bfq_lookup_next_entity(st + i, false); 2078 /*
1582 if (entity) { 2079 * Find the next entity to serve for the highest-priority
1583 if (extract) { 2080 * class, unless the idle class needs to be served.
1584 bfq_check_next_in_service(sd, entity); 2081 */
1585 bfq_active_extract(st + i, entity); 2082 for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
1586 sd->in_service_entity = entity; 2083 entity = __bfq_lookup_next_entity(st + class_idx,
1587 sd->next_in_service = NULL; 2084 sd->in_service_entity);
1588 } 2085
2086 if (entity)
1589 break; 2087 break;
1590 }
1591 } 2088 }
1592 2089
2090 if (!entity)
2091 return NULL;
2092
1593 return entity; 2093 return entity;
1594} 2094}
1595 2095
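
The class-selection logic above can be summarized as: serve CLASS_IDLE when it has been starved for longer than BFQ_CL_IDLE_TIMEOUT and actually has work, otherwise scan the classes from highest priority down. A rough userspace sketch of that loop (types and constants below are made up for illustration):

#include <stdio.h>
#include <stdbool.h>

#define TOY_NR_CLASSES	3	/* RT, BE, IDLE, mirroring BFQ_IOPRIO_CLASSES */
#define TOY_IDLE_TIMEOUT 10	/* arbitrary units for the sketch */

struct toy_sched {
	bool backlogged[TOY_NR_CLASSES];	/* does the class have work? */
	unsigned long idle_last_service;
};

static int toy_pick_class(struct toy_sched *sd, unsigned long now)
{
	int class = 0;

	/* Guarantee a minimum share to the idle class, as the code above does. */
	if (now - sd->idle_last_service > TOY_IDLE_TIMEOUT) {
		if (sd->backlogged[TOY_NR_CLASSES - 1])
			class = TOY_NR_CLASSES - 1;
		sd->idle_last_service = now;
	}

	for (; class < TOY_NR_CLASSES; class++)
		if (sd->backlogged[class])
			return class;

	return -1;	/* nothing to serve */
}

int main(void)
{
	struct toy_sched sd = { { true, true, true }, 0 };

	printf("picked class %d\n", toy_pick_class(&sd, 100));	/* 2: idle */
	return 0;
}
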
1596static bool next_queue_may_preempt(struct bfq_data *bfqd) 2096static bool next_queue_may_preempt(struct bfq_data *bfqd)
1597{ 2097{
1598 struct bfq_sched_data *sd = &bfqd->sched_data; 2098 struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
1599 2099
1600 return sd->next_in_service != sd->in_service_entity; 2100 return sd->next_in_service != sd->in_service_entity;
1601} 2101}
1602 2102
1603
1604/* 2103/*
1605 * Get next queue for service. 2104 * Get next queue for service.
1606 */ 2105 */
@@ -1613,14 +2112,105 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
1613 if (bfqd->busy_queues == 0) 2112 if (bfqd->busy_queues == 0)
1614 return NULL; 2113 return NULL;
1615 2114
1616 sd = &bfqd->sched_data; 2115 /*
2116 * Traverse the path from the root to the leaf entity to
2117 * serve. Set in service all the entities visited along the
2118 * way.
2119 */
2120 sd = &bfqd->root_group->sched_data;
1617 for (; sd ; sd = entity->my_sched_data) { 2121 for (; sd ; sd = entity->my_sched_data) {
1618 entity = bfq_lookup_next_entity(sd, 1, bfqd); 2122 /*
2123 * WARNING. We are about to set the in-service entity
2124 * to sd->next_in_service, i.e., to the (cached) value
2125 * returned by bfq_lookup_next_entity(sd) the last
2126 * time it was invoked, i.e., the last time when the
2127 * service order in sd changed as a consequence of the
2128 * activation or deactivation of an entity. In this
2129 * respect, if we execute bfq_lookup_next_entity(sd)
2130 * in this very moment, it may, although with low
2131 * probability, yield a different entity than that
2132 * pointed to by sd->next_in_service. This rare event
2133 * happens in case there was no CLASS_IDLE entity to
2134 * serve for sd when bfq_lookup_next_entity(sd) was
2135 * invoked for the last time, while there is now one
2136 * such entity.
2137 *
2138 * If the above event happens, then the scheduling of
2139 * such entity in CLASS_IDLE is postponed until the
2140 * service of the sd->next_in_service entity
2141 * finishes. In fact, when the latter is expired,
2142 * bfq_lookup_next_entity(sd) gets called again,
2143 * exactly to update sd->next_in_service.
2144 */
2145
2146 /* Make next_in_service entity become in_service_entity */
2147 entity = sd->next_in_service;
2148 sd->in_service_entity = entity;
2149
2150 /*
2151 * Reset the accumulator of the amount of service that
2152 * the entity is about to receive.
2153 */
1619 entity->service = 0; 2154 entity->service = 0;
2155
2156 /*
2157 * If entity is no longer a candidate for next
2158 * service, then we extract it from its active tree,
2159 * for the following reason. To further boost the
2160 * throughput in some special case, BFQ needs to know
2161 * which is the next candidate entity to serve, while
2162 * there is already an entity in service. In this
2163 * respect, to make it easy to compute/update the next
2164 * candidate entity to serve after the current
2165 * candidate has been set in service, there is a case
2166 * where it is necessary to extract the current
2167 * candidate from its service tree. Such a case is
2168 * when the entity just set in service cannot be also
2169 * a candidate for next service. Details about when
 2170 * this condition holds are reported in the comments
2171 * on the function bfq_no_longer_next_in_service()
2172 * invoked below.
2173 */
2174 if (bfq_no_longer_next_in_service(entity))
2175 bfq_active_extract(bfq_entity_service_tree(entity),
2176 entity);
2177
2178 /*
2179 * For the same reason why we may have just extracted
2180 * entity from its active tree, we may need to update
2181 * next_in_service for the sched_data of entity too,
2182 * regardless of whether entity has been extracted.
2183 * In fact, even if entity has not been extracted, a
2184 * descendant entity may get extracted. Such an event
2185 * would cause a change in next_in_service for the
2186 * level of the descendant entity, and thus possibly
2187 * back to upper levels.
2188 *
2189 * We cannot perform the resulting needed update
2190 * before the end of this loop, because, to know which
2191 * is the correct next-to-serve candidate entity for
2192 * each level, we need first to find the leaf entity
2193 * to set in service. In fact, only after we know
2194 * which is the next-to-serve leaf entity, we can
2195 * discover whether the parent entity of the leaf
2196 * entity becomes the next-to-serve, and so on.
2197 */
2198
1620 } 2199 }
1621 2200
1622 bfqq = bfq_entity_to_bfqq(entity); 2201 bfqq = bfq_entity_to_bfqq(entity);
1623 2202
2203 /*
2204 * We can finally update all next-to-serve entities along the
2205 * path from the leaf entity just set in service to the root.
2206 */
2207 for_each_entity(entity) {
2208 struct bfq_sched_data *sd = entity->sched_data;
2209
2210 if (!bfq_update_next_in_service(sd, NULL))
2211 break;
2212 }
2213
1624 return bfqq; 2214 return bfqq;
1625} 2215}
1626 2216
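
In other words, bfq_get_next_queue works in two passes over the hierarchy: a top-down pass that turns each level's cached next_in_service into the in-service entity, and a bottom-up pass (the for_each_entity loop above) that recomputes the cached candidates. A very reduced sketch of the two passes, with hypothetical types and one child level per node:

#include <stdio.h>
#include <stddef.h>

struct toy_entity {
	struct toy_entity *next_in_service;	/* cached next candidate */
	struct toy_entity *in_service;		/* entity currently served */
	struct toy_entity *parent;
	const char *name;
};

/* Pass 1: walk down from the root, putting the cached candidates in service. */
static struct toy_entity *toy_set_in_service(struct toy_entity *root)
{
	struct toy_entity *e = root;

	while (e->next_in_service) {
		e->in_service = e->next_in_service;
		e = e->in_service;
	}
	return e;	/* the leaf that actually receives service */
}

/* Pass 2: walk back up, recomputing the cached candidates (stubbed here). */
static void toy_update_next(struct toy_entity *leaf)
{
	struct toy_entity *e;

	for (e = leaf->parent; e; e = e->parent)
		e->next_in_service = NULL;	/* placeholder recomputation */
}

int main(void)
{
	struct toy_entity leaf  = { NULL, NULL, NULL, "leaf" };
	struct toy_entity group = { &leaf, NULL, NULL, "group" };
	struct toy_entity root  = { &group, NULL, NULL, "root" };

	leaf.parent = &group;
	group.parent = &root;

	printf("serving %s\n", toy_set_in_service(&root)->name);
	toy_update_next(&leaf);
	return 0;
}
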
@@ -1628,6 +2218,7 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
1628{ 2218{
1629 struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; 2219 struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
1630 struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; 2220 struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
2221 struct bfq_entity *entity = in_serv_entity;
1631 2222
1632 if (bfqd->in_service_bic) { 2223 if (bfqd->in_service_bic) {
1633 put_io_context(bfqd->in_service_bic->icq.ioc); 2224 put_io_context(bfqd->in_service_bic->icq.ioc);
@@ -1639,6 +2230,15 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
1639 bfqd->in_service_queue = NULL; 2230 bfqd->in_service_queue = NULL;
1640 2231
1641 /* 2232 /*
2233 * When this function is called, all in-service entities have
2234 * been properly deactivated or requeued, so we can safely
2235 * execute the final step: reset in_service_entity along the
2236 * path from entity to the root.
2237 */
2238 for_each_entity(entity)
2239 entity->sched_data->in_service_entity = NULL;
2240
2241 /*
1642 * in_serv_entity is no longer in service, so, if it is in no 2242 * in_serv_entity is no longer in service, so, if it is in no
1643 * service tree either, then release the service reference to 2243 * service tree either, then release the service reference to
1644 * the queue it represents (taken with bfq_get_entity). 2244 * the queue it represents (taken with bfq_get_entity).
@@ -1648,27 +2248,39 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
1648} 2248}
1649 2249
1650static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, 2250static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1651 int requeue) 2251 bool ins_into_idle_tree, bool expiration)
1652{ 2252{
1653 struct bfq_entity *entity = &bfqq->entity; 2253 struct bfq_entity *entity = &bfqq->entity;
1654 2254
1655 bfq_deactivate_entity(entity, requeue); 2255 bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
1656} 2256}
1657 2257
1658static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) 2258static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1659{ 2259{
1660 struct bfq_entity *entity = &bfqq->entity; 2260 struct bfq_entity *entity = &bfqq->entity;
1661 2261
1662 bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); 2262 bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
2263 false);
1663 bfq_clear_bfqq_non_blocking_wait_rq(bfqq); 2264 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1664} 2265}
1665 2266
2267static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2268{
2269 struct bfq_entity *entity = &bfqq->entity;
2270
2271 bfq_activate_requeue_entity(entity, false,
2272 bfqq == bfqd->in_service_queue);
2273}
2274
2275static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
2276
1666/* 2277/*
1667 * Called when the bfqq no longer has requests pending, remove it from 2278 * Called when the bfqq no longer has requests pending, remove it from
1668 * the service tree. 2279 * the service tree. As a special case, it can be invoked during an
2280 * expiration.
1669 */ 2281 */
1670static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, 2282static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1671 int requeue) 2283 bool expiration)
1672{ 2284{
1673 bfq_log_bfqq(bfqd, bfqq, "del from busy"); 2285 bfq_log_bfqq(bfqd, bfqq, "del from busy");
1674 2286
@@ -1676,7 +2288,9 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1676 2288
1677 bfqd->busy_queues--; 2289 bfqd->busy_queues--;
1678 2290
1679 bfq_deactivate_bfqq(bfqd, bfqq, requeue); 2291 bfqg_stats_update_dequeue(bfqq_group(bfqq));
2292
2293 bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
1680} 2294}
1681 2295
1682/* 2296/*
@@ -1692,36 +2306,1110 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1692 bfqd->busy_queues++; 2306 bfqd->busy_queues++;
1693} 2307}
1694 2308
1695static void bfq_init_entity(struct bfq_entity *entity) 2309#ifdef CONFIG_BFQ_GROUP_IOSCHED
2310
2311/* bfqg stats flags */
2312enum bfqg_stats_flags {
2313 BFQG_stats_waiting = 0,
2314 BFQG_stats_idling,
2315 BFQG_stats_empty,
2316};
2317
2318#define BFQG_FLAG_FNS(name) \
2319static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
2320{ \
2321 stats->flags |= (1 << BFQG_stats_##name); \
2322} \
2323static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
2324{ \
2325 stats->flags &= ~(1 << BFQG_stats_##name); \
2326} \
2327static int bfqg_stats_##name(struct bfqg_stats *stats) \
2328{ \
2329 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
2330} \
2331
2332BFQG_FLAG_FNS(waiting)
2333BFQG_FLAG_FNS(idling)
2334BFQG_FLAG_FNS(empty)
2335#undef BFQG_FLAG_FNS
2336
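
BFQG_FLAG_FNS stamps out a mark/clear/test helper triple for each stats flag. A self-contained sketch of the same pattern, with hypothetical names, showing what one expansion provides:

#include <stdio.h>

struct toy_stats { unsigned int flags; };

enum toy_stats_flags { TOY_stats_waiting = 0, TOY_stats_idling };

/* Same shape as BFQG_FLAG_FNS: generate mark/clear/test per flag name. */
#define TOY_FLAG_FNS(name)						\
static void toy_stats_mark_##name(struct toy_stats *s)			\
{									\
	s->flags |= (1 << TOY_stats_##name);				\
}									\
static void toy_stats_clear_##name(struct toy_stats *s)		\
{									\
	s->flags &= ~(1 << TOY_stats_##name);				\
}									\
static int toy_stats_##name(struct toy_stats *s)			\
{									\
	return (s->flags & (1 << TOY_stats_##name)) != 0;		\
}

TOY_FLAG_FNS(waiting)
TOY_FLAG_FNS(idling)
#undef TOY_FLAG_FNS

int main(void)
{
	struct toy_stats s = { 0 };

	toy_stats_mark_waiting(&s);
	toy_stats_mark_idling(&s);
	toy_stats_clear_idling(&s);
	printf("waiting=%d idling=%d\n", toy_stats_waiting(&s),
	       toy_stats_idling(&s));
	toy_stats_clear_waiting(&s);
	return 0;
}
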
2337/* This should be called with the queue_lock held. */
2338static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
2339{
2340 unsigned long long now;
2341
2342 if (!bfqg_stats_waiting(stats))
2343 return;
2344
2345 now = sched_clock();
2346 if (time_after64(now, stats->start_group_wait_time))
2347 blkg_stat_add(&stats->group_wait_time,
2348 now - stats->start_group_wait_time);
2349 bfqg_stats_clear_waiting(stats);
2350}
2351
2352/* This should be called with the queue_lock held. */
2353static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
2354 struct bfq_group *curr_bfqg)
2355{
2356 struct bfqg_stats *stats = &bfqg->stats;
2357
2358 if (bfqg_stats_waiting(stats))
2359 return;
2360 if (bfqg == curr_bfqg)
2361 return;
2362 stats->start_group_wait_time = sched_clock();
2363 bfqg_stats_mark_waiting(stats);
2364}
2365
2366/* This should be called with the queue_lock held. */
2367static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
2368{
2369 unsigned long long now;
2370
2371 if (!bfqg_stats_empty(stats))
2372 return;
2373
2374 now = sched_clock();
2375 if (time_after64(now, stats->start_empty_time))
2376 blkg_stat_add(&stats->empty_time,
2377 now - stats->start_empty_time);
2378 bfqg_stats_clear_empty(stats);
2379}
2380
2381static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
2382{
2383 blkg_stat_add(&bfqg->stats.dequeue, 1);
2384}
2385
2386static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
2387{
2388 struct bfqg_stats *stats = &bfqg->stats;
2389
2390 if (blkg_rwstat_total(&stats->queued))
2391 return;
2392
2393 /*
 2394 * The group is already marked empty. This can happen if bfqq got a new
 2395 * request in its parent group and moved to this group while being added
 2396 * to the service tree. Just ignore the event and move on.
2397 */
2398 if (bfqg_stats_empty(stats))
2399 return;
2400
2401 stats->start_empty_time = sched_clock();
2402 bfqg_stats_mark_empty(stats);
2403}
2404
2405static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
2406{
2407 struct bfqg_stats *stats = &bfqg->stats;
2408
2409 if (bfqg_stats_idling(stats)) {
2410 unsigned long long now = sched_clock();
2411
2412 if (time_after64(now, stats->start_idle_time))
2413 blkg_stat_add(&stats->idle_time,
2414 now - stats->start_idle_time);
2415 bfqg_stats_clear_idling(stats);
2416 }
2417}
2418
2419static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
2420{
2421 struct bfqg_stats *stats = &bfqg->stats;
2422
2423 stats->start_idle_time = sched_clock();
2424 bfqg_stats_mark_idling(stats);
2425}
2426
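
The idle-time pair above (set_start_idle_time / update_idle_time) follows the same start/stop accounting pattern used for group_wait_time and empty_time: record a timestamp when the interval opens, accumulate the delta when it closes. A compact sketch of that pattern with made-up names (a plain counter stands in for sched_clock()):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

static uint64_t toy_clock;	/* stand-in for sched_clock() */

struct toy_interval_stat {
	bool running;
	uint64_t start;
	uint64_t total;
};

/* Open an interval: remember when it started and mark it as running. */
static void toy_interval_start(struct toy_interval_stat *s)
{
	if (s->running)
		return;
	s->start = toy_clock;
	s->running = true;
}

/* Close an interval: accumulate the elapsed time, as the helpers above do. */
static void toy_interval_end(struct toy_interval_stat *s)
{
	if (!s->running)
		return;
	if (toy_clock > s->start)
		s->total += toy_clock - s->start;
	s->running = false;
}

int main(void)
{
	struct toy_interval_stat idle = { false, 0, 0 };

	toy_interval_start(&idle);
	toy_clock = 7;
	toy_interval_end(&idle);
	printf("accumulated idle time: %llu\n",
	       (unsigned long long)idle.total);
	return 0;
}
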
2427static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
2428{
2429 struct bfqg_stats *stats = &bfqg->stats;
2430
2431 blkg_stat_add(&stats->avg_queue_size_sum,
2432 blkg_rwstat_total(&stats->queued));
2433 blkg_stat_add(&stats->avg_queue_size_samples, 1);
2434 bfqg_stats_update_group_wait_time(stats);
2435}
2436
2437/*
2438 * blk-cgroup policy-related handlers
2439 * The following functions help in converting between blk-cgroup
2440 * internal structures and BFQ-specific structures.
2441 */
2442
2443static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
2444{
2445 return pd ? container_of(pd, struct bfq_group, pd) : NULL;
2446}
2447
2448static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
2449{
2450 return pd_to_blkg(&bfqg->pd);
2451}
2452
2453static struct blkcg_policy blkcg_policy_bfq;
2454
2455static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
2456{
2457 return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
2458}
2459
2460/*
2461 * bfq_group handlers
2462 * The following functions help in navigating the bfq_group hierarchy
 2463 * by allowing one to find the parent of a bfq_group or the bfq_group
 2464 * associated with a bfq_queue.
2465 */
2466
2467static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
2468{
2469 struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
2470
2471 return pblkg ? blkg_to_bfqg(pblkg) : NULL;
2472}
2473
2474static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
2475{
2476 struct bfq_entity *group_entity = bfqq->entity.parent;
2477
2478 return group_entity ? container_of(group_entity, struct bfq_group,
2479 entity) :
2480 bfqq->bfqd->root_group;
2481}
2482
2483/*
2484 * The following two functions handle get and put of a bfq_group by
2485 * wrapping the related blk-cgroup hooks.
2486 */
2487
2488static void bfqg_get(struct bfq_group *bfqg)
2489{
2490 return blkg_get(bfqg_to_blkg(bfqg));
2491}
2492
2493static void bfqg_put(struct bfq_group *bfqg)
2494{
2495 return blkg_put(bfqg_to_blkg(bfqg));
2496}
2497
2498static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
2499 struct bfq_queue *bfqq,
2500 unsigned int op)
2501{
2502 blkg_rwstat_add(&bfqg->stats.queued, op, 1);
2503 bfqg_stats_end_empty_time(&bfqg->stats);
2504 if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
2505 bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
2506}
2507
2508static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
2509{
2510 blkg_rwstat_add(&bfqg->stats.queued, op, -1);
2511}
2512
2513static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
2514{
2515 blkg_rwstat_add(&bfqg->stats.merged, op, 1);
2516}
2517
2518static void bfqg_stats_update_completion(struct bfq_group *bfqg,
2519 uint64_t start_time, uint64_t io_start_time,
2520 unsigned int op)
2521{
2522 struct bfqg_stats *stats = &bfqg->stats;
2523 unsigned long long now = sched_clock();
2524
2525 if (time_after64(now, io_start_time))
2526 blkg_rwstat_add(&stats->service_time, op,
2527 now - io_start_time);
2528 if (time_after64(io_start_time, start_time))
2529 blkg_rwstat_add(&stats->wait_time, op,
2530 io_start_time - start_time);
2531}
2532
2533/* @stats = 0 */
2534static void bfqg_stats_reset(struct bfqg_stats *stats)
2535{
2536 /* queued stats shouldn't be cleared */
2537 blkg_rwstat_reset(&stats->merged);
2538 blkg_rwstat_reset(&stats->service_time);
2539 blkg_rwstat_reset(&stats->wait_time);
2540 blkg_stat_reset(&stats->time);
2541 blkg_stat_reset(&stats->avg_queue_size_sum);
2542 blkg_stat_reset(&stats->avg_queue_size_samples);
2543 blkg_stat_reset(&stats->dequeue);
2544 blkg_stat_reset(&stats->group_wait_time);
2545 blkg_stat_reset(&stats->idle_time);
2546 blkg_stat_reset(&stats->empty_time);
2547}
2548
2549/* @to += @from */
2550static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
2551{
2552 if (!to || !from)
2553 return;
2554
2555 /* queued stats shouldn't be cleared */
2556 blkg_rwstat_add_aux(&to->merged, &from->merged);
2557 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
2558 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
 2559 blkg_stat_add_aux(&to->time, &from->time);
2560 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
2561 blkg_stat_add_aux(&to->avg_queue_size_samples,
2562 &from->avg_queue_size_samples);
2563 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
2564 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
2565 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
2566 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
2567}
2568
2569/*
2570 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
2571 * recursive stats can still account for the amount used by this bfqg after
2572 * it's gone.
2573 */
2574static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
2575{
2576 struct bfq_group *parent;
2577
2578 if (!bfqg) /* root_group */
2579 return;
2580
2581 parent = bfqg_parent(bfqg);
2582
2583 lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
2584
2585 if (unlikely(!parent))
2586 return;
2587
2588 bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
2589 bfqg_stats_reset(&bfqg->stats);
2590}
2591
2592static void bfq_init_entity(struct bfq_entity *entity,
2593 struct bfq_group *bfqg)
1696{ 2594{
1697 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 2595 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1698 2596
1699 entity->weight = entity->new_weight; 2597 entity->weight = entity->new_weight;
1700 entity->orig_weight = entity->new_weight; 2598 entity->orig_weight = entity->new_weight;
2599 if (bfqq) {
2600 bfqq->ioprio = bfqq->new_ioprio;
2601 bfqq->ioprio_class = bfqq->new_ioprio_class;
2602 bfqg_get(bfqg);
2603 }
2604 entity->parent = bfqg->my_entity; /* NULL for root group */
2605 entity->sched_data = &bfqg->sched_data;
2606}
2607
2608static void bfqg_stats_exit(struct bfqg_stats *stats)
2609{
2610 blkg_rwstat_exit(&stats->merged);
2611 blkg_rwstat_exit(&stats->service_time);
2612 blkg_rwstat_exit(&stats->wait_time);
2613 blkg_rwstat_exit(&stats->queued);
2614 blkg_stat_exit(&stats->time);
2615 blkg_stat_exit(&stats->avg_queue_size_sum);
2616 blkg_stat_exit(&stats->avg_queue_size_samples);
2617 blkg_stat_exit(&stats->dequeue);
2618 blkg_stat_exit(&stats->group_wait_time);
2619 blkg_stat_exit(&stats->idle_time);
2620 blkg_stat_exit(&stats->empty_time);
2621}
2622
2623static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
2624{
2625 if (blkg_rwstat_init(&stats->merged, gfp) ||
2626 blkg_rwstat_init(&stats->service_time, gfp) ||
2627 blkg_rwstat_init(&stats->wait_time, gfp) ||
2628 blkg_rwstat_init(&stats->queued, gfp) ||
2629 blkg_stat_init(&stats->time, gfp) ||
2630 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
2631 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
2632 blkg_stat_init(&stats->dequeue, gfp) ||
2633 blkg_stat_init(&stats->group_wait_time, gfp) ||
2634 blkg_stat_init(&stats->idle_time, gfp) ||
2635 blkg_stat_init(&stats->empty_time, gfp)) {
2636 bfqg_stats_exit(stats);
2637 return -ENOMEM;
2638 }
1701 2639
1702 bfqq->ioprio = bfqq->new_ioprio; 2640 return 0;
1703 bfqq->ioprio_class = bfqq->new_ioprio_class; 2641}
1704 2642
1705 entity->sched_data = &bfqq->bfqd->sched_data; 2643static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
2644{
2645 return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
1706} 2646}
1707 2647
1708#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 2648static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
1709#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) 2649{
2650 return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
2651}
1710 2652
1711#define bfq_sample_valid(samples) ((samples) > 80) 2653static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
2654{
2655 struct bfq_group_data *bgd;
1712 2656
1713/* 2657 bgd = kzalloc(sizeof(*bgd), gfp);
1714 * Scheduler run of queue, if there are requests pending and no one in the 2658 if (!bgd)
1715 * driver that will restart queueing. 2659 return NULL;
2660 return &bgd->pd;
2661}
2662
2663static void bfq_cpd_init(struct blkcg_policy_data *cpd)
2664{
2665 struct bfq_group_data *d = cpd_to_bfqgd(cpd);
2666
2667 d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
2668 CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
2669}
2670
2671static void bfq_cpd_free(struct blkcg_policy_data *cpd)
2672{
2673 kfree(cpd_to_bfqgd(cpd));
2674}
2675
2676static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
2677{
2678 struct bfq_group *bfqg;
2679
2680 bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
2681 if (!bfqg)
2682 return NULL;
2683
2684 if (bfqg_stats_init(&bfqg->stats, gfp)) {
2685 kfree(bfqg);
2686 return NULL;
2687 }
2688
2689 return &bfqg->pd;
2690}
2691
2692static void bfq_pd_init(struct blkg_policy_data *pd)
2693{
2694 struct blkcg_gq *blkg = pd_to_blkg(pd);
2695 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
2696 struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
2697 struct bfq_entity *entity = &bfqg->entity;
2698 struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
2699
2700 entity->orig_weight = entity->weight = entity->new_weight = d->weight;
2701 entity->my_sched_data = &bfqg->sched_data;
2702 bfqg->my_entity = entity; /*
 2703 * the root_group's my_entity will be set
 2704 * to NULL in bfq_init_queue()
2705 */
2706 bfqg->bfqd = bfqd;
2707}
2708
2709static void bfq_pd_free(struct blkg_policy_data *pd)
2710{
2711 struct bfq_group *bfqg = pd_to_bfqg(pd);
2712
2713 bfqg_stats_exit(&bfqg->stats);
2714 return kfree(bfqg);
2715}
2716
2717static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
2718{
2719 struct bfq_group *bfqg = pd_to_bfqg(pd);
2720
2721 bfqg_stats_reset(&bfqg->stats);
2722}
2723
2724static void bfq_group_set_parent(struct bfq_group *bfqg,
2725 struct bfq_group *parent)
2726{
2727 struct bfq_entity *entity;
2728
2729 entity = &bfqg->entity;
2730 entity->parent = parent->my_entity;
2731 entity->sched_data = &parent->sched_data;
2732}
2733
2734static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
2735 struct blkcg *blkcg)
2736{
2737 struct blkcg_gq *blkg;
2738
2739 blkg = blkg_lookup(blkcg, bfqd->queue);
2740 if (likely(blkg))
2741 return blkg_to_bfqg(blkg);
2742 return NULL;
2743}
2744
2745static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
2746 struct blkcg *blkcg)
2747{
2748 struct bfq_group *bfqg, *parent;
2749 struct bfq_entity *entity;
2750
2751 bfqg = bfq_lookup_bfqg(bfqd, blkcg);
2752
2753 if (unlikely(!bfqg))
2754 return NULL;
2755
2756 /*
2757 * Update chain of bfq_groups as we might be handling a leaf group
2758 * which, along with some of its relatives, has not been hooked yet
2759 * to the private hierarchy of BFQ.
2760 */
2761 entity = &bfqg->entity;
2762 for_each_entity(entity) {
2763 bfqg = container_of(entity, struct bfq_group, entity);
2764 if (bfqg != bfqd->root_group) {
2765 parent = bfqg_parent(bfqg);
2766 if (!parent)
2767 parent = bfqd->root_group;
2768 bfq_group_set_parent(bfqg, parent);
2769 }
2770 }
2771
2772 return bfqg;
2773}
2774
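
bfq_find_set_group thus walks from the looked-up group towards the root, hooking into BFQ's private hierarchy any ancestor that the cgroup code created but BFQ has not used yet. A simplified sketch of that hooking walk (all types and names here are invented for illustration):

#include <stdio.h>
#include <stddef.h>

struct toy_group {
	struct toy_group *cgroup_parent;	/* known from the cgroup tree */
	struct toy_group *sched_parent;		/* lazily hooked by the scheduler */
	const char *name;
};

/* Hook every not-yet-connected group on the path from leaf to root. */
static void toy_hook_chain(struct toy_group *leaf, struct toy_group *root)
{
	struct toy_group *g;

	for (g = leaf; g && g != root; g = g->sched_parent)
		if (!g->sched_parent)
			g->sched_parent = g->cgroup_parent ?
				g->cgroup_parent : root;
}

int main(void)
{
	struct toy_group root = { NULL, NULL, "root" };
	struct toy_group mid  = { &root, NULL, "mid" };
	struct toy_group leaf = { &mid, NULL, "leaf" };

	toy_hook_chain(&leaf, &root);
	printf("leaf -> %s -> %s\n", leaf.sched_parent->name,
	       leaf.sched_parent->sched_parent->name);
	return 0;
}
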
2775static void bfq_bfqq_expire(struct bfq_data *bfqd,
2776 struct bfq_queue *bfqq,
2777 bool compensate,
2778 enum bfqq_expiration reason);
2779
2780/**
2781 * bfq_bfqq_move - migrate @bfqq to @bfqg.
2782 * @bfqd: queue descriptor.
2783 * @bfqq: the queue to move.
2784 * @bfqg: the group to move to.
2785 *
2786 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
2787 * it on the new one. Avoid putting the entity on the old group idle tree.
2788 *
2789 * Must be called under the queue lock; the cgroup owning @bfqg must
2790 * not disappear (by now this just means that we are called under
2791 * rcu_read_lock()).
1716 */ 2792 */
1717static void bfq_schedule_dispatch(struct bfq_data *bfqd) 2793static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2794 struct bfq_group *bfqg)
1718{ 2795{
1719 if (bfqd->queued != 0) { 2796 struct bfq_entity *entity = &bfqq->entity;
1720 bfq_log(bfqd, "schedule dispatch"); 2797
1721 blk_mq_run_hw_queues(bfqd->queue, true); 2798 /* If bfqq is empty, then bfq_bfqq_expire also invokes
2799 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
2800 * from data structures related to current group. Otherwise we
2801 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
2802 * we do below.
2803 */
2804 if (bfqq == bfqd->in_service_queue)
2805 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
2806 false, BFQQE_PREEMPTED);
2807
2808 if (bfq_bfqq_busy(bfqq))
2809 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
2810 else if (entity->on_st)
2811 bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
2812 bfqg_put(bfqq_group(bfqq));
2813
2814 /*
2815 * Here we use a reference to bfqg. We don't need a refcounter
2816 * as the cgroup reference will not be dropped, so that its
2817 * destroy() callback will not be invoked.
2818 */
2819 entity->parent = bfqg->my_entity;
2820 entity->sched_data = &bfqg->sched_data;
2821 bfqg_get(bfqg);
2822
2823 if (bfq_bfqq_busy(bfqq))
2824 bfq_activate_bfqq(bfqd, bfqq);
2825
2826 if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
2827 bfq_schedule_dispatch(bfqd);
2828}
2829
2830/**
2831 * __bfq_bic_change_cgroup - move @bic to @cgroup.
2832 * @bfqd: the queue descriptor.
2833 * @bic: the bic to move.
2834 * @blkcg: the blk-cgroup to move to.
2835 *
2836 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
2837 * has to make sure that the reference to cgroup is valid across the call.
2838 *
2839 * NOTE: an alternative approach might have been to store the current
2840 * cgroup in bfqq and getting a reference to it, reducing the lookup
2841 * time here, at the price of slightly more complex code.
2842 */
2843static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
2844 struct bfq_io_cq *bic,
2845 struct blkcg *blkcg)
2846{
2847 struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
2848 struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
2849 struct bfq_group *bfqg;
2850 struct bfq_entity *entity;
2851
2852 bfqg = bfq_find_set_group(bfqd, blkcg);
2853
2854 if (unlikely(!bfqg))
2855 bfqg = bfqd->root_group;
2856
2857 if (async_bfqq) {
2858 entity = &async_bfqq->entity;
2859
2860 if (entity->sched_data != &bfqg->sched_data) {
2861 bic_set_bfqq(bic, NULL, 0);
2862 bfq_log_bfqq(bfqd, async_bfqq,
2863 "bic_change_group: %p %d",
2864 async_bfqq,
2865 async_bfqq->ref);
2866 bfq_put_queue(async_bfqq);
2867 }
2868 }
2869
2870 if (sync_bfqq) {
2871 entity = &sync_bfqq->entity;
2872 if (entity->sched_data != &bfqg->sched_data)
2873 bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
2874 }
2875
2876 return bfqg;
2877}
2878
2879static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
2880{
2881 struct bfq_data *bfqd = bic_to_bfqd(bic);
2882 struct bfq_group *bfqg = NULL;
2883 uint64_t serial_nr;
2884
2885 rcu_read_lock();
2886 serial_nr = bio_blkcg(bio)->css.serial_nr;
2887
2888 /*
2889 * Check whether blkcg has changed. The condition may trigger
 2890 * spuriously on a newly created bic but there's no harm.
2891 */
2892 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
2893 goto out;
2894
2895 bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
2896 bic->blkcg_serial_nr = serial_nr;
2897out:
2898 rcu_read_unlock();
2899}
2900
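
The serial-number check above avoids repeating the (relatively costly) regrouping work for every bio: the bic caches the css serial number of the last cgroup it was attached to, and only a mismatch triggers __bfq_bic_change_cgroup. A small sketch of that caching idea, with invented names:

#include <stdio.h>
#include <stdint.h>

struct toy_io_ctx {
	uint64_t cached_serial;		/* mirrors bic->blkcg_serial_nr */
	int moves;
};

/* Only regroup when the serial number differs from the cached one. */
static void toy_update_cgroup(struct toy_io_ctx *ctx, uint64_t serial)
{
	if (ctx->cached_serial == serial)
		return;			/* same cgroup as last time */

	ctx->moves++;			/* stand-in for the regrouping work */
	ctx->cached_serial = serial;
}

int main(void)
{
	struct toy_io_ctx ctx = { 0, 0 };

	toy_update_cgroup(&ctx, 42);
	toy_update_cgroup(&ctx, 42);	/* no-op, serial unchanged */
	toy_update_cgroup(&ctx, 43);
	printf("performed %d moves\n", ctx.moves);	/* prints 2 */
	return 0;
}
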
2901/**
2902 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
2903 * @st: the service tree being flushed.
2904 */
2905static void bfq_flush_idle_tree(struct bfq_service_tree *st)
2906{
2907 struct bfq_entity *entity = st->first_idle;
2908
2909 for (; entity ; entity = st->first_idle)
2910 __bfq_deactivate_entity(entity, false);
2911}
2912
2913/**
2914 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
2915 * @bfqd: the device data structure with the root group.
2916 * @entity: the entity to move.
2917 */
2918static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
2919 struct bfq_entity *entity)
2920{
2921 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
2922
2923 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
2924}
2925
2926/**
2927 * bfq_reparent_active_entities - move to the root group all active
2928 * entities.
2929 * @bfqd: the device data structure with the root group.
2930 * @bfqg: the group to move from.
2931 * @st: the service tree with the entities.
2932 *
2933 * Needs queue_lock to be taken and reference to be valid over the call.
2934 */
2935static void bfq_reparent_active_entities(struct bfq_data *bfqd,
2936 struct bfq_group *bfqg,
2937 struct bfq_service_tree *st)
2938{
2939 struct rb_root *active = &st->active;
2940 struct bfq_entity *entity = NULL;
2941
2942 if (!RB_EMPTY_ROOT(&st->active))
2943 entity = bfq_entity_of(rb_first(active));
2944
2945 for (; entity ; entity = bfq_entity_of(rb_first(active)))
2946 bfq_reparent_leaf_entity(bfqd, entity);
2947
2948 if (bfqg->sched_data.in_service_entity)
2949 bfq_reparent_leaf_entity(bfqd,
2950 bfqg->sched_data.in_service_entity);
2951}
2952
2953/**
2954 * bfq_pd_offline - deactivate the entity associated with @pd,
2955 * and reparent its children entities.
2956 * @pd: descriptor of the policy going offline.
2957 *
2958 * blkio already grabs the queue_lock for us, so no need to use
2959 * RCU-based magic
2960 */
2961static void bfq_pd_offline(struct blkg_policy_data *pd)
2962{
2963 struct bfq_service_tree *st;
2964 struct bfq_group *bfqg = pd_to_bfqg(pd);
2965 struct bfq_data *bfqd = bfqg->bfqd;
2966 struct bfq_entity *entity = bfqg->my_entity;
2967 unsigned long flags;
2968 int i;
2969
2970 if (!entity) /* root group */
2971 return;
2972
2973 spin_lock_irqsave(&bfqd->lock, flags);
2974 /*
2975 * Empty all service_trees belonging to this group before
2976 * deactivating the group itself.
2977 */
2978 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
2979 st = bfqg->sched_data.service_tree + i;
2980
2981 /*
2982 * The idle tree may still contain bfq_queues belonging
 2983 * to exited tasks because they never migrated to a different
2984 * cgroup from the one being destroyed now. No one else
2985 * can access them so it's safe to act without any lock.
2986 */
2987 bfq_flush_idle_tree(st);
2988
2989 /*
2990 * It may happen that some queues are still active
2991 * (busy) upon group destruction (if the corresponding
2992 * processes have been forced to terminate). We move
2993 * all the leaf entities corresponding to these queues
2994 * to the root_group.
2995 * Also, it may happen that the group has an entity
2996 * in service, which is disconnected from the active
2997 * tree: it must be moved, too.
2998 * There is no need to put the sync queues, as the
2999 * scheduler has taken no reference.
3000 */
3001 bfq_reparent_active_entities(bfqd, bfqg, st);
3002 }
3003
3004 __bfq_deactivate_entity(entity, false);
3005 bfq_put_async_queues(bfqd, bfqg);
3006
3007 spin_unlock_irqrestore(&bfqd->lock, flags);
3008 /*
3009 * @blkg is going offline and will be ignored by
3010 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
3011 * that they don't get lost. If IOs complete after this point, the
3012 * stats for them will be lost. Oh well...
3013 */
3014 bfqg_stats_xfer_dead(bfqg);
3015}
3016
3017static int bfq_io_show_weight(struct seq_file *sf, void *v)
3018{
3019 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3020 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3021 unsigned int val = 0;
3022
3023 if (bfqgd)
3024 val = bfqgd->weight;
3025
3026 seq_printf(sf, "%u\n", val);
3027
3028 return 0;
3029}
3030
3031static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
3032 struct cftype *cftype,
3033 u64 val)
3034{
3035 struct blkcg *blkcg = css_to_blkcg(css);
3036 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3037 struct blkcg_gq *blkg;
3038 int ret = -ERANGE;
3039
3040 if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
3041 return ret;
3042
3043 ret = 0;
3044 spin_lock_irq(&blkcg->lock);
3045 bfqgd->weight = (unsigned short)val;
3046 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3047 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
3048
3049 if (!bfqg)
3050 continue;
3051 /*
3052 * Setting the prio_changed flag of the entity
3053 * to 1 with new_weight == weight would re-set
3054 * the value of the weight to its ioprio mapping.
3055 * Set the flag only if necessary.
3056 */
3057 if ((unsigned short)val != bfqg->entity.new_weight) {
3058 bfqg->entity.new_weight = (unsigned short)val;
3059 /*
3060 * Make sure that the above new value has been
3061 * stored in bfqg->entity.new_weight before
3062 * setting the prio_changed flag. In fact,
3063 * this flag may be read asynchronously (in
3064 * critical sections protected by a different
3065 * lock than that held here), and finding this
3066 * flag set may cause the execution of the code
3067 * for updating parameters whose value may
3068 * depend also on bfqg->entity.new_weight (in
3069 * __bfq_entity_update_weight_prio).
3070 * This barrier makes sure that the new value
3071 * of bfqg->entity.new_weight is correctly
3072 * seen in that code.
3073 */
3074 smp_wmb();
3075 bfqg->entity.prio_changed = 1;
3076 }
3077 }
3078 spin_unlock_irq(&blkcg->lock);
3079
3080 return ret;
3081}
3082
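
The smp_wmb() in the loop above is a publish/subscribe ordering: new_weight must be globally visible before prio_changed is seen set, so that the reader in __bfq_entity_update_weight_prio never observes the flag without the new value. A userspace sketch of the same ordering using C11 release/acquire (the patch uses smp_wmb() on the writer side only; expressing both sides with release/acquire here is just a self-contained illustration, and the names are hypothetical):

#include <stdio.h>
#include <stdatomic.h>

struct toy_entity {
	unsigned short new_weight;
	atomic_int prio_changed;
};

static void toy_set_weight(struct toy_entity *e, unsigned short w)
{
	e->new_weight = w;
	/* release: publish the weight before the flag (smp_wmb()'s role) */
	atomic_store_explicit(&e->prio_changed, 1, memory_order_release);
}

static int toy_read_weight(struct toy_entity *e, unsigned short *w)
{
	/* acquire: if the flag is seen set, the published weight is seen too */
	if (!atomic_load_explicit(&e->prio_changed, memory_order_acquire))
		return 0;
	*w = e->new_weight;
	return 1;
}

int main(void)
{
	struct toy_entity e = { .new_weight = 100 };
	unsigned short w;

	atomic_init(&e.prio_changed, 0);
	toy_set_weight(&e, 500);
	if (toy_read_weight(&e, &w))
		printf("observed weight %u\n", w);
	return 0;
}
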
3083static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
3084 char *buf, size_t nbytes,
3085 loff_t off)
3086{
3087 u64 weight;
3088 /* First unsigned long found in the file is used */
3089 int ret = kstrtoull(strim(buf), 0, &weight);
3090
3091 if (ret)
3092 return ret;
3093
3094 return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
3095}
3096
3097static int bfqg_print_stat(struct seq_file *sf, void *v)
3098{
3099 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
3100 &blkcg_policy_bfq, seq_cft(sf)->private, false);
3101 return 0;
3102}
3103
3104static int bfqg_print_rwstat(struct seq_file *sf, void *v)
3105{
3106 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
3107 &blkcg_policy_bfq, seq_cft(sf)->private, true);
3108 return 0;
3109}
3110
3111static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
3112 struct blkg_policy_data *pd, int off)
3113{
3114 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
3115 &blkcg_policy_bfq, off);
3116 return __blkg_prfill_u64(sf, pd, sum);
3117}
3118
3119static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
3120 struct blkg_policy_data *pd, int off)
3121{
3122 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
3123 &blkcg_policy_bfq,
3124 off);
3125 return __blkg_prfill_rwstat(sf, pd, &sum);
3126}
3127
3128static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
3129{
3130 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3131 bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
3132 seq_cft(sf)->private, false);
3133 return 0;
3134}
3135
3136static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
3137{
3138 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3139 bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
3140 seq_cft(sf)->private, true);
3141 return 0;
3142}
3143
3144static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
3145 int off)
3146{
3147 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
3148
3149 return __blkg_prfill_u64(sf, pd, sum >> 9);
3150}
3151
3152static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
3153{
3154 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3155 bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
3156 return 0;
3157}
3158
3159static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
3160 struct blkg_policy_data *pd, int off)
3161{
3162 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
3163 offsetof(struct blkcg_gq, stat_bytes));
3164 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
3165 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
3166
3167 return __blkg_prfill_u64(sf, pd, sum >> 9);
3168}
3169
3170static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
3171{
3172 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3173 bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
3174 false);
3175 return 0;
3176}
3177
3178static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
3179 struct blkg_policy_data *pd, int off)
3180{
3181 struct bfq_group *bfqg = pd_to_bfqg(pd);
3182 u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
3183 u64 v = 0;
3184
3185 if (samples) {
3186 v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
3187 v = div64_u64(v, samples);
1722 } 3188 }
3189 __blkg_prfill_u64(sf, pd, v);
3190 return 0;
3191}
3192
3193/* print avg_queue_size */
3194static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
3195{
3196 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3197 bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
3198 0, false);
3199 return 0;
3200}
3201
3202static struct bfq_group *
3203bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
3204{
3205 int ret;
3206
3207 ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
3208 if (ret)
3209 return NULL;
3210
3211 return blkg_to_bfqg(bfqd->queue->root_blkg);
1723} 3212}
1724 3213
3214static struct cftype bfq_blkcg_legacy_files[] = {
3215 {
3216 .name = "bfq.weight",
3217 .flags = CFTYPE_NOT_ON_ROOT,
3218 .seq_show = bfq_io_show_weight,
3219 .write_u64 = bfq_io_set_weight_legacy,
3220 },
3221
3222 /* statistics, covers only the tasks in the bfqg */
3223 {
3224 .name = "bfq.time",
3225 .private = offsetof(struct bfq_group, stats.time),
3226 .seq_show = bfqg_print_stat,
3227 },
3228 {
3229 .name = "bfq.sectors",
3230 .seq_show = bfqg_print_stat_sectors,
3231 },
3232 {
3233 .name = "bfq.io_service_bytes",
3234 .private = (unsigned long)&blkcg_policy_bfq,
3235 .seq_show = blkg_print_stat_bytes,
3236 },
3237 {
3238 .name = "bfq.io_serviced",
3239 .private = (unsigned long)&blkcg_policy_bfq,
3240 .seq_show = blkg_print_stat_ios,
3241 },
3242 {
3243 .name = "bfq.io_service_time",
3244 .private = offsetof(struct bfq_group, stats.service_time),
3245 .seq_show = bfqg_print_rwstat,
3246 },
3247 {
3248 .name = "bfq.io_wait_time",
3249 .private = offsetof(struct bfq_group, stats.wait_time),
3250 .seq_show = bfqg_print_rwstat,
3251 },
3252 {
3253 .name = "bfq.io_merged",
3254 .private = offsetof(struct bfq_group, stats.merged),
3255 .seq_show = bfqg_print_rwstat,
3256 },
3257 {
3258 .name = "bfq.io_queued",
3259 .private = offsetof(struct bfq_group, stats.queued),
3260 .seq_show = bfqg_print_rwstat,
3261 },
3262
 3263 /* the same statistics which cover the bfqg and its descendants */
3264 {
3265 .name = "bfq.time_recursive",
3266 .private = offsetof(struct bfq_group, stats.time),
3267 .seq_show = bfqg_print_stat_recursive,
3268 },
3269 {
3270 .name = "bfq.sectors_recursive",
3271 .seq_show = bfqg_print_stat_sectors_recursive,
3272 },
3273 {
3274 .name = "bfq.io_service_bytes_recursive",
3275 .private = (unsigned long)&blkcg_policy_bfq,
3276 .seq_show = blkg_print_stat_bytes_recursive,
3277 },
3278 {
3279 .name = "bfq.io_serviced_recursive",
3280 .private = (unsigned long)&blkcg_policy_bfq,
3281 .seq_show = blkg_print_stat_ios_recursive,
3282 },
3283 {
3284 .name = "bfq.io_service_time_recursive",
3285 .private = offsetof(struct bfq_group, stats.service_time),
3286 .seq_show = bfqg_print_rwstat_recursive,
3287 },
3288 {
3289 .name = "bfq.io_wait_time_recursive",
3290 .private = offsetof(struct bfq_group, stats.wait_time),
3291 .seq_show = bfqg_print_rwstat_recursive,
3292 },
3293 {
3294 .name = "bfq.io_merged_recursive",
3295 .private = offsetof(struct bfq_group, stats.merged),
3296 .seq_show = bfqg_print_rwstat_recursive,
3297 },
3298 {
3299 .name = "bfq.io_queued_recursive",
3300 .private = offsetof(struct bfq_group, stats.queued),
3301 .seq_show = bfqg_print_rwstat_recursive,
3302 },
3303 {
3304 .name = "bfq.avg_queue_size",
3305 .seq_show = bfqg_print_avg_queue_size,
3306 },
3307 {
3308 .name = "bfq.group_wait_time",
3309 .private = offsetof(struct bfq_group, stats.group_wait_time),
3310 .seq_show = bfqg_print_stat,
3311 },
3312 {
3313 .name = "bfq.idle_time",
3314 .private = offsetof(struct bfq_group, stats.idle_time),
3315 .seq_show = bfqg_print_stat,
3316 },
3317 {
3318 .name = "bfq.empty_time",
3319 .private = offsetof(struct bfq_group, stats.empty_time),
3320 .seq_show = bfqg_print_stat,
3321 },
3322 {
3323 .name = "bfq.dequeue",
3324 .private = offsetof(struct bfq_group, stats.dequeue),
3325 .seq_show = bfqg_print_stat,
3326 },
3327 { } /* terminate */
3328};
3329
3330static struct cftype bfq_blkg_files[] = {
3331 {
3332 .name = "bfq.weight",
3333 .flags = CFTYPE_NOT_ON_ROOT,
3334 .seq_show = bfq_io_show_weight,
3335 .write = bfq_io_set_weight,
3336 },
3337 {} /* terminate */
3338};
3339
3340#else /* CONFIG_BFQ_GROUP_IOSCHED */
3341
3342static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
3343 struct bfq_queue *bfqq, unsigned int op) { }
3344static inline void
3345bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
3346static inline void
3347bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
3348static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
3349 uint64_t start_time, uint64_t io_start_time,
3350 unsigned int op) { }
3351static inline void
3352bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
3353 struct bfq_group *curr_bfqg) { }
3354static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
3355static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
3356static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
3357static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
3358static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
3359static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
3360
3361static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3362 struct bfq_group *bfqg) {}
3363
3364static void bfq_init_entity(struct bfq_entity *entity,
3365 struct bfq_group *bfqg)
3366{
3367 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3368
3369 entity->weight = entity->new_weight;
3370 entity->orig_weight = entity->new_weight;
3371 if (bfqq) {
3372 bfqq->ioprio = bfqq->new_ioprio;
3373 bfqq->ioprio_class = bfqq->new_ioprio_class;
3374 }
3375 entity->sched_data = &bfqg->sched_data;
3376}
3377
3378static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
3379
3380static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
3381 struct blkcg *blkcg)
3382{
3383 return bfqd->root_group;
3384}
3385
3386static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
3387{
3388 return bfqq->bfqd->root_group;
3389}
3390
3391static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
3392 int node)
3393{
3394 struct bfq_group *bfqg;
3395 int i;
3396
3397 bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
3398 if (!bfqg)
3399 return NULL;
3400
3401 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
3402 bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
3403
3404 return bfqg;
3405}
3406#endif /* CONFIG_BFQ_GROUP_IOSCHED */
3407
3408#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
3409#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
3410
3411#define bfq_sample_valid(samples) ((samples) > 80)
3412
1725/* 3413/*
 1726 * Lifted from AS - choose which of rq1 and rq2 is best served now. 3414 * Lifted from AS - choose which of rq1 and rq2 is best served now.
 1727 * We choose the request that is closest to the head right now. Distance 3415 * We choose the request that is closest to the head right now. Distance
@@ -1905,7 +3593,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
1905 entity->budget = new_budget; 3593 entity->budget = new_budget;
1906 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", 3594 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
1907 new_budget); 3595 new_budget);
1908 bfq_activate_bfqq(bfqd, bfqq); 3596 bfq_requeue_bfqq(bfqd, bfqq);
1909 } 3597 }
1910} 3598}
1911 3599
@@ -2076,6 +3764,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
2076 bfqq->ttime.last_end_request + 3764 bfqq->ttime.last_end_request +
2077 bfqd->bfq_slice_idle * 3; 3765 bfqd->bfq_slice_idle * 3;
2078 3766
3767 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
3768
2079 /* 3769 /*
2080 * Update budget and check whether bfqq may want to preempt 3770 * Update budget and check whether bfqq may want to preempt
2081 * the in-service queue. 3771 * the in-service queue.
@@ -2195,7 +3885,7 @@ static void bfq_remove_request(struct request_queue *q,
2195 bfqq->next_rq = NULL; 3885 bfqq->next_rq = NULL;
2196 3886
2197 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { 3887 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
2198 bfq_del_bfqq_busy(bfqd, bfqq, 1); 3888 bfq_del_bfqq_busy(bfqd, bfqq, false);
2199 /* 3889 /*
2200 * bfqq emptied. In normal operation, when 3890 * bfqq emptied. In normal operation, when
2201 * bfqq is empty, bfqq->entity.service and 3891 * bfqq is empty, bfqq->entity.service and
@@ -2215,6 +3905,8 @@ static void bfq_remove_request(struct request_queue *q,
2215 3905
2216 if (rq->cmd_flags & REQ_META) 3906 if (rq->cmd_flags & REQ_META)
2217 bfqq->meta_pending--; 3907 bfqq->meta_pending--;
3908
3909 bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
2218} 3910}
2219 3911
2220static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) 3912static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
@@ -2300,7 +3992,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
2300 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); 3992 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
2301 3993
2302 if (!RB_EMPTY_NODE(&rq->rb_node)) 3994 if (!RB_EMPTY_NODE(&rq->rb_node))
2303 return; 3995 goto end;
2304 spin_lock_irq(&bfqq->bfqd->lock); 3996 spin_lock_irq(&bfqq->bfqd->lock);
2305 3997
2306 /* 3998 /*
@@ -2326,6 +4018,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
2326 bfq_remove_request(q, next); 4018 bfq_remove_request(q, next);
2327 4019
2328 spin_unlock_irq(&bfqq->bfqd->lock); 4020 spin_unlock_irq(&bfqq->bfqd->lock);
4021end:
4022 bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
2329} 4023}
2330 4024
2331static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, 4025static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
@@ -2355,6 +4049,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2355 struct bfq_queue *bfqq) 4049 struct bfq_queue *bfqq)
2356{ 4050{
2357 if (bfqq) { 4051 if (bfqq) {
4052 bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
2358 bfq_mark_bfqq_budget_new(bfqq); 4053 bfq_mark_bfqq_budget_new(bfqq);
2359 bfq_clear_bfqq_fifo_expire(bfqq); 4054 bfq_clear_bfqq_fifo_expire(bfqq);
2360 4055
@@ -2441,6 +4136,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2441 bfqd->last_idling_start = ktime_get(); 4136 bfqd->last_idling_start = ktime_get();
2442 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), 4137 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
2443 HRTIMER_MODE_REL); 4138 HRTIMER_MODE_REL);
4139 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
2444} 4140}
2445 4141
2446/* 4142/*
@@ -2490,12 +4186,17 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
2490 4186
2491static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) 4187static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2492{ 4188{
2493 __bfq_bfqd_reset_in_service(bfqd);
2494
2495 if (RB_EMPTY_ROOT(&bfqq->sort_list)) 4189 if (RB_EMPTY_ROOT(&bfqq->sort_list))
2496 bfq_del_bfqq_busy(bfqd, bfqq, 1); 4190 bfq_del_bfqq_busy(bfqd, bfqq, true);
2497 else 4191 else
2498 bfq_activate_bfqq(bfqd, bfqq); 4192 bfq_requeue_bfqq(bfqd, bfqq);
4193
4194 /*
4195 * All in-service entities must have been properly deactivated
4196 * or requeued before executing the next function, which
 4197 * resets all in-service entities as no longer in service.
4198 */
4199 __bfq_bfqd_reset_in_service(bfqd);
2499} 4200}
2500 4201
2501/** 4202/**
@@ -2972,6 +4673,7 @@ check_queue:
2972 */ 4673 */
2973 bfq_clear_bfqq_wait_request(bfqq); 4674 bfq_clear_bfqq_wait_request(bfqq);
2974 hrtimer_try_to_cancel(&bfqd->idle_slice_timer); 4675 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
4676 bfqg_stats_update_idle_time(bfqq_group(bfqq));
2975 } 4677 }
2976 goto keep_queue; 4678 goto keep_queue;
2977 } 4679 }
@@ -3159,6 +4861,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3159 */ 4861 */
3160static void bfq_put_queue(struct bfq_queue *bfqq) 4862static void bfq_put_queue(struct bfq_queue *bfqq)
3161{ 4863{
4864#ifdef CONFIG_BFQ_GROUP_IOSCHED
4865 struct bfq_group *bfqg = bfqq_group(bfqq);
4866#endif
4867
3162 if (bfqq->bfqd) 4868 if (bfqq->bfqd)
3163 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", 4869 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
3164 bfqq, bfqq->ref); 4870 bfqq, bfqq->ref);
@@ -3167,7 +4873,12 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
3167 if (bfqq->ref) 4873 if (bfqq->ref)
3168 return; 4874 return;
3169 4875
4876 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
4877
3170 kmem_cache_free(bfq_pool, bfqq); 4878 kmem_cache_free(bfq_pool, bfqq);
4879#ifdef CONFIG_BFQ_GROUP_IOSCHED
4880 bfqg_put(bfqg);
4881#endif
3171} 4882}
3172 4883
3173static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) 4884static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
@@ -3323,18 +5034,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3323} 5034}
3324 5035
3325static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, 5036static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
5037 struct bfq_group *bfqg,
3326 int ioprio_class, int ioprio) 5038 int ioprio_class, int ioprio)
3327{ 5039{
3328 switch (ioprio_class) { 5040 switch (ioprio_class) {
3329 case IOPRIO_CLASS_RT: 5041 case IOPRIO_CLASS_RT:
3330 return &async_bfqq[0][ioprio]; 5042 return &bfqg->async_bfqq[0][ioprio];
3331 case IOPRIO_CLASS_NONE: 5043 case IOPRIO_CLASS_NONE:
3332 ioprio = IOPRIO_NORM; 5044 ioprio = IOPRIO_NORM;
3333 /* fall through */ 5045 /* fall through */
3334 case IOPRIO_CLASS_BE: 5046 case IOPRIO_CLASS_BE:
3335 return &async_bfqq[1][ioprio]; 5047 return &bfqg->async_bfqq[1][ioprio];
3336 case IOPRIO_CLASS_IDLE: 5048 case IOPRIO_CLASS_IDLE:
3337 return &async_idle_bfqq; 5049 return &bfqg->async_idle_bfqq;
3338 default: 5050 default:
3339 return NULL; 5051 return NULL;
3340 } 5052 }
@@ -3348,11 +5060,18 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3348 const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); 5060 const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3349 struct bfq_queue **async_bfqq = NULL; 5061 struct bfq_queue **async_bfqq = NULL;
3350 struct bfq_queue *bfqq; 5062 struct bfq_queue *bfqq;
5063 struct bfq_group *bfqg;
3351 5064
3352 rcu_read_lock(); 5065 rcu_read_lock();
3353 5066
5067 bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
5068 if (!bfqg) {
5069 bfqq = &bfqd->oom_bfqq;
5070 goto out;
5071 }
5072
3354 if (!is_sync) { 5073 if (!is_sync) {
3355 async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class, 5074 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3356 ioprio); 5075 ioprio);
3357 bfqq = *async_bfqq; 5076 bfqq = *async_bfqq;
3358 if (bfqq) 5077 if (bfqq)
@@ -3366,7 +5085,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3366 if (bfqq) { 5085 if (bfqq) {
3367 bfq_init_bfqq(bfqd, bfqq, bic, current->pid, 5086 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
3368 is_sync); 5087 is_sync);
3369 bfq_init_entity(&bfqq->entity); 5088 bfq_init_entity(&bfqq->entity, bfqg);
3370 bfq_log_bfqq(bfqd, bfqq, "allocated"); 5089 bfq_log_bfqq(bfqd, bfqq, "allocated");
3371 } else { 5090 } else {
3372 bfqq = &bfqd->oom_bfqq; 5091 bfqq = &bfqd->oom_bfqq;
@@ -3379,9 +5098,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3379 * prune it. 5098 * prune it.
3380 */ 5099 */
3381 if (async_bfqq) { 5100 if (async_bfqq) {
3382 bfqq->ref++; 5101 bfqq->ref++; /*
3383 bfq_log_bfqq(bfqd, bfqq, 5102 * Extra group reference, w.r.t. sync
3384 "get_queue, bfqq not in async: %p, %d", 5103 * queue. This extra reference is removed
5104 * only if bfqq->bfqg disappears, to
5105 * guarantee that this queue is not freed
5106 * until its group goes away.
5107 */
5108 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3385 bfqq, bfqq->ref); 5109 bfqq, bfqq->ref);
3386 *async_bfqq = bfqq; 5110 *async_bfqq = bfqq;
3387 } 5111 }
@@ -3516,6 +5240,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3516 */ 5240 */
3517 bfq_clear_bfqq_wait_request(bfqq); 5241 bfq_clear_bfqq_wait_request(bfqq);
3518 hrtimer_try_to_cancel(&bfqd->idle_slice_timer); 5242 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
5243 bfqg_stats_update_idle_time(bfqq_group(bfqq));
3519 5244
3520 /* 5245 /*
3521 * The queue is not empty, because a new request just 5246 * The queue is not empty, because a new request just
@@ -3657,6 +5382,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
3657 struct bfq_queue *bfqq = RQ_BFQQ(rq); 5382 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3658 struct bfq_data *bfqd = bfqq->bfqd; 5383 struct bfq_data *bfqd = bfqq->bfqd;
3659 5384
5385 if (rq->rq_flags & RQF_STARTED)
5386 bfqg_stats_update_completion(bfqq_group(bfqq),
5387 rq_start_time_ns(rq),
5388 rq_io_start_time_ns(rq),
5389 rq->cmd_flags);
3660 5390
3661 if (likely(rq->rq_flags & RQF_STARTED)) { 5391 if (likely(rq->rq_flags & RQF_STARTED)) {
3662 unsigned long flags; 5392 unsigned long flags;
@@ -3707,6 +5437,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
3707 if (!bic) 5437 if (!bic)
3708 goto queue_fail; 5438 goto queue_fail;
3709 5439
5440 bfq_bic_update_cgroup(bic, bio);
5441
3710 bfqq = bic_to_bfqq(bic, is_sync); 5442 bfqq = bic_to_bfqq(bic, is_sync);
3711 if (!bfqq || bfqq == &bfqd->oom_bfqq) { 5443 if (!bfqq || bfqq == &bfqd->oom_bfqq) {
3712 if (bfqq) 5444 if (bfqq)
@@ -3803,6 +5535,8 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3803 5535
3804 bfq_log(bfqd, "put_async_bfqq: %p", bfqq); 5536 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3805 if (bfqq) { 5537 if (bfqq) {
5538 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
5539
3806 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", 5540 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3807 bfqq, bfqq->ref); 5541 bfqq, bfqq->ref);
3808 bfq_put_queue(bfqq); 5542 bfq_put_queue(bfqq);
@@ -3811,18 +5545,20 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3811} 5545}
3812 5546
3813/* 5547/*
3814 * Release the extra reference of the async queues as the device 5548 * Release all the bfqg references to its async queues. If we are
3815 * goes away. 5549 * deallocating the group these queues may still contain requests, so
5550 * we reparent them to the root cgroup (i.e., the only one that will
5551 * exist for sure until all the requests on a device are gone).
3816 */ 5552 */
3817static void bfq_put_async_queues(struct bfq_data *bfqd) 5553static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3818{ 5554{
3819 int i, j; 5555 int i, j;
3820 5556
3821 for (i = 0; i < 2; i++) 5557 for (i = 0; i < 2; i++)
3822 for (j = 0; j < IOPRIO_BE_NR; j++) 5558 for (j = 0; j < IOPRIO_BE_NR; j++)
3823 __bfq_put_async_bfqq(bfqd, &bfqd->async_bfqq[i][j]); 5559 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3824 5560
3825 __bfq_put_async_bfqq(bfqd, &bfqd->async_idle_bfqq); 5561 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3826} 5562}
3827 5563
3828static void bfq_exit_queue(struct elevator_queue *e) 5564static void bfq_exit_queue(struct elevator_queue *e)
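bfq_put_async_queues() is what eventually drops those cached group references, and the rewritten comment explains why the queues are first reparented: they may still hold requests, and only the root group is guaranteed to outlive them. A minimal sketch of the reparent-then-release order, using a plain linked list and invented names in place of BFQ entities:

/* Sketch of the reparent-then-release order described above; not kernel
 * code, a singly linked list stands in for the BFQ entity hierarchy.
 */
#include <stdio.h>

struct demo_queue {
        const char *name;
        struct demo_queue *next;
};

struct demo_grp {
        struct demo_queue *queues;      /* queues possibly holding requests */
};

static void demo_reparent_all(struct demo_grp *from, struct demo_grp *root)
{
        while (from->queues) {
                struct demo_queue *q = from->queues;

                from->queues = q->next;         /* unlink from dying group */
                q->next = root->queues;         /* relink under the root */
                root->queues = q;
                printf("reparented %s to root\n", q->name);
        }
}

int main(void)
{
        struct demo_queue be0 = { "async BE/0", NULL };
        struct demo_grp root = { NULL };
        struct demo_grp grp = { &be0 };

        demo_reparent_all(&grp, &root); /* grp can now be torn down safely */
        return 0;
}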
@@ -3834,20 +5570,42 @@ static void bfq_exit_queue(struct elevator_queue *e)
3834 5570
3835 spin_lock_irq(&bfqd->lock); 5571 spin_lock_irq(&bfqd->lock);
3836 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) 5572 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3837 bfq_deactivate_bfqq(bfqd, bfqq, false); 5573 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
3838 bfq_put_async_queues(bfqd);
3839 spin_unlock_irq(&bfqd->lock); 5574 spin_unlock_irq(&bfqd->lock);
3840 5575
3841 hrtimer_cancel(&bfqd->idle_slice_timer); 5576 hrtimer_cancel(&bfqd->idle_slice_timer);
3842 5577
5578#ifdef CONFIG_BFQ_GROUP_IOSCHED
5579 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
5580#else
5581 spin_lock_irq(&bfqd->lock);
5582 bfq_put_async_queues(bfqd, bfqd->root_group);
5583 kfree(bfqd->root_group);
5584 spin_unlock_irq(&bfqd->lock);
5585#endif
5586
3843 kfree(bfqd); 5587 kfree(bfqd);
3844} 5588}
3845 5589
5590static void bfq_init_root_group(struct bfq_group *root_group,
5591 struct bfq_data *bfqd)
5592{
5593 int i;
5594
5595#ifdef CONFIG_BFQ_GROUP_IOSCHED
5596 root_group->entity.parent = NULL;
5597 root_group->my_entity = NULL;
5598 root_group->bfqd = bfqd;
5599#endif
5600 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
5601 root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
5602 root_group->sched_data.bfq_class_idle_last_service = jiffies;
5603}
5604
3846static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) 5605static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3847{ 5606{
3848 struct bfq_data *bfqd; 5607 struct bfq_data *bfqd;
3849 struct elevator_queue *eq; 5608 struct elevator_queue *eq;
3850 int i;
3851 5609
3852 eq = elevator_alloc(q, e); 5610 eq = elevator_alloc(q, e);
3853 if (!eq) 5611 if (!eq)
@@ -3860,6 +5618,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3860 } 5618 }
3861 eq->elevator_data = bfqd; 5619 eq->elevator_data = bfqd;
3862 5620
5621 spin_lock_irq(q->queue_lock);
5622 q->elevator = eq;
5623 spin_unlock_irq(q->queue_lock);
5624
3863 /* 5625 /*
3864 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 5626 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3865 * Grab a permanent reference to it, so that the normal code flow 5627 * Grab a permanent reference to it, so that the normal code flow
@@ -3880,8 +5642,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3880 5642
3881 bfqd->queue = q; 5643 bfqd->queue = q;
3882 5644
3883 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) 5645 INIT_LIST_HEAD(&bfqd->dispatch);
3884 bfqd->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
3885 5646
3886 hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, 5647 hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
3887 HRTIMER_MODE_REL); 5648 HRTIMER_MODE_REL);
@@ -3899,17 +5660,40 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3899 bfqd->bfq_back_max = bfq_back_max; 5660 bfqd->bfq_back_max = bfq_back_max;
3900 bfqd->bfq_back_penalty = bfq_back_penalty; 5661 bfqd->bfq_back_penalty = bfq_back_penalty;
3901 bfqd->bfq_slice_idle = bfq_slice_idle; 5662 bfqd->bfq_slice_idle = bfq_slice_idle;
3902 bfqd->bfq_class_idle_last_service = 0;
3903 bfqd->bfq_timeout = bfq_timeout; 5663 bfqd->bfq_timeout = bfq_timeout;
3904 5664
3905 bfqd->bfq_requests_within_timer = 120; 5665 bfqd->bfq_requests_within_timer = 120;
3906 5666
3907 spin_lock_init(&bfqd->lock); 5667 spin_lock_init(&bfqd->lock);
3908 INIT_LIST_HEAD(&bfqd->dispatch);
3909 5668
3910 q->elevator = eq; 5669 /*
5670 * The invocation of the next bfq_create_group_hierarchy
5671 * function is the head of a chain of function calls
5672 * (bfq_create_group_hierarchy->blkcg_activate_policy->
5673 * blk_mq_freeze_queue) that may lead to the invocation of the
5674 * has_work hook function. For this reason,
5675 * bfq_create_group_hierarchy is invoked only after all
5676 * scheduler data has been initialized, apart from the fields
5677 * that can be initialized only after invoking
5678 * bfq_create_group_hierarchy. This, in particular, enables
5679 * has_work to correctly return false. Of course, to avoid
5680 * other inconsistencies, the blk-mq stack must then refrain
5681 * from invoking further scheduler hooks before this init
5682 * function is finished.
5683 */
5684 bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
5685 if (!bfqd->root_group)
5686 goto out_free;
5687 bfq_init_root_group(bfqd->root_group, bfqd);
5688 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
5689
3911 5690
3912 return 0; 5691 return 0;
5692
5693out_free:
5694 kfree(bfqd);
5695 kobject_put(&eq->kobj);
5696 return -ENOMEM;
3913} 5697}
3914 5698
3915static void bfq_slab_kill(void) 5699static void bfq_slab_kill(void)
@@ -4134,10 +5918,34 @@ static struct elevator_type iosched_bfq_mq = {
4134 .elevator_owner = THIS_MODULE, 5918 .elevator_owner = THIS_MODULE,
4135}; 5919};
4136 5920
5921#ifdef CONFIG_BFQ_GROUP_IOSCHED
5922static struct blkcg_policy blkcg_policy_bfq = {
5923 .dfl_cftypes = bfq_blkg_files,
5924 .legacy_cftypes = bfq_blkcg_legacy_files,
5925
5926 .cpd_alloc_fn = bfq_cpd_alloc,
5927 .cpd_init_fn = bfq_cpd_init,
5928 .cpd_bind_fn = bfq_cpd_init,
5929 .cpd_free_fn = bfq_cpd_free,
5930
5931 .pd_alloc_fn = bfq_pd_alloc,
5932 .pd_init_fn = bfq_pd_init,
5933 .pd_offline_fn = bfq_pd_offline,
5934 .pd_free_fn = bfq_pd_free,
5935 .pd_reset_stats_fn = bfq_pd_reset_stats,
5936};
5937#endif
5938
4137static int __init bfq_init(void) 5939static int __init bfq_init(void)
4138{ 5940{
4139 int ret; 5941 int ret;
4140 5942
5943#ifdef CONFIG_BFQ_GROUP_IOSCHED
5944 ret = blkcg_policy_register(&blkcg_policy_bfq);
5945 if (ret)
5946 return ret;
5947#endif
5948
4141 ret = -ENOMEM; 5949 ret = -ENOMEM;
4142 if (bfq_slab_setup()) 5950 if (bfq_slab_setup())
4143 goto err_pol_unreg; 5951 goto err_pol_unreg;
@@ -4149,12 +5957,18 @@ static int __init bfq_init(void)
4149 return 0; 5957 return 0;
4150 5958
4151err_pol_unreg: 5959err_pol_unreg:
5960#ifdef CONFIG_BFQ_GROUP_IOSCHED
5961 blkcg_policy_unregister(&blkcg_policy_bfq);
5962#endif
4152 return ret; 5963 return ret;
4153} 5964}
4154 5965
4155static void __exit bfq_exit(void) 5966static void __exit bfq_exit(void)
4156{ 5967{
4157 elv_unregister(&iosched_bfq_mq); 5968 elv_unregister(&iosched_bfq_mq);
5969#ifdef CONFIG_BFQ_GROUP_IOSCHED
5970 blkcg_policy_unregister(&blkcg_policy_bfq);
5971#endif
4158 bfq_slab_kill(); 5972 bfq_slab_kill();
4159} 5973}
4160 5974
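The init/exit changes follow the usual register-and-unwind discipline: the blkcg policy is registered before anything else, a later failure unwinds the steps taken so far, and module exit tears down in an order mirroring bfq_exit() above (elevator, then policy, then slab). A self-contained sketch of that pattern with invented step functions:

/* Sketch of the register/unwind pattern used by bfq_init()/bfq_exit();
 * all step functions here are invented placeholders.
 */
#include <stdio.h>

static int  policy_register(void)     { printf("policy registered\n");     return 0; }
static void policy_unregister(void)   { printf("policy unregistered\n");   }
static int  slab_setup(void)          { printf("slab ready\n");            return 0; }
static void slab_kill(void)           { printf("slab destroyed\n");        }
static int  elv_register_demo(void)   { printf("elevator registered\n");   return 0; }
static void elv_unregister_demo(void) { printf("elevator unregistered\n"); }

static int demo_init(void)
{
        int ret;

        ret = policy_register();
        if (ret)
                return ret;

        ret = slab_setup();
        if (ret)
                goto err_pol_unreg;

        ret = elv_register_demo();
        if (ret)
                goto err_slab_kill;

        return 0;

err_slab_kill:
        slab_kill();
err_pol_unreg:
        policy_unregister();
        return ret;
}

static void demo_exit(void)
{
        elv_unregister_demo();
        policy_unregister();
        slab_kill();
}

int main(void)
{
        if (demo_init())
                return 1;
        demo_exit();
        return 0;
}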