Diffstat (limited to 'block/cfq-iosched.c')
-rw-r--r--  block/cfq-iosched.c  |  624
1 file changed, 381 insertions(+), 243 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9eba291eb6fd..ae21919f15e1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
55 55
56#define RQ_CIC(rq) \ 56#define RQ_CIC(rq) \
57 ((struct cfq_io_context *) (rq)->elevator_private) 57 ((struct cfq_io_context *) (rq)->elevator_private[0])
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool; 62static struct kmem_cache *cfq_ioc_pool;
@@ -87,7 +87,6 @@ struct cfq_rb_root {
87 unsigned count; 87 unsigned count;
88 unsigned total_weight; 88 unsigned total_weight;
89 u64 min_vdisktime; 89 u64 min_vdisktime;
90 struct rb_node *active;
91}; 90};
92#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 91#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
93 .count = 0, .min_vdisktime = 0, } 92 .count = 0, .min_vdisktime = 0, }
@@ -97,7 +96,7 @@ struct cfq_rb_root {
97 */ 96 */
98struct cfq_queue { 97struct cfq_queue {
99 /* reference count */ 98 /* reference count */
100 atomic_t ref; 99 int ref;
101 /* various state flags, see below */ 100 /* various state flags, see below */
102 unsigned int flags; 101 unsigned int flags;
103 /* parent cfq_data */ 102 /* parent cfq_data */
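
Note on the ref change above (and the matching cfq_group change below): a plain-int reference count is only correct when some outer lock serializes every get and put, here the queue lock, so the atomic_t machinery buys nothing. A minimal userspace sketch of that pattern, with a hypothetical pthread mutex standing in for queue_lock:

#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

/* Stand-in for the queue_lock that serializes all get/put calls. */
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

struct obj {
        int ref;                /* plain int: only touched under queue_lock */
};

static void obj_get(struct obj *o)
{
        pthread_mutex_lock(&queue_lock);
        o->ref++;
        pthread_mutex_unlock(&queue_lock);
}

static void obj_put(struct obj *o)
{
        int last;

        pthread_mutex_lock(&queue_lock);
        assert(o->ref > 0);
        last = (--o->ref == 0);
        pthread_mutex_unlock(&queue_lock);

        if (last)
                free(o);        /* nobody else can still hold a reference */
}

int main(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        o->ref = 1;             /* creator's reference */
        obj_get(o);
        obj_put(o);
        obj_put(o);             /* drops the last reference and frees */
        return 0;
}
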
@@ -147,7 +146,6 @@ struct cfq_queue {
147 struct cfq_rb_root *service_tree; 146 struct cfq_rb_root *service_tree;
148 struct cfq_queue *new_cfqq; 147 struct cfq_queue *new_cfqq;
149 struct cfq_group *cfqg; 148 struct cfq_group *cfqg;
150 struct cfq_group *orig_cfqg;
151 /* Number of sectors dispatched from queue in single dispatch round */ 149 /* Number of sectors dispatched from queue in single dispatch round */
152 unsigned long nr_sectors; 150 unsigned long nr_sectors;
153}; 151};
@@ -160,6 +158,7 @@ enum wl_prio_t {
160 BE_WORKLOAD = 0, 158 BE_WORKLOAD = 0,
161 RT_WORKLOAD = 1, 159 RT_WORKLOAD = 1,
162 IDLE_WORKLOAD = 2, 160 IDLE_WORKLOAD = 2,
161 CFQ_PRIO_NR,
163}; 162};
164 163
165/* 164/*
@@ -179,15 +178,25 @@ struct cfq_group {
179 /* group service_tree key */ 178 /* group service_tree key */
180 u64 vdisktime; 179 u64 vdisktime;
181 unsigned int weight; 180 unsigned int weight;
182 bool on_st; 181 unsigned int new_weight;
182 bool needs_update;
183 183
184 /* number of cfqq currently on this group */ 184 /* number of cfqq currently on this group */
185 int nr_cfqq; 185 int nr_cfqq;
186 186
187 /* Per group busy queus average. Useful for workload slice calc. */
188 unsigned int busy_queues_avg[2];
189 /* 187 /*
190 * rr lists of queues with requests, onle rr for each priority class. 188 * Per group busy queues average. Useful for workload slice calc. We
189 * create the array for each prio class but at run time it is used
190 * only for RT and BE class and slot for IDLE class remains unused.
191 * This is primarily done to avoid confusion and a gcc warning.
192 */
193 unsigned int busy_queues_avg[CFQ_PRIO_NR];
194 /*
195 * rr lists of queues with requests. We maintain service trees for
196 * RT and BE classes. These trees are subdivided in subclasses
197 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
198 * class there is no subclassification and all the cfq queues go on
199 * a single tree service_tree_idle.
191 * Counts are embedded in the cfq_rb_root 200 * Counts are embedded in the cfq_rb_root
192 */ 201 */
193 struct cfq_rb_root service_trees[2][3]; 202 struct cfq_rb_root service_trees[2][3];
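
The new CFQ_PRIO_NR enumerator above exists so busy_queues_avg[] can be sized by the number of priority classes; as the comment says, the IDLE slot stays allocated but unused, which avoids special-casing the index and a gcc warning. A tiny illustration of that enum-as-count idiom (names are invented):

#include <stdio.h>

/* The last enumerator doubles as the element count, so the array
 * below automatically grows if a new class is ever added. */
enum wl_class {
        CLASS_BE = 0,
        CLASS_RT,
        CLASS_IDLE,
        CLASS_NR,               /* number of classes, not a real class */
};

static unsigned int busy_avg[CLASS_NR]; /* IDLE slot allocated but unused */

int main(void)
{
        busy_avg[CLASS_RT] = 3;
        busy_avg[CLASS_BE] = 5;
        printf("RT=%u BE=%u (array has %zu slots)\n",
               busy_avg[CLASS_RT], busy_avg[CLASS_BE],
               sizeof(busy_avg) / sizeof(busy_avg[0]));
        return 0;
}
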
@@ -199,7 +208,7 @@ struct cfq_group {
199 struct blkio_group blkg; 208 struct blkio_group blkg;
200#ifdef CONFIG_CFQ_GROUP_IOSCHED 209#ifdef CONFIG_CFQ_GROUP_IOSCHED
201 struct hlist_node cfqd_node; 210 struct hlist_node cfqd_node;
202 atomic_t ref; 211 int ref;
203#endif 212#endif
204 /* number of requests that are on the dispatch list or inside driver */ 213 /* number of requests that are on the dispatch list or inside driver */
205 int dispatched; 214 int dispatched;
@@ -221,7 +230,6 @@ struct cfq_data {
221 enum wl_type_t serving_type; 230 enum wl_type_t serving_type;
222 unsigned long workload_expires; 231 unsigned long workload_expires;
223 struct cfq_group *serving_group; 232 struct cfq_group *serving_group;
224 bool noidle_tree_requires_idle;
225 233
226 /* 234 /*
227 * Each priority tree is sorted by next_request position. These 235 * Each priority tree is sorted by next_request position. These
@@ -231,6 +239,7 @@ struct cfq_data {
231 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 239 struct rb_root prio_trees[CFQ_PRIO_LISTS];
232 240
233 unsigned int busy_queues; 241 unsigned int busy_queues;
242 unsigned int busy_sync_queues;
234 243
235 int rq_in_driver; 244 int rq_in_driver;
236 int rq_in_flight[2]; 245 int rq_in_flight[2];
@@ -278,7 +287,6 @@ struct cfq_data {
278 unsigned int cfq_slice_idle; 287 unsigned int cfq_slice_idle;
279 unsigned int cfq_group_idle; 288 unsigned int cfq_group_idle;
280 unsigned int cfq_latency; 289 unsigned int cfq_latency;
281 unsigned int cfq_group_isolation;
282 290
283 unsigned int cic_index; 291 unsigned int cic_index;
284 struct list_head cic_list; 292 struct list_head cic_list;
@@ -292,7 +300,9 @@ struct cfq_data {
292 300
293 /* List of cfq groups being managed on this device*/ 301 /* List of cfq groups being managed on this device*/
294 struct hlist_head cfqg_list; 302 struct hlist_head cfqg_list;
295 struct rcu_head rcu; 303
304 /* Number of groups which are on blkcg->blkg_list */
305 unsigned int nr_blkcg_linked_grps;
296}; 306};
297 307
298static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 308static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -359,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy);
359#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 369#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
360 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 370 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
361 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 371 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
362 blkg_path(&(cfqq)->cfqg->blkg), ##args); 372 blkg_path(&(cfqq)->cfqg->blkg), ##args)
363 373
364#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 374#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
365 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 375 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
366 blkg_path(&(cfqg)->blkg), ##args); \ 376 blkg_path(&(cfqg)->blkg), ##args) \
367 377
368#else 378#else
369#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 379#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
370 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 380 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
371#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 381#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
372#endif 382#endif
373#define cfq_log(cfqd, fmt, args...) \ 383#define cfq_log(cfqd, fmt, args...) \
374 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 384 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
@@ -494,13 +504,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
494 } 504 }
495} 505}
496 506
497static int cfq_queue_empty(struct request_queue *q)
498{
499 struct cfq_data *cfqd = q->elevator->elevator_data;
500
501 return !cfqd->rq_queued;
502}
503
504/* 507/*
505 * Scale schedule slice based on io priority. Use the sync time slice only 508 * Scale schedule slice based on io priority. Use the sync time slice only
506 * if a queue is marked sync and has sync io queued. A sync queue with async 509 * if a queue is marked sync and has sync io queued. A sync queue with async
@@ -551,20 +554,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
551 554
552static void update_min_vdisktime(struct cfq_rb_root *st) 555static void update_min_vdisktime(struct cfq_rb_root *st)
553{ 556{
554 u64 vdisktime = st->min_vdisktime;
555 struct cfq_group *cfqg; 557 struct cfq_group *cfqg;
556 558
557 if (st->active) {
558 cfqg = rb_entry_cfqg(st->active);
559 vdisktime = cfqg->vdisktime;
560 }
561
562 if (st->left) { 559 if (st->left) {
563 cfqg = rb_entry_cfqg(st->left); 560 cfqg = rb_entry_cfqg(st->left);
564 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 561 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
562 cfqg->vdisktime);
565 } 563 }
566
567 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
568} 564}
569 565
570/* 566/*
@@ -596,8 +592,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
596 return cfq_target_latency * cfqg->weight / st->total_weight; 592 return cfq_target_latency * cfqg->weight / st->total_weight;
597} 593}
598 594
599static inline void 595static inline unsigned
600cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 596cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
601{ 597{
602 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 598 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
603 if (cfqd->cfq_latency) { 599 if (cfqd->cfq_latency) {
@@ -623,6 +619,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
623 low_slice); 619 low_slice);
624 } 620 }
625 } 621 }
622 return slice;
623}
624
625static inline void
626cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
627{
628 unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
629
626 cfqq->slice_start = jiffies; 630 cfqq->slice_start = jiffies;
627 cfqq->slice_end = jiffies + slice; 631 cfqq->slice_end = jiffies + slice;
628 cfqq->allocated_slice = slice; 632 cfqq->allocated_slice = slice;
@@ -637,11 +641,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
637static inline bool cfq_slice_used(struct cfq_queue *cfqq) 641static inline bool cfq_slice_used(struct cfq_queue *cfqq)
638{ 642{
639 if (cfq_cfqq_slice_new(cfqq)) 643 if (cfq_cfqq_slice_new(cfqq))
640 return 0; 644 return false;
641 if (time_before(jiffies, cfqq->slice_end)) 645 if (time_before(jiffies, cfqq->slice_end))
642 return 0; 646 return false;
643 647
644 return 1; 648 return true;
645} 649}
646 650
647/* 651/*
@@ -663,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
663 if (rq2 == NULL) 667 if (rq2 == NULL)
664 return rq1; 668 return rq1;
665 669
666 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 670 if (rq_is_sync(rq1) != rq_is_sync(rq2))
667 return rq1; 671 return rq_is_sync(rq1) ? rq1 : rq2;
668 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 672
669 return rq2; 673 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
670 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) 674 return rq1->cmd_flags & REQ_META ? rq1 : rq2;
671 return rq1;
672 else if ((rq2->cmd_flags & REQ_META) &&
673 !(rq1->cmd_flags & REQ_META))
674 return rq2;
675 675
676 s1 = blk_rq_pos(rq1); 676 s1 = blk_rq_pos(rq1);
677 s2 = blk_rq_pos(rq2); 677 s2 = blk_rq_pos(rq2);
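
The rewritten cfq_choose_req() above collapses two symmetric if/else pairs into one test each: rq_is_sync(rq1) != rq_is_sync(rq2) means exactly one request is sync, and (rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META means exactly one carries REQ_META. A standalone sketch of the XOR tie-break idiom, with an invented flag value:

#include <assert.h>
#include <stdio.h>

#define FLAG_META (1u << 3)     /* made-up bit, standing in for REQ_META */

struct req {
        const char *name;
        unsigned int flags;
};

/* Prefer the request carrying the flag, but only when exactly one of
 * them does; when both or neither have it, return NULL so the caller
 * can apply its next tie-breaker. */
static struct req *prefer_flagged(struct req *a, struct req *b)
{
        if ((a->flags ^ b->flags) & FLAG_META)
                return (a->flags & FLAG_META) ? a : b;
        return NULL;
}

int main(void)
{
        struct req r1 = { "r1", FLAG_META }, r2 = { "r2", 0 };

        assert(prefer_flagged(&r1, &r2) == &r1);
        assert(prefer_flagged(&r2, &r1) == &r1);
        r2.flags = FLAG_META;
        assert(prefer_flagged(&r1, &r2) == NULL);  /* both flagged: tie */
        puts("xor tie-break ok");
        return 0;
}
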
@@ -853,20 +853,40 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
853} 853}
854 854
855static void 855static void
856cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 856cfq_update_group_weight(struct cfq_group *cfqg)
857{
858 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
859 if (cfqg->needs_update) {
860 cfqg->weight = cfqg->new_weight;
861 cfqg->needs_update = false;
862 }
863}
864
865static void
866cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
867{
868 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
869
870 cfq_update_group_weight(cfqg);
871 __cfq_group_service_tree_add(st, cfqg);
872 st->total_weight += cfqg->weight;
873}
874
875static void
876cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
857{ 877{
858 struct cfq_rb_root *st = &cfqd->grp_service_tree; 878 struct cfq_rb_root *st = &cfqd->grp_service_tree;
859 struct cfq_group *__cfqg; 879 struct cfq_group *__cfqg;
860 struct rb_node *n; 880 struct rb_node *n;
861 881
862 cfqg->nr_cfqq++; 882 cfqg->nr_cfqq++;
863 if (cfqg->on_st) 883 if (!RB_EMPTY_NODE(&cfqg->rb_node))
864 return; 884 return;
865 885
866 /* 886 /*
867 * Currently put the group at the end. Later implement something 887 * Currently put the group at the end. Later implement something
868 * so that groups get lesser vtime based on their weights, so that 888 * so that groups get lesser vtime based on their weights, so that
869 * if group does not loose all if it was not continously backlogged. 889 * if group does not loose all if it was not continuously backlogged.
870 */ 890 */
871 n = rb_last(&st->rb); 891 n = rb_last(&st->rb);
872 if (n) { 892 if (n) {
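
cfq_update_group_weight() above shows why a weight change coming from the cgroup side is only recorded (new_weight plus needs_update) and applied later: the weight feeds the group's service-tree position and total_weight, so it may only change while the group is off the tree, which the BUG_ON in cfq_group_service_tree_add() enforces. A stripped-down sketch of that defer-then-apply pattern, with invented names:

#include <stdbool.h>
#include <stdio.h>

struct group {
        unsigned int weight;            /* value the scheduler actually uses */
        unsigned int new_weight;        /* value requested asynchronously */
        bool needs_update;
};

/* Called from the "cgroup" side at any time: just record the wish. */
static void request_weight(struct group *g, unsigned int w)
{
        g->new_weight = w;
        g->needs_update = true;
}

/* Called only while the group is off the tree, right before re-adding
 * it, so total_weight and the tree key never go stale. */
static void tree_add(struct group *g, unsigned int *total_weight)
{
        if (g->needs_update) {
                g->weight = g->new_weight;
                g->needs_update = false;
        }
        *total_weight += g->weight;
}

static void tree_del(struct group *g, unsigned int *total_weight)
{
        *total_weight -= g->weight;
}

int main(void)
{
        struct group g = { .weight = 500 };
        unsigned int total = 0;

        tree_add(&g, &total);
        request_weight(&g, 200);        /* not applied: group is on tree */
        tree_del(&g, &total);           /* total drops by the old weight */
        tree_add(&g, &total);           /* new weight takes effect here */
        printf("weight=%u total=%u\n", g.weight, total);
        return 0;
}
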
@@ -874,20 +894,22 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
874 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 894 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
875 } else 895 } else
876 cfqg->vdisktime = st->min_vdisktime; 896 cfqg->vdisktime = st->min_vdisktime;
897 cfq_group_service_tree_add(st, cfqg);
898}
877 899
878 __cfq_group_service_tree_add(st, cfqg); 900static void
879 cfqg->on_st = true; 901cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
880 st->total_weight += cfqg->weight; 902{
903 st->total_weight -= cfqg->weight;
904 if (!RB_EMPTY_NODE(&cfqg->rb_node))
905 cfq_rb_erase(&cfqg->rb_node, st);
881} 906}
882 907
883static void 908static void
884cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 909cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
885{ 910{
886 struct cfq_rb_root *st = &cfqd->grp_service_tree; 911 struct cfq_rb_root *st = &cfqd->grp_service_tree;
887 912
888 if (st->active == &cfqg->rb_node)
889 st->active = NULL;
890
891 BUG_ON(cfqg->nr_cfqq < 1); 913 BUG_ON(cfqg->nr_cfqq < 1);
892 cfqg->nr_cfqq--; 914 cfqg->nr_cfqq--;
893 915
@@ -896,15 +918,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
896 return; 918 return;
897 919
898 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 920 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
899 cfqg->on_st = false; 921 cfq_group_service_tree_del(st, cfqg);
900 st->total_weight -= cfqg->weight;
901 if (!RB_EMPTY_NODE(&cfqg->rb_node))
902 cfq_rb_erase(&cfqg->rb_node, st);
903 cfqg->saved_workload_slice = 0; 922 cfqg->saved_workload_slice = 0;
904 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 923 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
905} 924}
906 925
907static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 926static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
927 unsigned int *unaccounted_time)
908{ 928{
909 unsigned int slice_used; 929 unsigned int slice_used;
910 930
@@ -923,8 +943,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
923 1); 943 1);
924 } else { 944 } else {
925 slice_used = jiffies - cfqq->slice_start; 945 slice_used = jiffies - cfqq->slice_start;
926 if (slice_used > cfqq->allocated_slice) 946 if (slice_used > cfqq->allocated_slice) {
947 *unaccounted_time = slice_used - cfqq->allocated_slice;
927 slice_used = cfqq->allocated_slice; 948 slice_used = cfqq->allocated_slice;
949 }
950 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
951 *unaccounted_time += cfqq->slice_start -
952 cfqq->dispatch_start;
928 } 953 }
929 954
930 return slice_used; 955 return slice_used;
@@ -934,12 +959,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
934 struct cfq_queue *cfqq) 959 struct cfq_queue *cfqq)
935{ 960{
936 struct cfq_rb_root *st = &cfqd->grp_service_tree; 961 struct cfq_rb_root *st = &cfqd->grp_service_tree;
937 unsigned int used_sl, charge; 962 unsigned int used_sl, charge, unaccounted_sl = 0;
938 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 963 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
939 - cfqg->service_tree_idle.count; 964 - cfqg->service_tree_idle.count;
940 965
941 BUG_ON(nr_sync < 0); 966 BUG_ON(nr_sync < 0);
942 used_sl = charge = cfq_cfqq_slice_usage(cfqq); 967 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
943 968
944 if (iops_mode(cfqd)) 969 if (iops_mode(cfqd))
945 charge = cfqq->slice_dispatch; 970 charge = cfqq->slice_dispatch;
@@ -947,9 +972,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
947 charge = cfqq->allocated_slice; 972 charge = cfqq->allocated_slice;
948 973
949 /* Can't update vdisktime while group is on service tree */ 974 /* Can't update vdisktime while group is on service tree */
950 cfq_rb_erase(&cfqg->rb_node, st); 975 cfq_group_service_tree_del(st, cfqg);
951 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 976 cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
952 __cfq_group_service_tree_add(st, cfqg); 977 /* If a new weight was requested, update now, off tree */
978 cfq_group_service_tree_add(st, cfqg);
953 979
954 /* This group is being expired. Save the context */ 980 /* This group is being expired. Save the context */
955 if (time_after(cfqd->workload_expires, jiffies)) { 981 if (time_after(cfqd->workload_expires, jiffies)) {
@@ -962,10 +988,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
962 988
963 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 989 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
964 st->min_vdisktime); 990 st->min_vdisktime);
965 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" 991 cfq_log_cfqq(cfqq->cfqd, cfqq,
966 " sect=%u", used_sl, cfqq->slice_dispatch, charge, 992 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
967 iops_mode(cfqd), cfqq->nr_sectors); 993 used_sl, cfqq->slice_dispatch, charge,
968 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 994 iops_mode(cfqd), cfqq->nr_sectors);
995 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
996 unaccounted_sl);
969 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 997 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
970} 998}
971 999
@@ -977,35 +1005,55 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
977 return NULL; 1005 return NULL;
978} 1006}
979 1007
980void 1008void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 1009 unsigned int weight)
982{ 1010{
983 cfqg_of_blkg(blkg)->weight = weight; 1011 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1012 cfqg->new_weight = weight;
1013 cfqg->needs_update = true;
984} 1014}
985 1015
986static struct cfq_group * 1016static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
987cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 1017 struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
988{ 1018{
989 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
990 struct cfq_group *cfqg = NULL;
991 void *key = cfqd;
992 int i, j;
993 struct cfq_rb_root *st;
994 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 1019 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
995 unsigned int major, minor; 1020 unsigned int major, minor;
996 1021
997 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 1022 /*
998 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 1023 * Add group onto cgroup list. It might happen that bdi->dev is
1024 * not initialized yet. Initialize this new group without major
1025 * and minor info and this info will be filled in once a new thread
1026 * comes for IO.
1027 */
1028 if (bdi->dev) {
999 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1029 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1000 cfqg->blkg.dev = MKDEV(major, minor); 1030 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1001 goto done; 1031 (void *)cfqd, MKDEV(major, minor));
1002 } 1032 } else
1003 if (cfqg || !create) 1033 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1004 goto done; 1034 (void *)cfqd, 0);
1035
1036 cfqd->nr_blkcg_linked_grps++;
1037 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1038
1039 /* Add group on cfqd list */
1040 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1041}
1042
1043/*
1044 * Should be called from sleepable context. No request queue lock as per
1045 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1046 * from sleepable context.
1047 */
1048static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1049{
1050 struct cfq_group *cfqg = NULL;
1051 int i, j, ret;
1052 struct cfq_rb_root *st;
1005 1053
1006 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 1054 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1007 if (!cfqg) 1055 if (!cfqg)
1008 goto done; 1056 return NULL;
1009 1057
1010 for_each_cfqg_st(cfqg, i, j, st) 1058 for_each_cfqg_st(cfqg, i, j, st)
1011 *st = CFQ_RB_ROOT; 1059 *st = CFQ_RB_ROOT;
@@ -1017,52 +1065,103 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
1017 * elevator which will be dropped by either elevator exit 1065 * elevator which will be dropped by either elevator exit
1018 * or cgroup deletion path depending on who is exiting first. 1066 * or cgroup deletion path depending on who is exiting first.
1019 */ 1067 */
1020 atomic_set(&cfqg->ref, 1); 1068 cfqg->ref = 1;
1069
1070 ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1071 if (ret) {
1072 kfree(cfqg);
1073 return NULL;
1074 }
1075
1076 return cfqg;
1077}
1078
1079static struct cfq_group *
1080cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
1081{
1082 struct cfq_group *cfqg = NULL;
1083 void *key = cfqd;
1084 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1085 unsigned int major, minor;
1021 1086
1022 /* 1087 /*
1023 * Add group onto cgroup list. It might happen that bdi->dev is 1088 * This is the common case when there are no blkio cgroups.
1024 * not initiliazed yet. Initialize this new group without major 1089 * Avoid lookup in this case
1025 * and minor info and this info will be filled in once a new thread
1026 * comes for IO. See code above.
1027 */ 1090 */
1028 if (bdi->dev) { 1091 if (blkcg == &blkio_root_cgroup)
1029 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1092 cfqg = &cfqd->root_group;
1030 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1093 else
1031 MKDEV(major, minor)); 1094 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1032 } else
1033 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1034 0);
1035
1036 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1037 1095
1038 /* Add group on cfqd list */ 1096 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1039 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 1097 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1098 cfqg->blkg.dev = MKDEV(major, minor);
1099 }
1040 1100
1041done:
1042 return cfqg; 1101 return cfqg;
1043} 1102}
1044 1103
1045/* 1104/*
1046 * Search for the cfq group current task belongs to. If create = 1, then also 1105 * Search for the cfq group current task belongs to. request_queue lock must
1047 * create the cfq group if it does not exist. request_queue lock must be held. 1106 * be held.
1048 */ 1107 */
1049static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1108static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1050{ 1109{
1051 struct cgroup *cgroup; 1110 struct blkio_cgroup *blkcg;
1052 struct cfq_group *cfqg = NULL; 1111 struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1112 struct request_queue *q = cfqd->queue;
1113
1114 rcu_read_lock();
1115 blkcg = task_blkio_cgroup(current);
1116 cfqg = cfq_find_cfqg(cfqd, blkcg);
1117 if (cfqg) {
1118 rcu_read_unlock();
1119 return cfqg;
1120 }
1121
1122 /*
1123 * Need to allocate a group. Allocation of group also needs allocation
1124 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1125 * we need to drop rcu lock and queue_lock before we call alloc.
1126 *
1127 * Not taking any queue reference here and assuming that queue is
1128 * around by the time we return. CFQ queue allocation code does
1129 * the same. It might be racy though.
1130 */
1131
1132 rcu_read_unlock();
1133 spin_unlock_irq(q->queue_lock);
1134
1135 cfqg = cfq_alloc_cfqg(cfqd);
1136
1137 spin_lock_irq(q->queue_lock);
1053 1138
1054 rcu_read_lock(); 1139 rcu_read_lock();
1055 cgroup = task_cgroup(current, blkio_subsys_id); 1140 blkcg = task_blkio_cgroup(current);
1056 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); 1141
1057 if (!cfqg && create) 1142 /*
1143 * If some other thread already allocated the group while we were
1144 * not holding queue lock, free up the group
1145 */
1146 __cfqg = cfq_find_cfqg(cfqd, blkcg);
1147
1148 if (__cfqg) {
1149 kfree(cfqg);
1150 rcu_read_unlock();
1151 return __cfqg;
1152 }
1153
1154 if (!cfqg)
1058 cfqg = &cfqd->root_group; 1155 cfqg = &cfqd->root_group;
1156
1157 cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1059 rcu_read_unlock(); 1158 rcu_read_unlock();
1060 return cfqg; 1159 return cfqg;
1061} 1160}
1062 1161
1063static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1162static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1064{ 1163{
1065 atomic_inc(&cfqg->ref); 1164 cfqg->ref++;
1066 return cfqg; 1165 return cfqg;
1067} 1166}
1068 1167
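
As the comments in the hunk above explain, cfq_get_cfqg() now drops both the RCU read lock and the queue lock before allocating, because the per-cpu stats allocation can block; after retaking the lock it must redo the lookup and discard its own allocation if another thread won the race. A userspace sketch of that unlock/alloc/relock/recheck dance, using a hypothetical pthread mutex and a one-slot registry:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

struct group { int id; };
static struct group *registry;          /* one-slot stand-in for the lookup */

/* Must be called with queue_lock held; returns with it held. */
static struct group *get_group(void)
{
        struct group *g, *existing;

        if (registry)
                return registry;        /* fast path: already there */

        /* Allocation may "sleep", so drop the lock around it. */
        pthread_mutex_unlock(&queue_lock);
        g = calloc(1, sizeof(*g));
        pthread_mutex_lock(&queue_lock);

        /* Someone else may have created the group while we slept. */
        existing = registry;
        if (existing) {
                free(g);                /* lose the race gracefully */
                return existing;
        }

        registry = g;                   /* we won: publish our allocation */
        return g;
}

int main(void)
{
        struct group *g;

        pthread_mutex_lock(&queue_lock);
        g = get_group();
        pthread_mutex_unlock(&queue_lock);
        printf("group at %p\n", (void *)g);
        free(registry);
        return 0;
}
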
@@ -1074,7 +1173,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1074 1173
1075 cfqq->cfqg = cfqg; 1174 cfqq->cfqg = cfqg;
1076 /* cfqq reference on cfqg */ 1175 /* cfqq reference on cfqg */
1077 atomic_inc(&cfqq->cfqg->ref); 1176 cfqq->cfqg->ref++;
1078} 1177}
1079 1178
1080static void cfq_put_cfqg(struct cfq_group *cfqg) 1179static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1082,11 +1181,13 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
1082 struct cfq_rb_root *st; 1181 struct cfq_rb_root *st;
1083 int i, j; 1182 int i, j;
1084 1183
1085 BUG_ON(atomic_read(&cfqg->ref) <= 0); 1184 BUG_ON(cfqg->ref <= 0);
1086 if (!atomic_dec_and_test(&cfqg->ref)) 1185 cfqg->ref--;
1186 if (cfqg->ref)
1087 return; 1187 return;
1088 for_each_cfqg_st(cfqg, i, j, st) 1188 for_each_cfqg_st(cfqg, i, j, st)
1089 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 1189 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1190 free_percpu(cfqg->blkg.stats_cpu);
1090 kfree(cfqg); 1191 kfree(cfqg);
1091} 1192}
1092 1193
@@ -1145,7 +1246,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1145} 1246}
1146 1247
1147#else /* GROUP_IOSCHED */ 1248#else /* GROUP_IOSCHED */
1148static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1249static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1149{ 1250{
1150 return &cfqd->root_group; 1251 return &cfqd->root_group;
1151} 1252}
@@ -1179,33 +1280,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1179 struct cfq_rb_root *service_tree; 1280 struct cfq_rb_root *service_tree;
1180 int left; 1281 int left;
1181 int new_cfqq = 1; 1282 int new_cfqq = 1;
1182 int group_changed = 0;
1183
1184#ifdef CONFIG_CFQ_GROUP_IOSCHED
1185 if (!cfqd->cfq_group_isolation
1186 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1187 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1188 /* Move this cfq to root group */
1189 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1190 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1191 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1192 cfqq->orig_cfqg = cfqq->cfqg;
1193 cfqq->cfqg = &cfqd->root_group;
1194 atomic_inc(&cfqd->root_group.ref);
1195 group_changed = 1;
1196 } else if (!cfqd->cfq_group_isolation
1197 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1198 /* cfqq is sequential now needs to go to its original group */
1199 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1200 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1201 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1202 cfq_put_cfqg(cfqq->cfqg);
1203 cfqq->cfqg = cfqq->orig_cfqg;
1204 cfqq->orig_cfqg = NULL;
1205 group_changed = 1;
1206 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1207 }
1208#endif
1209 1283
1210 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1284 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1211 cfqq_type(cfqq)); 1285 cfqq_type(cfqq));
@@ -1276,9 +1350,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1276 rb_link_node(&cfqq->rb_node, parent, p); 1350 rb_link_node(&cfqq->rb_node, parent, p);
1277 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1351 rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1278 service_tree->count++; 1352 service_tree->count++;
1279 if ((add_front || !new_cfqq) && !group_changed) 1353 if (add_front || !new_cfqq)
1280 return; 1354 return;
1281 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1355 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1282} 1356}
1283 1357
1284static struct cfq_queue * 1358static struct cfq_queue *
@@ -1366,6 +1440,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1366 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1440 BUG_ON(cfq_cfqq_on_rr(cfqq));
1367 cfq_mark_cfqq_on_rr(cfqq); 1441 cfq_mark_cfqq_on_rr(cfqq);
1368 cfqd->busy_queues++; 1442 cfqd->busy_queues++;
1443 if (cfq_cfqq_sync(cfqq))
1444 cfqd->busy_sync_queues++;
1369 1445
1370 cfq_resort_rr_list(cfqd, cfqq); 1446 cfq_resort_rr_list(cfqd, cfqq);
1371} 1447}
@@ -1389,9 +1465,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1389 cfqq->p_root = NULL; 1465 cfqq->p_root = NULL;
1390 } 1466 }
1391 1467
1392 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1468 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1393 BUG_ON(!cfqd->busy_queues); 1469 BUG_ON(!cfqd->busy_queues);
1394 cfqd->busy_queues--; 1470 cfqd->busy_queues--;
1471 if (cfq_cfqq_sync(cfqq))
1472 cfqd->busy_sync_queues--;
1395} 1473}
1396 1474
1397/* 1475/*
@@ -1663,8 +1741,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1663 /* 1741 /*
1664 * store what was left of this slice, if the queue idled/timed out 1742 * store what was left of this slice, if the queue idled/timed out
1665 */ 1743 */
1666 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1744 if (timed_out) {
1667 cfqq->slice_resid = cfqq->slice_end - jiffies; 1745 if (cfq_cfqq_slice_new(cfqq))
1746 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
1747 else
1748 cfqq->slice_resid = cfqq->slice_end - jiffies;
1668 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1749 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1669 } 1750 }
1670 1751
@@ -1678,9 +1759,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1678 if (cfqq == cfqd->active_queue) 1759 if (cfqq == cfqd->active_queue)
1679 cfqd->active_queue = NULL; 1760 cfqd->active_queue = NULL;
1680 1761
1681 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1682 cfqd->grp_service_tree.active = NULL;
1683
1684 if (cfqd->active_cic) { 1762 if (cfqd->active_cic) {
1685 put_io_context(cfqd->active_cic->ioc); 1763 put_io_context(cfqd->active_cic->ioc);
1686 cfqd->active_cic = NULL; 1764 cfqd->active_cic = NULL;
@@ -1892,10 +1970,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1892 * in their service tree. 1970 * in their service tree.
1893 */ 1971 */
1894 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1972 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1895 return 1; 1973 return true;
1896 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 1974 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1897 service_tree->count); 1975 service_tree->count);
1898 return 0; 1976 return false;
1899} 1977}
1900 1978
1901static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1979static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@ -1946,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1946 */ 2024 */
1947 if (sample_valid(cic->ttime_samples) && 2025 if (sample_valid(cic->ttime_samples) &&
1948 (cfqq->slice_end - jiffies < cic->ttime_mean)) { 2026 (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1949 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", 2027 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
1950 cic->ttime_mean); 2028 cic->ttime_mean);
1951 return; 2029 return;
1952 } 2030 }
1953 2031
@@ -2020,7 +2098,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2020 2098
2021 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2099 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2022 2100
2023 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 2101 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2024} 2102}
2025 2103
2026/* 2104/*
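
The cfq_prio_to_maxrq() change above is purely algebraic: 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - ioprio)) factors to 2 * base_rq * (CFQ_PRIO_LISTS - ioprio), which equals the new expression whenever CFQ_PRIO_LISTS and IOPRIO_BE_NR are the same value (both are 8 here; treat that as an assumption of this sketch). A quick check:

#include <assert.h>
#include <stdio.h>

/* Assumed values for illustration only. */
#define PRIO_LISTS      8
#define BE_NR           8

int main(void)
{
        const unsigned base_rq = 4;     /* arbitrary example value */

        for (unsigned prio = 0; prio < BE_NR; prio++) {
                unsigned old = 2 * (base_rq + base_rq * (PRIO_LISTS - 1 - prio));
                unsigned new = 2 * base_rq * (BE_NR - prio);
                assert(old == new);
                printf("prio=%u max_rq=%u\n", prio, new);
        }
        return 0;
}
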
@@ -2031,7 +2109,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq)
2031 int process_refs, io_refs; 2109 int process_refs, io_refs;
2032 2110
2033 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 2111 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2034 process_refs = atomic_read(&cfqq->ref) - io_refs; 2112 process_refs = cfqq->ref - io_refs;
2035 BUG_ON(process_refs < 0); 2113 BUG_ON(process_refs < 0);
2036 return process_refs; 2114 return process_refs;
2037} 2115}
@@ -2071,10 +2149,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2071 */ 2149 */
2072 if (new_process_refs >= process_refs) { 2150 if (new_process_refs >= process_refs) {
2073 cfqq->new_cfqq = new_cfqq; 2151 cfqq->new_cfqq = new_cfqq;
2074 atomic_add(process_refs, &new_cfqq->ref); 2152 new_cfqq->ref += process_refs;
2075 } else { 2153 } else {
2076 new_cfqq->new_cfqq = cfqq; 2154 new_cfqq->new_cfqq = cfqq;
2077 atomic_add(new_process_refs, &cfqq->ref); 2155 cfqq->ref += new_process_refs;
2078 } 2156 }
2079} 2157}
2080 2158
@@ -2107,12 +2185,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2107 unsigned count; 2185 unsigned count;
2108 struct cfq_rb_root *st; 2186 struct cfq_rb_root *st;
2109 unsigned group_slice; 2187 unsigned group_slice;
2110 2188 enum wl_prio_t original_prio = cfqd->serving_prio;
2111 if (!cfqg) {
2112 cfqd->serving_prio = IDLE_WORKLOAD;
2113 cfqd->workload_expires = jiffies + 1;
2114 return;
2115 }
2116 2189
2117 /* Choose next priority. RT > BE > IDLE */ 2190 /* Choose next priority. RT > BE > IDLE */
2118 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2191 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
@@ -2125,6 +2198,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2125 return; 2198 return;
2126 } 2199 }
2127 2200
2201 if (original_prio != cfqd->serving_prio)
2202 goto new_workload;
2203
2128 /* 2204 /*
2129 * For RT and BE, we have to choose also the type 2205 * For RT and BE, we have to choose also the type
2130 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2206 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
@@ -2139,6 +2215,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2139 if (count && !time_after(jiffies, cfqd->workload_expires)) 2215 if (count && !time_after(jiffies, cfqd->workload_expires))
2140 return; 2216 return;
2141 2217
2218new_workload:
2142 /* otherwise select new workload type */ 2219 /* otherwise select new workload type */
2143 cfqd->serving_type = 2220 cfqd->serving_type =
2144 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2221 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
@@ -2180,7 +2257,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2180 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2257 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2181 cfq_log(cfqd, "workload slice:%d", slice); 2258 cfq_log(cfqd, "workload slice:%d", slice);
2182 cfqd->workload_expires = jiffies + slice; 2259 cfqd->workload_expires = jiffies + slice;
2183 cfqd->noidle_tree_requires_idle = false;
2184} 2260}
2185 2261
2186static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2262static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -2191,7 +2267,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2191 if (RB_EMPTY_ROOT(&st->rb)) 2267 if (RB_EMPTY_ROOT(&st->rb))
2192 return NULL; 2268 return NULL;
2193 cfqg = cfq_rb_first_group(st); 2269 cfqg = cfq_rb_first_group(st);
2194 st->active = &cfqg->rb_node;
2195 update_min_vdisktime(st); 2270 update_min_vdisktime(st);
2196 return cfqg; 2271 return cfqg;
2197} 2272}
@@ -2285,6 +2360,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2285 goto keep_queue; 2360 goto keep_queue;
2286 } 2361 }
2287 2362
2363 /*
2364 * This is a deep seek queue, but the device is much faster than
2365 * the queue can deliver, don't idle
2366 **/
2367 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2368 (cfq_cfqq_slice_new(cfqq) ||
2369 (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2370 cfq_clear_cfqq_deep(cfqq);
2371 cfq_clear_cfqq_idle_window(cfqq);
2372 }
2373
2288 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2374 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2289 cfqq = NULL; 2375 cfqq = NULL;
2290 goto keep_queue; 2376 goto keep_queue;
@@ -2359,12 +2445,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2359{ 2445{
2360 /* the queue hasn't finished any request, can't estimate */ 2446 /* the queue hasn't finished any request, can't estimate */
2361 if (cfq_cfqq_slice_new(cfqq)) 2447 if (cfq_cfqq_slice_new(cfqq))
2362 return 1; 2448 return true;
2363 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 2449 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2364 cfqq->slice_end)) 2450 cfqq->slice_end))
2365 return 1; 2451 return true;
2366 2452
2367 return 0; 2453 return false;
2368} 2454}
2369 2455
2370static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2456static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -2391,6 +2477,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2391 * Does this cfqq already have too much IO in flight? 2477 * Does this cfqq already have too much IO in flight?
2392 */ 2478 */
2393 if (cfqq->dispatched >= max_dispatch) { 2479 if (cfqq->dispatched >= max_dispatch) {
2480 bool promote_sync = false;
2394 /* 2481 /*
2395 * idle queue must always only have a single IO in flight 2482 * idle queue must always only have a single IO in flight
2396 */ 2483 */
@@ -2398,15 +2485,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2398 return false; 2485 return false;
2399 2486
2400 /* 2487 /*
2488 * If there is only one sync queue
2489 * we can ignore async queue here and give the sync
2490 * queue no dispatch limit. The reason is a sync queue can
2491 * preempt async queue, limiting the sync queue doesn't make
2492 * sense. This is useful for aiostress test.
2493 */
2494 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2495 promote_sync = true;
2496
2497 /*
2401 * We have other queues, don't allow more IO from this one 2498 * We have other queues, don't allow more IO from this one
2402 */ 2499 */
2403 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2500 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2501 !promote_sync)
2404 return false; 2502 return false;
2405 2503
2406 /* 2504 /*
2407 * Sole queue user, no limit 2505 * Sole queue user, no limit
2408 */ 2506 */
2409 if (cfqd->busy_queues == 1) 2507 if (cfqd->busy_queues == 1 || promote_sync)
2410 max_dispatch = -1; 2508 max_dispatch = -1;
2411 else 2509 else
2412 /* 2510 /*
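
The promote_sync path above lifts the dispatch cap when the one and only busy sync queue would otherwise be throttled next to async traffic; as the new comment notes, a sync queue can preempt async queues anyway, so limiting it makes no sense (the comment cites the aiostress test). A condensed sketch of the resulting decision, with simplified names and the slice_used_soon check omitted:

#include <stdbool.h>
#include <stdio.h>

struct state {
        unsigned busy_queues;           /* all queues with work */
        unsigned busy_sync_queues;      /* sync subset */
};

/* Returns the dispatch cap for a sync queue that has already reached
 * max_dispatch requests in flight; ~0u means "no limit". */
static unsigned sync_dispatch_cap(const struct state *s, unsigned max_dispatch)
{
        bool promote_sync = (s->busy_sync_queues == 1);

        if (s->busy_queues == 1 || promote_sync)
                return ~0u;             /* sole user or sole sync queue */
        return max_dispatch;
}

int main(void)
{
        struct state mixed = { .busy_queues = 3, .busy_sync_queues = 1 };
        struct state many  = { .busy_queues = 3, .busy_sync_queues = 2 };

        printf("mixed cap=%u\n", sync_dispatch_cap(&mixed, 4)); /* unlimited */
        printf("many  cap=%u\n", sync_dispatch_cap(&many, 4));  /* stays 4 */
        return 0;
}
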
@@ -2528,18 +2626,18 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
2528static void cfq_put_queue(struct cfq_queue *cfqq) 2626static void cfq_put_queue(struct cfq_queue *cfqq)
2529{ 2627{
2530 struct cfq_data *cfqd = cfqq->cfqd; 2628 struct cfq_data *cfqd = cfqq->cfqd;
2531 struct cfq_group *cfqg, *orig_cfqg; 2629 struct cfq_group *cfqg;
2532 2630
2533 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2631 BUG_ON(cfqq->ref <= 0);
2534 2632
2535 if (!atomic_dec_and_test(&cfqq->ref)) 2633 cfqq->ref--;
2634 if (cfqq->ref)
2536 return; 2635 return;
2537 2636
2538 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2637 cfq_log_cfqq(cfqd, cfqq, "put_queue");
2539 BUG_ON(rb_first(&cfqq->sort_list)); 2638 BUG_ON(rb_first(&cfqq->sort_list));
2540 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2639 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2541 cfqg = cfqq->cfqg; 2640 cfqg = cfqq->cfqg;
2542 orig_cfqg = cfqq->orig_cfqg;
2543 2641
2544 if (unlikely(cfqd->active_queue == cfqq)) { 2642 if (unlikely(cfqd->active_queue == cfqq)) {
2545 __cfq_slice_expired(cfqd, cfqq, 0); 2643 __cfq_slice_expired(cfqd, cfqq, 0);
@@ -2549,33 +2647,23 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2549 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2647 BUG_ON(cfq_cfqq_on_rr(cfqq));
2550 kmem_cache_free(cfq_pool, cfqq); 2648 kmem_cache_free(cfq_pool, cfqq);
2551 cfq_put_cfqg(cfqg); 2649 cfq_put_cfqg(cfqg);
2552 if (orig_cfqg)
2553 cfq_put_cfqg(orig_cfqg);
2554} 2650}
2555 2651
2556/* 2652/*
2557 * Must always be called with the rcu_read_lock() held 2653 * Call func for each cic attached to this ioc.
2558 */ 2654 */
2559static void 2655static void
2560__call_for_each_cic(struct io_context *ioc, 2656call_for_each_cic(struct io_context *ioc,
2561 void (*func)(struct io_context *, struct cfq_io_context *)) 2657 void (*func)(struct io_context *, struct cfq_io_context *))
2562{ 2658{
2563 struct cfq_io_context *cic; 2659 struct cfq_io_context *cic;
2564 struct hlist_node *n; 2660 struct hlist_node *n;
2565 2661
2662 rcu_read_lock();
2663
2566 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2664 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2567 func(ioc, cic); 2665 func(ioc, cic);
2568}
2569 2666
2570/*
2571 * Call func for each cic attached to this ioc.
2572 */
2573static void
2574call_for_each_cic(struct io_context *ioc,
2575 void (*func)(struct io_context *, struct cfq_io_context *))
2576{
2577 rcu_read_lock();
2578 __call_for_each_cic(ioc, func);
2579 rcu_read_unlock(); 2667 rcu_read_unlock();
2580} 2668}
2581 2669
@@ -2636,7 +2724,7 @@ static void cfq_free_io_context(struct io_context *ioc)
2636 * should be ok to iterate over the known list, we will see all cic's 2724 * should be ok to iterate over the known list, we will see all cic's
2637 * since no new ones are added. 2725 * since no new ones are added.
2638 */ 2726 */
2639 __call_for_each_cic(ioc, cic_free_func); 2727 call_for_each_cic(ioc, cic_free_func);
2640} 2728}
2641 2729
2642static void cfq_put_cooperator(struct cfq_queue *cfqq) 2730static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2685,8 +2773,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2685 smp_wmb(); 2773 smp_wmb();
2686 cic->key = cfqd_dead_key(cfqd); 2774 cic->key = cfqd_dead_key(cfqd);
2687 2775
2688 if (ioc->ioc_data == cic) 2776 rcu_read_lock();
2777 if (rcu_dereference(ioc->ioc_data) == cic) {
2778 rcu_read_unlock();
2779 spin_lock(&ioc->lock);
2689 rcu_assign_pointer(ioc->ioc_data, NULL); 2780 rcu_assign_pointer(ioc->ioc_data, NULL);
2781 spin_unlock(&ioc->lock);
2782 } else
2783 rcu_read_unlock();
2690 2784
2691 if (cic->cfqq[BLK_RW_ASYNC]) { 2785 if (cic->cfqq[BLK_RW_ASYNC]) {
2692 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2786 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -2835,7 +2929,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2835 RB_CLEAR_NODE(&cfqq->p_node); 2929 RB_CLEAR_NODE(&cfqq->p_node);
2836 INIT_LIST_HEAD(&cfqq->fifo); 2930 INIT_LIST_HEAD(&cfqq->fifo);
2837 2931
2838 atomic_set(&cfqq->ref, 0); 2932 cfqq->ref = 0;
2839 cfqq->cfqd = cfqd; 2933 cfqq->cfqd = cfqd;
2840 2934
2841 cfq_mark_cfqq_prio_changed(cfqq); 2935 cfq_mark_cfqq_prio_changed(cfqq);
@@ -2892,7 +2986,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2892 struct cfq_group *cfqg; 2986 struct cfq_group *cfqg;
2893 2987
2894retry: 2988retry:
2895 cfqg = cfq_get_cfqg(cfqd, 1); 2989 cfqg = cfq_get_cfqg(cfqd);
2896 cic = cfq_cic_lookup(cfqd, ioc); 2990 cic = cfq_cic_lookup(cfqd, ioc);
2897 /* cic always exists here */ 2991 /* cic always exists here */
2898 cfqq = cic_to_cfqq(cic, is_sync); 2992 cfqq = cic_to_cfqq(cic, is_sync);
@@ -2971,11 +3065,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2971 * pin the queue now that it's allocated, scheduler exit will prune it 3065 * pin the queue now that it's allocated, scheduler exit will prune it
2972 */ 3066 */
2973 if (!is_sync && !(*async_cfqq)) { 3067 if (!is_sync && !(*async_cfqq)) {
2974 atomic_inc(&cfqq->ref); 3068 cfqq->ref++;
2975 *async_cfqq = cfqq; 3069 *async_cfqq = cfqq;
2976 } 3070 }
2977 3071
2978 atomic_inc(&cfqq->ref); 3072 cfqq->ref++;
2979 return cfqq; 3073 return cfqq;
2980} 3074}
2981 3075
@@ -2993,7 +3087,8 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2993 3087
2994 spin_lock_irqsave(&ioc->lock, flags); 3088 spin_lock_irqsave(&ioc->lock, flags);
2995 3089
2996 BUG_ON(ioc->ioc_data == cic); 3090 BUG_ON(rcu_dereference_check(ioc->ioc_data,
3091 lockdep_is_held(&ioc->lock)) == cic);
2997 3092
2998 radix_tree_delete(&ioc->radix_root, cfqd->cic_index); 3093 radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
2999 hlist_del_rcu(&cic->cic_list); 3094 hlist_del_rcu(&cic->cic_list);
@@ -3177,7 +3272,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3177 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3272 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3178 cfq_mark_cfqq_deep(cfqq); 3273 cfq_mark_cfqq_deep(cfqq);
3179 3274
3180 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3275 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3276 enable_idle = 0;
3277 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3181 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3278 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3182 enable_idle = 0; 3279 enable_idle = 0;
3183 else if (sample_valid(cic->ttime_samples)) { 3280 else if (sample_valid(cic->ttime_samples)) {
@@ -3255,6 +3352,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3255 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3352 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3256 return true; 3353 return true;
3257 3354
3355 /* An idle queue should not be idle now for some reason */
3356 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3357 return true;
3358
3258 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3359 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3259 return false; 3360 return false;
3260 3361
@@ -3274,10 +3375,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3274 */ 3375 */
3275static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3376static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3276{ 3377{
3378 struct cfq_queue *old_cfqq = cfqd->active_queue;
3379
3277 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3380 cfq_log_cfqq(cfqd, cfqq, "preempt");
3278 cfq_slice_expired(cfqd, 1); 3381 cfq_slice_expired(cfqd, 1);
3279 3382
3280 /* 3383 /*
3384 * workload type is changed, don't save slice, otherwise preempt
3385 * doesn't happen
3386 */
3387 if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
3388 cfqq->cfqg->saved_workload_slice = 0;
3389
3390 /*
3281 * Put the new queue at the front of the of the current list, 3391 * Put the new queue at the front of the of the current list,
3282 * so we know that it will be selected next. 3392 * so we know that it will be selected next.
3283 */ 3393 */
@@ -3402,6 +3512,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3402{ 3512{
3403 struct cfq_io_context *cic = cfqd->active_cic; 3513 struct cfq_io_context *cic = cfqd->active_cic;
3404 3514
3515 /* If the queue already has requests, don't wait */
3516 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3517 return false;
3518
3405 /* If there are other queues in the group, don't wait */ 3519 /* If there are other queues in the group, don't wait */
3406 if (cfqq->cfqg->nr_cfqq > 1) 3520 if (cfqq->cfqg->nr_cfqq > 1)
3407 return false; 3521 return false;
@@ -3494,17 +3608,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3494 cfq_slice_expired(cfqd, 1); 3608 cfq_slice_expired(cfqd, 1);
3495 else if (sync && cfqq_empty && 3609 else if (sync && cfqq_empty &&
3496 !cfq_close_cooperator(cfqd, cfqq)) { 3610 !cfq_close_cooperator(cfqd, cfqq)) {
3497 cfqd->noidle_tree_requires_idle |= 3611 cfq_arm_slice_timer(cfqd);
3498 !(rq->cmd_flags & REQ_NOIDLE);
3499 /*
3500 * Idling is enabled for SYNC_WORKLOAD.
3501 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3502 * only if we processed at least one !REQ_NOIDLE request
3503 */
3504 if (cfqd->serving_type == SYNC_WORKLOAD
3505 || cfqd->noidle_tree_requires_idle
3506 || cfqq->cfqg->nr_cfqq == 1)
3507 cfq_arm_slice_timer(cfqd);
3508 } 3612 }
3509 } 3613 }
3510 3614
@@ -3589,12 +3693,12 @@ static void cfq_put_request(struct request *rq)
3589 3693
3590 put_io_context(RQ_CIC(rq)->ioc); 3694 put_io_context(RQ_CIC(rq)->ioc);
3591 3695
3592 rq->elevator_private = NULL; 3696 rq->elevator_private[0] = NULL;
3593 rq->elevator_private2 = NULL; 3697 rq->elevator_private[1] = NULL;
3594 3698
3595 /* Put down rq reference on cfqg */ 3699 /* Put down rq reference on cfqg */
3596 cfq_put_cfqg(RQ_CFQG(rq)); 3700 cfq_put_cfqg(RQ_CFQG(rq));
3597 rq->elevator_private3 = NULL; 3701 rq->elevator_private[2] = NULL;
3598 3702
3599 cfq_put_queue(cfqq); 3703 cfq_put_queue(cfqq);
3600 } 3704 }
@@ -3681,19 +3785,15 @@ new_queue:
3681 } 3785 }
3682 3786
3683 cfqq->allocated[rw]++; 3787 cfqq->allocated[rw]++;
3684 atomic_inc(&cfqq->ref);
3685 3788
3789 cfqq->ref++;
3790 rq->elevator_private[0] = cic;
3791 rq->elevator_private[1] = cfqq;
3792 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3686 spin_unlock_irqrestore(q->queue_lock, flags); 3793 spin_unlock_irqrestore(q->queue_lock, flags);
3687
3688 rq->elevator_private = cic;
3689 rq->elevator_private2 = cfqq;
3690 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3691 return 0; 3794 return 0;
3692 3795
3693queue_fail: 3796queue_fail:
3694 if (cic)
3695 put_io_context(cic->ioc);
3696
3697 cfq_schedule_dispatch(cfqd); 3797 cfq_schedule_dispatch(cfqd);
3698 spin_unlock_irqrestore(q->queue_lock, flags); 3798 spin_unlock_irqrestore(q->queue_lock, flags);
3699 cfq_log(cfqd, "set_request fail"); 3799 cfq_log(cfqd, "set_request fail");
@@ -3788,15 +3888,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
3788 cfq_put_queue(cfqd->async_idle_cfqq); 3888 cfq_put_queue(cfqd->async_idle_cfqq);
3789} 3889}
3790 3890
3791static void cfq_cfqd_free(struct rcu_head *head)
3792{
3793 kfree(container_of(head, struct cfq_data, rcu));
3794}
3795
3796static void cfq_exit_queue(struct elevator_queue *e) 3891static void cfq_exit_queue(struct elevator_queue *e)
3797{ 3892{
3798 struct cfq_data *cfqd = e->elevator_data; 3893 struct cfq_data *cfqd = e->elevator_data;
3799 struct request_queue *q = cfqd->queue; 3894 struct request_queue *q = cfqd->queue;
3895 bool wait = false;
3800 3896
3801 cfq_shutdown_timer_wq(cfqd); 3897 cfq_shutdown_timer_wq(cfqd);
3802 3898
@@ -3815,7 +3911,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
3815 3911
3816 cfq_put_async_queues(cfqd); 3912 cfq_put_async_queues(cfqd);
3817 cfq_release_cfq_groups(cfqd); 3913 cfq_release_cfq_groups(cfqd);
3818 cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3914
3915 /*
3916 * If there are groups which we could not unlink from blkcg list,
3917 * wait for a rcu period for them to be freed.
3918 */
3919 if (cfqd->nr_blkcg_linked_grps)
3920 wait = true;
3819 3921
3820 spin_unlock_irq(q->queue_lock); 3922 spin_unlock_irq(q->queue_lock);
3821 3923
@@ -3825,8 +3927,25 @@ static void cfq_exit_queue(struct elevator_queue *e)
3825 ida_remove(&cic_index_ida, cfqd->cic_index); 3927 ida_remove(&cic_index_ida, cfqd->cic_index);
3826 spin_unlock(&cic_index_lock); 3928 spin_unlock(&cic_index_lock);
3827 3929
3828 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3930 /*
3829 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3931 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3932 * Do this wait only if there are other unlinked groups out
3933 * there. This can happen if cgroup deletion path claimed the
3934 * responsibility of cleaning up a group before queue cleanup code
3935 * get to the group.
3936 *
3937 * Do not call synchronize_rcu() unconditionally as there are drivers
3938 * which create/delete request queue hundreds of times during scan/boot
3939 * and synchronize_rcu() can take significant time and slow down boot.
3940 */
3941 if (wait)
3942 synchronize_rcu();
3943
3944#ifdef CONFIG_CFQ_GROUP_IOSCHED
3945 /* Free up per cpu stats for root group */
3946 free_percpu(cfqd->root_group.blkg.stats_cpu);
3947#endif
3948 kfree(cfqd);
3830} 3949}
3831 3950
3832static int cfq_alloc_cic_index(void) 3951static int cfq_alloc_cic_index(void)
@@ -3859,9 +3978,17 @@ static void *cfq_init_queue(struct request_queue *q)
3859 return NULL; 3978 return NULL;
3860 3979
3861 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3980 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3862 if (!cfqd) 3981 if (!cfqd) {
3982 spin_lock(&cic_index_lock);
3983 ida_remove(&cic_index_ida, i);
3984 spin_unlock(&cic_index_lock);
3863 return NULL; 3985 return NULL;
3986 }
3864 3987
3988 /*
3989 * Don't need take queue_lock in the routine, since we are
3990 * initializing the ioscheduler, and nobody is using cfqd
3991 */
3865 cfqd->cic_index = i; 3992 cfqd->cic_index = i;
3866 3993
3867 /* Init root service tree */ 3994 /* Init root service tree */
@@ -3878,14 +4005,29 @@ static void *cfq_init_queue(struct request_queue *q)
3878 4005
3879#ifdef CONFIG_CFQ_GROUP_IOSCHED 4006#ifdef CONFIG_CFQ_GROUP_IOSCHED
3880 /* 4007 /*
3881 * Take a reference to root group which we never drop. This is just 4008 * Set root group reference to 2. One reference will be dropped when
3882 * to make sure that cfq_put_cfqg() does not try to kfree root group 4009 * all groups on cfqd->cfqg_list are being deleted during queue exit.
4010 * Other reference will remain there as we don't want to delete this
4011 * group as it is statically allocated and gets destroyed when
4012 * throtl_data goes away.
3883 */ 4013 */
3884 atomic_set(&cfqg->ref, 1); 4014 cfqg->ref = 2;
4015
4016 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
4017 kfree(cfqg);
4018 kfree(cfqd);
4019 return NULL;
4020 }
4021
3885 rcu_read_lock(); 4022 rcu_read_lock();
4023
3886 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 4024 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3887 (void *)cfqd, 0); 4025 (void *)cfqd, 0);
3888 rcu_read_unlock(); 4026 rcu_read_unlock();
4027 cfqd->nr_blkcg_linked_grps++;
4028
4029 /* Add group on cfqd->cfqg_list */
4030 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
3889#endif 4031#endif
3890 /* 4032 /*
3891 * Not strictly needed (since RB_ROOT just clears the node and we 4033 * Not strictly needed (since RB_ROOT just clears the node and we
@@ -3901,7 +4043,7 @@ static void *cfq_init_queue(struct request_queue *q)
3901 * will not attempt to free it. 4043 * will not attempt to free it.
3902 */ 4044 */
3903 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4045 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3904 atomic_inc(&cfqd->oom_cfqq.ref); 4046 cfqd->oom_cfqq.ref++;
3905 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 4047 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3906 4048
3907 INIT_LIST_HEAD(&cfqd->cic_list); 4049 INIT_LIST_HEAD(&cfqd->cic_list);
@@ -3925,7 +4067,6 @@ static void *cfq_init_queue(struct request_queue *q)
3925 cfqd->cfq_slice_idle = cfq_slice_idle; 4067 cfqd->cfq_slice_idle = cfq_slice_idle;
3926 cfqd->cfq_group_idle = cfq_group_idle; 4068 cfqd->cfq_group_idle = cfq_group_idle;
3927 cfqd->cfq_latency = 1; 4069 cfqd->cfq_latency = 1;
3928 cfqd->cfq_group_isolation = 0;
3929 cfqd->hw_tag = -1; 4070 cfqd->hw_tag = -1;
3930 /* 4071 /*
3931 * we optimistically start assuming sync ops weren't delayed in last 4072 * we optimistically start assuming sync ops weren't delayed in last
@@ -4001,7 +4142,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4001SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4142SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4002SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4143SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4003SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4144SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4004SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
4005#undef SHOW_FUNCTION 4145#undef SHOW_FUNCTION
4006 4146
4007#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4147#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4035,7 +4175,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4035STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4175STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4036 UINT_MAX, 0); 4176 UINT_MAX, 0);
4037STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4177STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4038STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
4039#undef STORE_FUNCTION 4178#undef STORE_FUNCTION
4040 4179
4041#define CFQ_ATTR(name) \ 4180#define CFQ_ATTR(name) \
@@ -4053,7 +4192,6 @@ static struct elv_fs_entry cfq_attrs[] = {
4053 CFQ_ATTR(slice_idle), 4192 CFQ_ATTR(slice_idle),
4054 CFQ_ATTR(group_idle), 4193 CFQ_ATTR(group_idle),
4055 CFQ_ATTR(low_latency), 4194 CFQ_ATTR(low_latency),
4056 CFQ_ATTR(group_isolation),
4057 __ATTR_NULL 4195 __ATTR_NULL
4058}; 4196};
4059 4197
@@ -4068,7 +4206,6 @@ static struct elevator_type iosched_cfq = {
4068 .elevator_add_req_fn = cfq_insert_request, 4206 .elevator_add_req_fn = cfq_insert_request,
4069 .elevator_activate_req_fn = cfq_activate_request, 4207 .elevator_activate_req_fn = cfq_activate_request,
4070 .elevator_deactivate_req_fn = cfq_deactivate_request, 4208 .elevator_deactivate_req_fn = cfq_deactivate_request,
4071 .elevator_queue_empty_fn = cfq_queue_empty,
4072 .elevator_completed_req_fn = cfq_completed_request, 4209 .elevator_completed_req_fn = cfq_completed_request,
4073 .elevator_former_req_fn = elv_rb_former_request, 4210 .elevator_former_req_fn = elv_rb_former_request,
4074 .elevator_latter_req_fn = elv_rb_latter_request, 4211 .elevator_latter_req_fn = elv_rb_latter_request,
@@ -4090,6 +4227,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
4090 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4227 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4091 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4228 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4092 }, 4229 },
4230 .plid = BLKIO_POLICY_PROP,
4093}; 4231};
4094#else 4232#else
4095static struct blkio_policy_type blkio_policy_cfq; 4233static struct blkio_policy_type blkio_policy_cfq;