author     Corrado Zoccolo <czoccolo@gmail.com>      2009-10-26 17:45:29 -0400
committer  Jens Axboe <jens.axboe@oracle.com>        2009-10-28 04:23:26 -0400
commit     718eee0579b802aabe3bafacf09d0a9b0830f1dd (patch)
tree       3a85a6d38ed7b68ed6ca21d04158afee13980e5e /block
parent     a6d44e982d3734583b3b4e1d36921af8cfd61fc0 (diff)
cfq-iosched: fairness for sync no-idle queues
Currently, no-idle queues in cfq are not serviced fairly: even though
they can dispatch only a small number of requests at a time, they have
to compete with idling queues to be serviced, and therefore experience
large latencies.
Note, instead, that no-idle queues are the ones that would benefit most
from low latency; in fact, they are any of:
* processes with large think times (e.g. interactive ones like file
  managers)
* seeky processes (e.g. programs faulting in their code at startup)
* queues marked as no-idle by upper layers, to improve the latency of
  those requests.
This patch improves fairness and latency for those queues by:
* separating sync idle, sync no-idle and async queues into separate
  service_trees for each priority
* servicing all no-idle queues together
* idling when the last no-idle queue has been serviced, to anticipate
  more no-idle work
* computing the timeslices allotted to the idle and no-idle
  service_trees proportionally to the number of processes in each set
  (a sketch of this computation follows the diffstat below).
Servicing all no-idle queues together should give a performance boost
on NCQ-capable drives, without compromising fairness.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
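
For orientation while reading the diff: the patch turns the single per-priority
service tree into a [priority][workload type] array, with the idle class still
kept on its own tree. Below is a minimal sketch of the resulting lookup, folding
the patch's cfqq_prio()/cfqq_type()/service_tree_for() helpers into one
hypothetical function (pick_tree is not a real symbol in the file; the
cfq_cfqq_* tests are the existing queue-flag helpers):

/* Sketch only: which rb-tree a queue lands on after this patch. */
static struct cfq_rb_root *pick_tree(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
        enum wl_prio_t prio = cfqq_prio(cfqq);

        if (prio == IDLE_WORKLOAD)
                return &cfqd->service_tree_idle;        /* idle class stays separate */
        if (!cfq_cfqq_sync(cfqq))
                return &cfqd->service_trees[prio][ASYNC_WORKLOAD];
        if (!cfq_cfqq_idle_window(cfqq))
                return &cfqd->service_trees[prio][SYNC_NOIDLE_WORKLOAD];
        return &cfqd->service_trees[prio][SYNC_WORKLOAD];
}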
Diffstat (limited to 'block')
-rw-r--r--  block/cfq-iosched.c  200
1 file changed, 168 insertions(+), 32 deletions(-)
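
Before the diff itself, here is a hypothetical, standalone model of the
timeslice computation the patch adds in choose_service_tree(): the target
latency is split among workloads in proportion to their queue counts, async
slices are scaled down by the async/sync slice ratio, and sync slices get a
floor of 2 * slice_idle. Function name, constants and example numbers are
illustrative only, not the kernel's actual symbols or defaults:

#include <stdio.h>

/* Illustrative model of the workload-slice computation (values in ms). */
static unsigned workload_slice(unsigned target_latency, unsigned queues_in_workload,
                               unsigned busy_queues_in_prio, int is_async)
{
        const unsigned slice_sync = 100, slice_async = 40, slice_idle = 8, min_tt = 2;
        unsigned slice = target_latency * queues_in_workload /
                         (busy_queues_in_prio ? busy_queues_in_prio : 1);

        if (is_async)
                slice = slice * slice_async / slice_sync;   /* scale by async/sync ratio */
        else if (slice < 2 * slice_idle)
                slice = 2 * slice_idle;                     /* sync floor */

        return slice < min_tt ? min_tt : slice;
}

int main(void)
{
        /* e.g. 2 no-idle queues out of 5 busy BE queues, 300 ms target latency */
        printf("sync no-idle slice: %u ms\n", workload_slice(300, 2, 5, 0));
        printf("async slice:        %u ms\n", workload_slice(300, 3, 5, 1));
        return 0;
}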
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 76afa3696894..859f534ae9ef 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -134,7 +134,7 @@ struct cfq_queue {
 };
 
 /*
- * Index in the service_trees.
+ * First index in the service_trees.
  * IDLE is handled separately, so it has negative index
  */
 enum wl_prio_t {
@@ -144,6 +144,16 @@ enum wl_prio_t {
 };
 
 /*
+ * Second index in the service_trees.
+ */
+enum wl_type_t {
+        ASYNC_WORKLOAD = 0,
+        SYNC_NOIDLE_WORKLOAD = 1,
+        SYNC_WORKLOAD = 2
+};
+
+
+/*
  * Per block device queue structure
  */
 struct cfq_data {
@@ -153,12 +163,14 @@ struct cfq_data {
          * rr lists of queues with requests, onle rr for each priority class.
          * Counts are embedded in the cfq_rb_root
          */
-        struct cfq_rb_root service_trees[2];
+        struct cfq_rb_root service_trees[2][3];
         struct cfq_rb_root service_tree_idle;
         /*
          * The priority currently being served
          */
         enum wl_prio_t serving_prio;
+        enum wl_type_t serving_type;
+        unsigned long workload_expires;
 
         /*
          * Each priority tree is sorted by next_request position. These
@@ -221,12 +233,13 @@ struct cfq_data {
 };
 
 static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio,
+                                            enum wl_type_t type,
                                             struct cfq_data *cfqd)
 {
         if (prio == IDLE_WORKLOAD)
                 return &cfqd->service_tree_idle;
 
-        return &cfqd->service_trees[prio];
+        return &cfqd->service_trees[prio][type];
 }
 
 enum cfqq_state_flags {
@@ -282,12 +295,24 @@ static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
         return BE_WORKLOAD;
 }
 
+
+static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
+{
+        if (!cfq_cfqq_sync(cfqq))
+                return ASYNC_WORKLOAD;
+        if (!cfq_cfqq_idle_window(cfqq))
+                return SYNC_NOIDLE_WORKLOAD;
+        return SYNC_WORKLOAD;
+}
+
 static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd)
 {
         if (wl == IDLE_WORKLOAD)
                 return cfqd->service_tree_idle.count;
 
-        return cfqd->service_trees[wl].count;
+        return cfqd->service_trees[wl][ASYNC_WORKLOAD].count
+               + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
+               + cfqd->service_trees[wl][SYNC_WORKLOAD].count;
 }
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
@@ -597,7 +622,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         struct cfq_rb_root *service_tree;
         int left;
 
-        service_tree = service_tree_for(cfqq_prio(cfqq), cfqd);
+        service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd);
         if (cfq_class_idle(cfqq)) {
                 rb_key = CFQ_IDLE_DELAY;
                 parent = rb_last(&service_tree->rb);
@@ -1030,7 +1055,7 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
         struct cfq_rb_root *service_tree =
-                service_tree_for(cfqd->serving_prio, cfqd);
+                service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd);
 
         if (RB_EMPTY_ROOT(&service_tree->rb))
                 return NULL;
@@ -1167,7 +1192,7 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
         enum wl_prio_t prio = cfqq_prio(cfqq);
-        struct cfq_rb_root *service_tree;
+        struct cfq_rb_root *service_tree = cfqq->service_tree;
 
         /* We never do for idle class queues. */
         if (prio == IDLE_WORKLOAD)
@@ -1181,7 +1206,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
          * Otherwise, we do only if they are the last ones
          * in their service tree.
          */
-        service_tree = service_tree_for(prio, cfqd);
+        if (!service_tree)
+                service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd);
+
         if (service_tree->count == 0)
                 return true;
 
@@ -1235,14 +1262,20 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 
         cfq_mark_cfqq_wait_request(cfqq);
 
-        /*
-         * we don't want to idle for seeks, but we do want to allow
-         * fair distribution of slice time for a process doing back-to-back
-         * seeks. so allow a little bit of time for him to submit a new rq
-         */
         sl = cfqd->cfq_slice_idle;
-        if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))
+        /* are we servicing noidle tree, and there are more queues?
+         * non-rotational or NCQ: no idle
+         * non-NCQ rotational : very small idle, to allow
+         *     fair distribution of slice time for a process doing back-to-back
+         *     seeks.
+         */
+        if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
+            service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WORKLOAD, cfqd)
+            ->count > 0) {
+                if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag)
+                        return;
                 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
+        }
 
         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
@@ -1346,6 +1379,106 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
         }
 }
 
+static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio,
+                                    bool prio_changed)
+{
+        struct cfq_queue *queue;
+        int i;
+        bool key_valid = false;
+        unsigned long lowest_key = 0;
+        enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
+
+        if (prio_changed) {
+                /*
+                 * When priorities switched, we prefer starting
+                 * from SYNC_NOIDLE (first choice), or just SYNC
+                 * over ASYNC
+                 */
+                if (service_tree_for(prio, cur_best, cfqd)->count)
+                        return cur_best;
+                cur_best = SYNC_WORKLOAD;
+                if (service_tree_for(prio, cur_best, cfqd)->count)
+                        return cur_best;
+
+                return ASYNC_WORKLOAD;
+        }
+
+        for (i = 0; i < 3; ++i) {
+                /* otherwise, select the one with lowest rb_key */
+                queue = cfq_rb_first(service_tree_for(prio, i, cfqd));
+                if (queue &&
+                    (!key_valid || time_before(queue->rb_key, lowest_key))) {
+                        lowest_key = queue->rb_key;
+                        cur_best = i;
+                        key_valid = true;
+                }
+        }
+
+        return cur_best;
+}
+
+static void choose_service_tree(struct cfq_data *cfqd)
+{
+        enum wl_prio_t previous_prio = cfqd->serving_prio;
+        bool prio_changed;
+        unsigned slice;
+        unsigned count;
+
+        /* Choose next priority. RT > BE > IDLE */
+        if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd))
+                cfqd->serving_prio = RT_WORKLOAD;
+        else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd))
+                cfqd->serving_prio = BE_WORKLOAD;
+        else {
+                cfqd->serving_prio = IDLE_WORKLOAD;
+                cfqd->workload_expires = jiffies + 1;
+                return;
+        }
+
+        /*
+         * For RT and BE, we have to choose also the type
+         * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
+         * expiration time
+         */
+        prio_changed = (cfqd->serving_prio != previous_prio);
+        count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd)
+                ->count;
+
+        /*
+         * If priority didn't change, check workload expiration,
+         * and that we still have other queues ready
+         */
+        if (!prio_changed && count &&
+            !time_after(jiffies, cfqd->workload_expires))
+                return;
+
+        /* otherwise select new workload type */
+        cfqd->serving_type =
+                cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed);
+        count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd)
+                ->count;
+
+        /*
+         * the workload slice is computed as a fraction of target latency
+         * proportional to the number of queues in that workload, over
+         * all the queues in the same priority class
+         */
+        slice = cfq_target_latency * count /
+                max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio],
+                      cfq_busy_queues_wl(cfqd->serving_prio, cfqd));
+
+        if (cfqd->serving_type == ASYNC_WORKLOAD)
+                /* async workload slice is scaled down according to
+                 * the sync/async slice ratio. */
+                slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
+        else
+                /* sync workload slice is at least 2 * cfq_slice_idle */
+                slice = max(slice, 2 * cfqd->cfq_slice_idle);
+
+        slice = max_t(unsigned, slice, CFQ_MIN_TT);
+        cfqd->workload_expires = jiffies + slice;
+}
+
 /*
  * Select a queue for service. If we have a current active queue,
  * check whether to continue servicing it, or retrieve and set a new one.
@@ -1398,14 +1531,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 expire:
         cfq_slice_expired(cfqd, 0);
 new_queue:
-        if (!new_cfqq) {
-                if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd))
-                        cfqd->serving_prio = RT_WORKLOAD;
-                else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd))
-                        cfqd->serving_prio = BE_WORKLOAD;
-                else
-                        cfqd->serving_prio = IDLE_WORKLOAD;
-        }
+        /*
+         * Current queue expired. Check if we have to switch to a new
+         * service tree
+         */
+        if (!new_cfqq)
+                choose_service_tree(cfqd);
+
         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
 keep_queue:
         return cfqq;
@@ -1432,10 +1564,12 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
         struct cfq_queue *cfqq;
         int dispatched = 0;
-        int i;
+        int i, j;
         for (i = 0; i < 2; ++i)
-                while ((cfqq = cfq_rb_first(&cfqd->service_trees[i])) != NULL)
-                        dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+                for (j = 0; j < 3; ++j)
+                        while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j]))
+                                != NULL)
+                                dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 
         while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL)
                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
@@ -2218,13 +2352,10 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
         if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-            (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq)))
+            (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)))
                 enable_idle = 0;
         else if (sample_valid(cic->ttime_samples)) {
-                unsigned int slice_idle = cfqd->cfq_slice_idle;
-                if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))
-                        slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
-                if (cic->ttime_mean > slice_idle)
+                if (cic->ttime_mean > cfqd->cfq_slice_idle)
                         enable_idle = 0;
                 else
                         enable_idle = 1;
@@ -2262,6 +2393,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
         if (cfq_class_idle(cfqq))
                 return true;
 
+        if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD
+            && new_cfqq->service_tree == cfqq->service_tree)
+                return true;
+
         /*
          * if the new request is sync, but the currently running queue is
          * not, let the sync request have priority.
@@ -2778,14 +2913,15 @@ static void cfq_exit_queue(struct elevator_queue *e)
 static void *cfq_init_queue(struct request_queue *q)
 {
         struct cfq_data *cfqd;
-        int i;
+        int i, j;
 
         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
         if (!cfqd)
                 return NULL;
 
         for (i = 0; i < 2; ++i)
-                cfqd->service_trees[i] = CFQ_RB_ROOT;
+                for (j = 0; j < 3; ++j)
+                        cfqd->service_trees[i][j] = CFQ_RB_ROOT;
         cfqd->service_tree_idle = CFQ_RB_ROOT;
 
         /*