/*
* CFQ, or complete fairness queueing, disk scheduler.
*
* Based on ideas from a previously unfinished io
* scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
*
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
/*
* tunables
*/
/* max queue in one round of service */
static const int cfq_quantum = 4;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
/*
* offset from end of service tree
*/
#define CFQ_IDLE_DELAY (HZ / 5)
/*
* below this threshold, we consider thinktime immediate
*/
#define CFQ_MIN_TT (2)
#define CFQ_SLICE_SCALE (5)
#define CFQ_HW_QUEUE_MIN (5)
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define sample_valid(samples) ((samples) > 80)
/*
* Most of our rbtree usage is for sorting with min extraction, so
* if we cache the leftmost node we don't have to walk down the tree
* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
* move this into the elevator for the rq sorting as well.
*/
struct cfq_rb_root {
struct rb_root rb;
struct rb_node *left;
};
#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, }
/*
* Per process-grouping structure
*/
struct cfq_queue {
/* reference count */
atomic_t ref;
/* various state flags, see below */
unsigned int flags;
/* parent cfq_data */
struct cfq_data *cfqd;
/* service_tree member */
struct rb_node rb_node;
/* service_tree key */
unsigned long rb_key;
/* prio tree member */
struct rb_node p_node;
/* prio tree root we belong to, if any */
struct rb_root *p_root;
/* sorted list of pending requests */
struct rb_root sort_list;
/* if fifo isn't expired, next request to serve */
struct request *next_rq;
/* requests queued in sort_list */
int queued[2];
/* currently allocated requests */
int allocated[2];
/* fifo list of requests in sort_list */
struct list_head fifo;
unsigned long slice_end;
long slice_resid;
unsigned int slice_dispatch;
/* pending metadata requests */
int meta_pending;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
/* io prio of this group */
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
pid_t pid;
};
/*
* Per block device queue structure
*/
struct cfq_data {
struct request_queue *queue;
/*
* rr list of queues with requests and the count of them
*/
struct cfq_rb_root service_tree;
/*
* Each priority tree is sorted by next_request position. These
* trees are used when determining if two or more queues are
* interleaving requests (see cfq_close_cooperator).
*/
struct rb_root prio_trees[CFQ_PRIO_LISTS];
unsigned int busy_queues;
int rq_in_driver[2];
int sync_flight;
/*
* queue-depth detection
*/
int rq_queued;
int hw_tag;
int hw_tag_samples;
int rq_in_driver_peak;
/*
* idle window management
*/
struct timer_list idle_slice_timer;
struct work_struct unplug_work;
struct cfq_queue *active_queue;
struct cfq_io_context *active_cic;
/*
* async queue for each priority case
*/
struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
struct cfq_queue *async_idle_cfqq;
sector_t last_position;
/*
* tunables, see top of file
*/
unsigned int cfq_quantum;
unsigned int cfq_fifo_expire[2];
unsigned int cfq_back_penalty;
unsigned int cfq_back_max;
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
unsigned int cfq_latency;
struct list_head cic_list;
/*
* Fallback dummy cfqq for extreme OOM conditions
*/
struct cfq_queue oom_cfqq;
unsigned long last_end_sync_rq;
};
enum cfqq_state_flags {
CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
CFQ_CFQQ_FLAG_sync, /* synchronous queue */
CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
};
#define CFQ_CFQQ_FNS(name) \
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
{ \
(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
} \
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
{ \
(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
} \
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
{ \
return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
}
CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
#undef CFQ_CFQQ_FNS
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log(cfqd, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
struct io_context *, gfp_t);
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
struct io_context *);
static inline int rq_in_driver(struct cfq_data *cfqd)
{
return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
}
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
bool is_sync)
{
return cic->cfqq[is_sync];
}
static inline void cic_set_cfqq(struct cfq_io_context *cic,
struct cfq_queue *cfqq, bool is_sync)
{
cic->cfqq[is_sync] = cfqq;
}
/*
* We regard a request as SYNC, if it's either a read or has the SYNC bit
* set (in which case it could also be direct WRITE).
*/
static inline bool cfq_bio_sync(struct bio *bio)
{
return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
}
/*
* scheduler run of queue, if there are requests pending and no one in the
* driver that will restart queueing
*/
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
if (cfqd->busy_queues) {
cfq_log(cfqd, "schedule dispatch");
kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
}
}
static int cfq_queue_empty(struct request_queue *q)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
return !cfqd->busy_queues;
}
/*
* Scale schedule slice based on io priority. Use the sync time slice only
* if a queue is marked sync and has sync io queued. A sync queue with async
* io only, should not get full sync slice length.
*/
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
unsigned short prio)
{
const int base_slice = cfqd->cfq_slice[sync];
WARN_ON(prio >= IOPRIO_BE_NR);
return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}
/*
* We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
* isn't valid until the first request from the dispatch is activated
* and the slice time set.
*/
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
{
if (cfq_cfqq_slice_new(cfqq))
return 0;
if (time_before(jiffies, cfqq->slice_end))
return 0;
return 1;
}
/*
* Lifted from AS - choose which of rq1 and rq2 that is best served now.
* We choose the request that is closest to the head right now. Distance
* behind the head is penalized and only allowed to a certain extent.
*/
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
{
sector_t last, s1, s2, d1 = 0, d2 = 0;
unsigned long back_max;
#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
unsigned wrap = 0; /* bit mask: requests behind the disk head? */
if (rq1 == NULL || rq1 == rq2)
return rq2;
if (rq2 == NULL)
return rq1;
if (rq_is_sync(rq1) && !rq_is_sync(rq2))
return rq1;
else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
return rq2;
if (rq_is_meta(rq1) && !rq_is_meta(rq2))
return rq1;
else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
return rq2;
s1 = blk_rq_pos(rq1);
s2 = blk_rq_pos(rq2);
last = cfqd->last_position;
/*
* by definition, 1KiB is 2 sectors
*/
back_max = cfqd->cfq_back_max * 2;
/*
* Strict one way elevator _except_ in the case where we allow
* short backward seeks which are biased as twice the cost of a
* similar forward seek.
*/
if (s1 >= last)
d1 = s1 - last;
else if (s1 + back_max >= last)
d1 = (last - s1) * cfqd->cfq_back_penalty;
else
wrap |= CFQ_RQ1_WRAP;
if (s2 >= last)
d2 = s2 - last;
else if (s2 + back_max >= last)
d2 = (last - s2) * cfqd->cfq_back_penalty;
else
wrap |= CFQ_RQ2_WRAP;
/* Found required data */
/*
* By doing switch() on the bit mask "wrap" we avoid having to
* check two variables for all permutations: --> faster!
*/
switch (wrap) {
case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
if (d1 < d2)
return rq1;
else if (d2 < d1)
return rq2;
else {
if (s1 >= s2)
return rq1;
else
return rq2;
}
case CFQ_RQ2_WRAP:
return rq1;
case CFQ_RQ1_WRAP:
return rq2;
case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
default:
/*
* Since both rqs are wrapped,
* start with the one that's further behind head
* (--> only *one* back seek required),
* since back seek takes more time than forward.
*/
if (s1 <= s2)
return rq1;
else
return rq2;
}
}
/*
* The below is leftmost cache rbtree addon
*/
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
if (!root->left)
root->left = rb_first(&root->rb);
if (root->left)
return rb_entry(root->left, struct cfq_queue, rb_node);
return NULL;
}
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
rb_erase(n, root);
RB_CLEAR_NODE(n);
}
static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
if (root->left == n)
root->left = NULL;
rb_erase_init(n, &root->rb);
}
/*
* would be nice to take fifo expire time into account as well
*/
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct request *last)
{
struct rb_node *rbnext = rb_next(&last->rb_node);
struct rb_node *rbprev = rb_prev(&last->rb_node);
struct request *next = NULL, *prev = NULL;
BUG_ON(RB_EMPTY_NODE(&last->rb_node));
if (rbprev)
prev = rb_entry_rq(rbprev);
if (rbnext)
next = rb_entry_rq(rbnext);
else {
rbnext = rb_first(&cfqq->sort_list);
if (rbnext && rbnext != &last->rb_node)
next = rb_entry_rq(rbnext);
}
return cfq_choose_req(cfqd, next, prev);
}
static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
struct cfq_queue *cfqq)
{
/*
* just an approximation, should be ok.
*/
return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}
/*
* The cfqd->service_tree holds all pending cfq_queue's that have
* requests waiting to be processed. It is sorted in the order that
* we will service the queues.
*/
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
bool add_front)
{
struct rb_node **p, *parent;
struct cfq_queue *__cfqq;
unsigned long rb_key;
int left;
if (cfq_class_idle(cfqq)) {
rb_key = CFQ_IDLE_DELAY;
parent = rb_last(&cfqd->service_tree.rb);
if (parent && parent != &cfqq->rb_node) {
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
rb_key += __cfqq->rb_key;
} else
rb_key += jiffies;
} else if (!add_front) {
/*
* Get our rb key offset. Subtract any residual slice
* value carried from last service. A negative resid
* count indicates slice overrun, and this should position
* the next service time further away in the tree.
*/
rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
rb_key -= cfqq->slice_resid;
cfqq->slice_resid = 0;
} else {
rb_key = -HZ;
__cfqq = cfq_rb_first(&cfqd->service_tree);
rb_key += __cfqq ? __cfqq->rb_key : jiffies;
}
if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
/*
* same position, nothing more to do
*/
if (rb_key == cfqq->rb_key)
return;
cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
}
left = 1;
parent = NULL;
p = &cfqd->service_tree.rb.rb_node;
while (*p) {
struct rb_node **n;
parent = *p;
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
/*
* sort RT queues first, we always want to give
* preference to them. IDLE queues goes to the back.
* after that, sort on the next service time.
*/
if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
n = &(*p)->rb_left;
else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
n = &(*p)->rb_right;
else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
n = &(*p)->rb_left;
else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
n = &(*p)->rb_right;
else if (time_before(rb_key, __cfqq->rb_key))
n = &(*p)->rb_left;
else
n = &(*p)->rb_right;
if (n == &(*p)->rb_right)
left = 0;
p = n;
}
if (left)
cfqd->service_tree.left = &cfqq->rb_node;
cfqq->rb_key = rb_key;
rb_link_node(&cfqq->rb_node, parent, p);
rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
}
static struct cfq_queue *
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
sector_t sector, struct rb_node **ret_parent,
struct rb_node ***rb_link)
{
struct rb_node **p, *parent;
struct cfq_queue *cfqq = NULL;
parent = NULL;
p = &root->rb_node;
while (*p) {
struct rb_node **n;
parent = *p;
cfqq = rb_entry(parent, struct cfq_queue, p_node);
/*
* Sort strictly based on sector. Smallest to the left,
* largest to the right.
*/
if (sector > blk_rq_pos(cfqq->next_rq))
n = &(*p)->rb_right;
else if (sector < blk_rq_pos(cfqq->next_rq))
n = &(*p)->rb_left;
else
break;
p = n;
cfqq = NULL;
}
*ret_parent = parent;
if (rb_link)
*rb_link = p;
return cfqq;
}
static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
struct rb_node **p, *parent;
struct cfq_queue *__cfqq;
if (cfqq->p_root) {
rb_erase(&cfqq->p_node, cfqq->p_root);
cfqq->p_root = NULL;
}
if (cfq_class_idle(cfqq))
return;
if (!cfqq->next_rq)
return;
cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
blk_rq_pos(cfqq->next_rq), &parent, &p);
if (!__cfqq) {
rb_link_node(&cfqq->p_node, parent, p);
rb_insert_color(&cfqq->p_node, cfqq->p_root);
} else
cfqq->p_root = NULL;
}
/*
* Update cfqq's position in the service tree.
*/
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
/*
* Resorting requires the cfqq to be on the RR list already.
*/
if (cfq_cfqq_on_rr(cfqq)) {
cfq_service_tree_add(cfqd, cfqq, 0);
cfq_prio_tree_add(cfqd, cfqq);
}
}
/*
* add to busy list of queues for service, trying to be fair in ordering
* the pending list according to last request service
*/
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
BUG_ON(cfq_cfqq_on_rr(cfqq));
cfq_mark_cfqq_on_rr(cfqq);
|