From fba822722e3f9d438fca8fd9541d7ddd447d7a48 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 18 Apr 2006 09:44:06 +0200 Subject: [PATCH 1/2] iosched: fix typo and barrier() On the rmmod path, cfq/as waits to make sure all io-contexts have been freed. However, it's using complete(), not wait_for_completion(). I think barrier() is not enough here. To avoid the following case, this patch replaces barrier() with smp_wmb(). cpu0 visibility cpu1 [ioc_gone=NULL,ioc_count=1] ioc_gone = &all_gone NULL,ioc_count=1 atomic_read(&ioc_count) NULL,ioc_count=1 wait_for_completion() NULL,ioc_count=0 atomic_sub_and_test() NULL,ioc_count=0 if ( && ioc_gone) [ioc_gone==NULL, so doesn't call complete()] &all_gone,ioc_count=0 Signed-off-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 67d446de0227..01820b1094e9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2439,9 +2439,10 @@ static void __exit cfq_exit(void) DECLARE_COMPLETION(all_gone); elv_unregister(&iosched_cfq); ioc_gone = &all_gone; - barrier(); + /* ioc_gone's update must be visible before reading ioc_count */ + smp_wmb(); if (atomic_read(&ioc_count)) - complete(ioc_gone); + wait_for_completion(ioc_gone); synchronize_rcu(); cfq_slab_kill(); } -- cgit v1.2.2 From dbecf3ab40b5a6cc4499543778cd9f9682c0abad Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 18 Apr 2006 09:45:18 +0200 Subject: [PATCH 2/2] cfq: fix cic's rbtree traversal When a queue dies, we set cic->key=NULL as a dead mark. So, when we traverse the rbtree, we must check whether the key is still valid; if it has been invalidated, drop the cic, then restart the traversal from the top.
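As an editorial aside (not part of the patch): the lookup pattern just described, where a dead entry is dropped and the walk restarts from the top, is easiest to see in a minimal userspace sketch. A singly linked list stands in for the kernel rbtree to keep it short, locking is omitted, and every name below is made up for illustration.

#include <stdio.h>
#include <stdlib.h>

struct cic {
        void *key;                      /* NULL marks a dead entry */
        struct cic *next;
};

struct ioc {
        struct cic *head;
};

static void drop_dead_cic(struct ioc *ioc, struct cic *dead)
{
        struct cic **pp = &ioc->head;

        while (*pp && *pp != dead)
                pp = &(*pp)->next;
        if (*pp) {
                *pp = dead->next;
                free(dead);
        }
}

static struct cic *cic_lookup(struct ioc *ioc, void *key)
{
        struct cic *c;

restart:
        for (c = ioc->head; c; c = c->next) {
                if (!c->key) {
                        /* dead entry: unlink and free it, then restart */
                        drop_dead_cic(ioc, c);
                        goto restart;
                }
                if (c->key == key)
                        return c;
        }
        return NULL;
}

int main(void)
{
        int k1, k2;
        struct cic *a = calloc(1, sizeof(*a));
        struct cic *b = calloc(1, sizeof(*b));
        struct ioc ioc = { .head = a };

        if (!a || !b)
                return 1;
        a->key = &k1;
        a->next = b;
        b->key = &k2;

        a->key = NULL;  /* pretend the queue behind k1 has exited */
        printf("k2 found: %s\n", cic_lookup(&ioc, &k2) ? "yes" : "no");
        printf("k1 found: %s\n", cic_lookup(&ioc, &k1) ? "yes" : "no");
        return 0;
}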
Signed-off-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 01820b1094e9..246feae16c60 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1472,15 +1472,31 @@ out: return cfqq; } +static void +cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) +{ + read_lock(&cfq_exit_lock); + rb_erase(&cic->rb_node, &ioc->cic_root); + read_unlock(&cfq_exit_lock); + kmem_cache_free(cfq_ioc_pool, cic); + atomic_dec(&ioc_count); +} + static struct cfq_io_context * cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) { - struct rb_node *n = ioc->cic_root.rb_node; + struct rb_node *n; struct cfq_io_context *cic; void *key = cfqd; +restart: + n = ioc->cic_root.rb_node; while (n) { cic = rb_entry(n, struct cfq_io_context, rb_node); + if (unlikely(!cic->key)) { + cfq_drop_dead_cic(ioc, cic); + goto restart; + } if (key < cic->key) n = n->rb_left; @@ -1497,20 +1513,24 @@ static inline void cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, struct cfq_io_context *cic) { - struct rb_node **p = &ioc->cic_root.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p; + struct rb_node *parent; struct cfq_io_context *__cic; - read_lock(&cfq_exit_lock); - cic->ioc = ioc; cic->key = cfqd; ioc->set_ioprio = cfq_ioc_set_ioprio; - +restart: + parent = NULL; + p = &ioc->cic_root.rb_node; while (*p) { parent = *p; __cic = rb_entry(parent, struct cfq_io_context, rb_node); + if (unlikely(!__cic->key)) { + cfq_drop_dead_cic(ioc, cic); + goto restart; + } if (cic->key < __cic->key) p = &(*p)->rb_left; @@ -1520,6 +1540,7 @@ cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, BUG(); } + read_lock(&cfq_exit_lock); rb_link_node(&cic->rb_node, parent, p); rb_insert_color(&cic->rb_node, &ioc->cic_root); list_add(&cic->queue_list, &cfqd->cic_list); -- cgit v1.2.2 From be3b075354e170368a0d29558cae492205e80a64 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 18 Apr 2006 19:18:31 +0200 Subject: [PATCH] cfq: Further rbtree traversal and cfq_exit_queue() race fix In the current code, we re-read cic->key after the dead cic->key check. So, in theory, the re-read may really happen *after* cfq_exit_queue() has set it to NULL. To avoid the race, we copy it to the stack, then use that copy. With this change, gcc should keep cic->key in a register or on the stack, so it won't be re-read.
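To make the race concrete, here is an editorial sketch (made-up names, not the CFQ code) of the two shapes. With plain reads the compiler is allowed to reload ->key between the NULL check and the comparisons, so the descent decision can be made against a value that has just become NULL; copying the field to a local first expresses the single-read intent, and modern kernel code would typically go further and annotate that load with READ_ONCE().

#include <stdint.h>

struct entry {
        void *key;      /* another CPU may clear this to NULL at any time */
};

/*
 * Racy shape: ->key is read again after the NULL check, so the descent
 * decision may be made against a value that has since become NULL.
 */
static int descend_racy(struct entry *e, void *key)
{
        if (!e->key)                                    /* read #1 */
                return 0;                               /* dead entry */
        if ((uintptr_t)key < (uintptr_t)e->key)         /* read #2 */
                return -1;                              /* go left */
        if ((uintptr_t)key > (uintptr_t)e->key)         /* read #3 */
                return 1;                               /* go right */
        return 2;                                       /* match */
}

/*
 * Fixed shape, mirroring the "k = cic->key" copy in the patch: load the
 * field once and make every decision against that one snapshot.
 */
static int descend(struct entry *e, void *key)
{
        void *k = e->key;                               /* single read */

        if (!k)
                return 0;
        if ((uintptr_t)key < (uintptr_t)k)
                return -1;
        if ((uintptr_t)key > (uintptr_t)k)
                return 1;
        return 2;
}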
Signed-off-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 246feae16c60..2540dfaa3e38 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1487,20 +1487,22 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct rb_node *n; struct cfq_io_context *cic; - void *key = cfqd; + void *k, *key = cfqd; restart: n = ioc->cic_root.rb_node; while (n) { cic = rb_entry(n, struct cfq_io_context, rb_node); - if (unlikely(!cic->key)) { + /* ->key must be copied to avoid race with cfq_exit_queue() */ + k = cic->key; + if (unlikely(!k)) { cfq_drop_dead_cic(ioc, cic); goto restart; } - if (key < cic->key) + if (key < k) n = n->rb_left; - else if (key > cic->key) + else if (key > k) n = n->rb_right; else return cic; @@ -1516,6 +1518,7 @@ cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, struct rb_node **p; struct rb_node *parent; struct cfq_io_context *__cic; + void *k; cic->ioc = ioc; cic->key = cfqd; @@ -1527,14 +1530,16 @@ restart: while (*p) { parent = *p; __cic = rb_entry(parent, struct cfq_io_context, rb_node); - if (unlikely(!__cic->key)) { + /* ->key must be copied to avoid race with cfq_exit_queue() */ + k = __cic->key; + if (unlikely(!k)) { cfq_drop_dead_cic(ioc, cic); goto restart; } - if (cic->key < __cic->key) + if (cic->key < k) p = &(*p)->rb_left; - else if (cic->key > __cic->key) + else if (cic->key > k) p = &(*p)->rb_right; else BUG(); -- cgit v1.2.2 From 3793c65c13e4751c7a10f98198bae1758453eb0e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 May 2006 21:11:04 +0200 Subject: [PATCH] cfq-iosched: fixup locking and ->queue_list list management - Drop cic from the list when seen as dead. - Fixup the locking, just use a simple spinlock. 
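The shape of the fixed-up drop path, as an editorial sketch with userspace stand-ins (a pthread mutex for the spinlock, stub helpers for rb_erase() and list_del_init(); names are made up): both unlinks happen under the one lock, and the entry is freed only once nothing can reach it from either the per-ioc tree or the per-queue list. A reader/writer lock no longer buys anything here, since even the lookup paths modify the tree when they find a dead entry, which is presumably why a plain spinlock is used instead.

#include <pthread.h>
#include <stdlib.h>

struct dead_entry;                              /* opaque stand-in for a cic */
void tree_erase(struct dead_entry *e);          /* think rb_erase()          */
void queue_list_unlink(struct dead_entry *e);   /* think list_del_init()     */

static pthread_mutex_t exit_lock = PTHREAD_MUTEX_INITIALIZER;

static void drop_dead_entry(struct dead_entry *e)
{
        pthread_mutex_lock(&exit_lock);
        tree_erase(e);          /* remove from the per-ioc lookup tree    */
        queue_list_unlink(e);   /* and from the per-queue list, together  */
        pthread_mutex_unlock(&exit_lock);
        free(e);                /* free only after it is unreachable      */
}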
Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/cfq-iosched.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2540dfaa3e38..11ce6aaf1bd0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -33,7 +33,7 @@ static int cfq_slice_idle = HZ / 70; #define CFQ_KEY_ASYNC (0) -static DEFINE_RWLOCK(cfq_exit_lock); +static DEFINE_SPINLOCK(cfq_exit_lock); /* * for the hash of cfqq inside the cfqd @@ -1284,7 +1284,7 @@ static void cfq_exit_io_context(struct io_context *ioc) /* * put the reference this task is holding to the various queues */ - read_lock_irqsave(&cfq_exit_lock, flags); + spin_lock_irqsave(&cfq_exit_lock, flags); n = rb_first(&ioc->cic_root); while (n != NULL) { @@ -1294,7 +1294,7 @@ static void cfq_exit_io_context(struct io_context *ioc) n = rb_next(n); } - read_unlock_irqrestore(&cfq_exit_lock, flags); + spin_unlock_irqrestore(&cfq_exit_lock, flags); } static struct cfq_io_context * @@ -1400,17 +1400,17 @@ static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) struct cfq_io_context *cic; struct rb_node *n; - write_lock(&cfq_exit_lock); + spin_lock(&cfq_exit_lock); n = rb_first(&ioc->cic_root); while (n != NULL) { cic = rb_entry(n, struct cfq_io_context, rb_node); - + changed_ioprio(cic); n = rb_next(n); } - write_unlock(&cfq_exit_lock); + spin_unlock(&cfq_exit_lock); return 0; } @@ -1475,9 +1475,10 @@ out: static void cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) { - read_lock(&cfq_exit_lock); + spin_lock(&cfq_exit_lock); rb_erase(&cic->rb_node, &ioc->cic_root); - read_unlock(&cfq_exit_lock); + list_del_init(&cic->queue_list); + spin_unlock(&cfq_exit_lock); kmem_cache_free(cfq_ioc_pool, cic); atomic_dec(&ioc_count); } @@ -1545,11 +1546,11 @@ restart: BUG(); } - read_lock(&cfq_exit_lock); + spin_lock(&cfq_exit_lock); rb_link_node(&cic->rb_node, parent, p); rb_insert_color(&cic->rb_node, &ioc->cic_root); list_add(&cic->queue_list, &cfqd->cic_list); - read_unlock(&cfq_exit_lock); + spin_unlock(&cfq_exit_lock); } /* @@ -2187,7 +2188,7 @@ static void cfq_exit_queue(elevator_t *e) cfq_shutdown_timer_wq(cfqd); - write_lock(&cfq_exit_lock); + spin_lock(&cfq_exit_lock); spin_lock_irq(q->queue_lock); if (cfqd->active_queue) @@ -2210,7 +2211,7 @@ static void cfq_exit_queue(elevator_t *e) } spin_unlock_irq(q->queue_lock); - write_unlock(&cfq_exit_lock); + spin_unlock(&cfq_exit_lock); cfq_shutdown_timer_wq(cfqd); -- cgit v1.2.2 From e0de0206a2a37cd3e0ba9954d9f863e11d6d1782 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Jun 2006 10:07:26 +0200 Subject: [PATCH] cfq-iosched: check busy queues before deciding we are idle For just one busy queue (like async write out), we often overlooked that we could queue more io and decided we were idle instead. This causes us quite a bit of performance loss. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 11ce6aaf1bd0..d582433441c5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -878,6 +878,13 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) cfqq = list_entry_cfqq(cfqd->cur_rr.next); + /* + * If no new queues are available, check if the busy list has some + * before falling back to idle io. 
+ */ + if (!cfqq && !list_empty(&cfqd->busy_rr)) + cfqq = list_entry_cfqq(cfqd->busy_rr.next); + /* * if we have idle queues and no rt or be queues had pending * requests, either allow immediate service if the grace period -- cgit v1.2.2 From 12e9fddd6eb827937fcaac8ac7712c7303898b1f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Jun 2006 10:09:56 +0200 Subject: [PATCH] cfq-iosched: Detect idle process issuing async request If we are anticipating a sync request from this process and we are waiting for that and see an async request come in, expire that slice and move on. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d582433441c5..93ba54605eda 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1747,14 +1747,24 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); + cic = crq->io_context; + /* * we never wait for an async request and we don't allow preemption * of an async request. so just return early */ - if (!cfq_crq_is_sync(crq)) + if (!cfq_crq_is_sync(crq)) { + /* + * sync process issued an async request, if it's waiting + * then expire it and kick rq handling. + */ + if (cic == cfqd->active_cic && + del_timer(&cfqd->idle_slice_timer)) { + cfq_slice_expired(cfqd, 0); + cfq_start_queueing(cfqd, cfqq); + } return; - - cic = crq->io_context; + } cfq_update_io_thinktime(cfqd, cic); cfq_update_io_seektime(cfqd, cic, crq); -- cgit v1.2.2 From 25776e3594f841b7fae7b33ebecf009a0a55bed1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Jun 2006 10:12:26 +0200 Subject: [PATCH] cfq-iosched: Detect hardware queueing If the hardware is doing real queueing, decide that it's worthless to idle the hardware. It does reasonable simultaneous io in that case anyways, and the idling hurts some work loads. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 93ba54605eda..5d2047b93eb5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -133,6 +133,7 @@ struct cfq_data { mempool_t *crq_pool; int rq_in_driver; + int hw_tag; /* * schedule slice state info @@ -664,6 +665,15 @@ static void cfq_activate_request(request_queue_t *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; cfqd->rq_in_driver++; + + /* + * If the depth is larger 1, it really could be queueing. But lets + * make the mark a little higher - idling could still be good for + * low queueing, and a low queueing number could also just indicate + * a SCSI mid layer like behaviour where limit+1 is often seen. 
+ */ + if (!cfqd->hw_tag && cfqd->rq_in_driver > 4) + cfqd->hw_tag = 1; } static void cfq_deactivate_request(request_queue_t *q, struct request *rq) @@ -1465,7 +1475,8 @@ retry: * set ->slice_left to allow preemption for a new process */ cfqq->slice_left = 2 * cfqd->cfq_slice_idle; - cfq_mark_cfqq_idle_window(cfqq); + if (!cfqd->hw_tag) + cfq_mark_cfqq_idle_window(cfqq); cfq_mark_cfqq_prio_changed(cfqq); cfq_init_prio_data(cfqq); } @@ -1656,7 +1667,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, { int enable_idle = cfq_cfqq_idle_window(cfqq); - if (!cic->ioc->task || !cfqd->cfq_slice_idle) + if (!cic->ioc->task || !cfqd->cfq_slice_idle || cfqd->hw_tag) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { if (cic->ttime_mean > cfqd->cfq_slice_idle) -- cgit v1.2.2 From ae818a38d4755ba4c16a22a8eacec859511a5393 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Jun 2006 10:13:43 +0200 Subject: [PATCH] cfq-iosched: fix bug in timer handling for the idle class There's a small window from when the timer is entered and we grab the queue lock, where cfq_set_active_queue() could be rearming the timer for us. Seen in the wild on a 12-way ppc box. Fix this by just using mod_timer(), which will do the right thing for us. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5d2047b93eb5..85d188a30f82 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2193,10 +2193,9 @@ static void cfq_idle_class_timer(unsigned long data) * race with a non-idle queue, reset timer */ end = cfqd->last_end_request + CFQ_IDLE_GRACE; - if (!time_after_eq(jiffies, end)) { - cfqd->idle_class_timer.expires = end; - add_timer(&cfqd->idle_class_timer); - } else + if (!time_after_eq(jiffies, end)) + mod_timer(&cfqd->idle_class_timer, end); + else cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); -- cgit v1.2.2 From b52a834892f17b6c54c34ab65f1fad1a9229e764 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Jun 2006 18:53:43 +0200 Subject: [PATCH] cfq-iosched: busy_rr fairness fix Now that we select busy_rr for possible service, insert entries at the back of that list instead of at the front. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 85d188a30f82..8e9d84825e1c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -501,10 +501,13 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) /* * if queue was preempted, just add to front to be fair. busy_rr - * isn't sorted. + * isn't sorted, but insert at the back for fairness. */ if (preempted || list == &cfqd->busy_rr) { - list_add(&cfqq->cfq_list, list); + if (preempted) + list = list->prev; + + list_add_tail(&cfqq->cfq_list, list); return; } -- cgit v1.2.2 From bc1c116974a5c3f498112a6f175d3e4a8cd5bdbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Jun 2006 08:49:06 +0200 Subject: [PATCH] elevator switching race There's a race between shutting down one io scheduler and firing up the next, in which a new io could enter and cause the io scheduler to be invoked with bad or NULL data. To fix this, we need to maintain the queue lock for a bit longer. Unfortunately we cannot do that, since the elevator init requires to be run without the lock held. 
This isn't easily fixable without also changing the mempool API. So split the initialization into two parts: an alloc-init operation and an attach operation. Then we can preallocate the io scheduler and related structures, and run the attach inside the lock after we detach the old one. This patch has survived 30 minutes of 1-second io scheduler switching with a very busy io load. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/cfq-iosched.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block/cfq-iosched.c') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8e9d84825e1c..a46d030e092a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2251,14 +2251,14 @@ static void cfq_exit_queue(elevator_t *e) kfree(cfqd); } -static int cfq_init_queue(request_queue_t *q, elevator_t *e) +static void *cfq_init_queue(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; int i; cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); if (!cfqd) - return -ENOMEM; + return NULL; memset(cfqd, 0, sizeof(*cfqd)); @@ -2288,8 +2288,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) for (i = 0; i < CFQ_QHASH_ENTRIES; i++) INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - e->elevator_data = cfqd; - cfqd->queue = q; cfqd->max_queued = q->nr_requests / 4; @@ -2316,14 +2314,14 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; - return 0; + return cfqd; out_crqpool: kfree(cfqd->cfq_hash); out_cfqhash: kfree(cfqd->crq_hash); out_crqhash: kfree(cfqd); - return -ENOMEM; + return NULL; } static void cfq_slab_kill(void) -- cgit v1.2.2
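Reduced to its shape, the split looks like the following editorial sketch (made-up names, malloc in place of the kernel allocators): the alloc-init step does everything that may sleep or fail and runs with no lock held, while the attach step that publishes the new scheduler to the queue is a cheap assignment done while the elevator core holds the queue lock.

#include <stdlib.h>
#include <string.h>

struct sched_data {
        /* private io scheduler state: hash tables, mempools, timers, ... */
        int nr_queued;
};

struct queue {
        void *elevator_data;    /* what the request path dereferences */
};

/* Phase 1: may sleep and may fail; runs without the queue lock held. */
static struct sched_data *sched_alloc_init(void)
{
        struct sched_data *d = malloc(sizeof(*d));

        if (!d)
                return NULL;
        memset(d, 0, sizeof(*d));
        /* set up mempools, hash tables and timers here ... */
        return d;
}

/* Phase 2: cheap and cannot fail; the caller holds the queue lock. */
static void sched_attach(struct queue *q, struct sched_data *d)
{
        q->elevator_data = d;
}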