From bca4b914b5da3d8e7b9b647f620b71dc85c0c394 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 20 May 2010 23:21:34 +0400 Subject: cfq-iosched: remove dead_key from cfq_io_context Remove ->dead_key field from cfq_io_context to shrink its size to 128 bytes. (64 bytes for 32-bit hosts) Use lower bit in ->key as dead-mark, instead of moving key to separate field. After this for dead cfq_io_context we got cic->key != cfqd automatically. Thus, io_context's last-hit cache should work without changing. Now to check ->key for non-dead state compare it with cfqd, instead of checking ->key for non-null value as it was before. Plus remove obsolete race protection in cfq_cic_lookup. This race gone after v2.6.24-1728-g4ac845a Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ed897b5ef315..407602350350 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -430,6 +430,23 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, cic->cfqq[is_sync] = cfqq; } +#define CIC_DEAD_KEY 1ul + +static inline void *cfqd_dead_key(struct cfq_data *cfqd) +{ + return (void *)((unsigned long) cfqd | CIC_DEAD_KEY); +} + +static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) +{ + struct cfq_data *cfqd = cic->key; + + if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) + return NULL; + + return cfqd; +} + /* * We regard a request as SYNC, if it's either a read or has the SYNC bit * set (in which case it could also be direct WRITE). @@ -2510,11 +2527,12 @@ static void cfq_cic_free(struct cfq_io_context *cic) static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) { unsigned long flags; + unsigned long dead_key = (unsigned long) cic->key; - BUG_ON(!cic->dead_key); + BUG_ON(!(dead_key & CIC_DEAD_KEY)); spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, cic->dead_key); + radix_tree_delete(&ioc->radix_root, dead_key & ~CIC_DEAD_KEY); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2573,11 +2591,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, list_del_init(&cic->queue_list); /* - * Make sure key == NULL is seen for dead queues + * Make sure dead mark is seen for dead queues */ smp_wmb(); - cic->dead_key = (unsigned long) cic->key; - cic->key = NULL; + cic->key = cfqd_dead_key(cfqd); if (ioc->ioc_data == cic) rcu_assign_pointer(ioc->ioc_data, NULL); @@ -2596,7 +2613,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, static void cfq_exit_single_io_context(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); if (cfqd) { struct request_queue *q = cfqd->queue; @@ -2609,7 +2626,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc, * race between exiting task and queue */ smp_read_barrier_depends(); - if (cic->key) + if (cic->key == cfqd) __cfq_exit_single_io_context(cfqd, cic); spin_unlock_irqrestore(q->queue_lock, flags); @@ -2689,7 +2706,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; unsigned long flags; @@ -2746,7 +2763,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) { struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); unsigned long flags; struct request_queue *q; @@ -2883,6 +2900,7 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, unsigned long flags; WARN_ON(!list_empty(&cic->queue_list)); + BUG_ON(cic->key != cfqd_dead_key(cfqd)); spin_lock_irqsave(&ioc->lock, flags); @@ -2900,7 +2918,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; unsigned long flags; - void *k; if (unlikely(!ioc)) return NULL; @@ -2921,9 +2938,7 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) rcu_read_unlock(); if (!cic) break; - /* ->key must be copied to avoid race with cfq_exit_queue() */ - k = cic->key; - if (unlikely(!k)) { + if (unlikely(cic->key != cfqd)) { cfq_drop_dead_cic(cfqd, ioc, cic); rcu_read_lock(); continue; -- cgit v1.2.2 From 80b15c7389caa81a3860f9fc2ee47ec0ea572a63 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 20 May 2010 23:21:41 +0400 Subject: cfq-iosched: compact io_context radix_tree Use small consequent indexes as radix tree keys instead of sparse cfqd address. This change will reduce radix tree depth from 11 (6 for 32-bit hosts) to 1 if host have <=64 disks under cfq control, or to 0 if there only one disk. So, this patch save 10*560 bytes for each process (5*296 for 32-bit hosts) For each cfqd allocate cic index from ida. To unlink dead cic from tree without cfqd access store index into ->key. (bit 0 -- dead mark, bits 1..30 -- index: ida produce id in range 0..2^31-1) Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 407602350350..c72e5acc573d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -64,6 +64,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); +static DEFINE_SPINLOCK(cic_index_lock); +static DEFINE_IDA(cic_index_ida); + #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) @@ -271,6 +274,7 @@ struct cfq_data { unsigned int cfq_latency; unsigned int cfq_group_isolation; + unsigned int cic_index; struct list_head cic_list; /* @@ -431,10 +435,11 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, } #define CIC_DEAD_KEY 1ul +#define CIC_DEAD_INDEX_SHIFT 1 static inline void *cfqd_dead_key(struct cfq_data *cfqd) { - return (void *)((unsigned long) cfqd | CIC_DEAD_KEY); + return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); } static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) @@ -2532,7 +2537,7 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) BUG_ON(!(dead_key & CIC_DEAD_KEY)); spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, dead_key & ~CIC_DEAD_KEY); + radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2906,7 +2911,7 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, BUG_ON(ioc->ioc_data == cic); - radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); + radix_tree_delete(&ioc->radix_root, cfqd->cic_index); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2934,7 +2939,7 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) } do { - cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); + cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); rcu_read_unlock(); if (!cic) break; @@ -2971,7 +2976,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, spin_lock_irqsave(&ioc->lock, flags); ret = radix_tree_insert(&ioc->radix_root, - (unsigned long) cfqd, cic); + cfqd->cic_index, cic); if (!ret) hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -3723,10 +3728,32 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_shutdown_timer_wq(cfqd); + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, cfqd->cic_index); + spin_unlock(&cic_index_lock); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ call_rcu(&cfqd->rcu, cfq_cfqd_free); } +static int cfq_alloc_cic_index(void) +{ + int index, error; + + do { + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&cic_index_lock); + error = ida_get_new(&cic_index_ida, &index); + spin_unlock(&cic_index_lock); + if (error && error != -EAGAIN) + return error; + } while (error); + + return index; +} + static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; @@ -3734,10 +3761,16 @@ static void *cfq_init_queue(struct request_queue *q) struct cfq_group *cfqg; struct cfq_rb_root *st; + i = cfq_alloc_cic_index(); + if (i < 0) + return NULL; + cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + cfqd->cic_index = i; + /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; @@ -3999,6 +4032,7 @@ static void __exit cfq_exit(void) */ if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); + ida_destroy(&cic_index_ida); cfq_slab_kill(); } -- cgit v1.2.2 From e36f724b4ae70e443a7d152929b60059cbfa1a26 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 24 May 2010 09:07:32 +0200 Subject: block: Adjust elv_iosched_show to return "none" for bio-based DM Bio-based DM doesn't use an elevator (queue is !blk_queue_stackable()). Longer-term DM will not allocate an elevator for bio-based DM. But even then there will be small potential for an elevator to be allocated for a request-based DM table only to have a bio-based table be loaded in the end. Displaying "none" for bio-based DM will help avoid user confusion. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 6df2b5056b51..0abce473d606 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1097,7 +1097,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!q->elevator) + if (!q->elevator || !blk_queue_stackable(q)) return sprintf(name, "none\n"); elv = e->elevator_type; -- cgit v1.2.2 From d02a2c077fb81f3224c770be62a318165b23b486 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 25 May 2010 10:16:53 +0200 Subject: cfq-iosched: fix an oops caused by slab leak I got below oops when unloading cfq-iosched. Considering scenario: queue A merge to B, C merge to D and B will be merged to D. Before B is merged to D, we do split B. We should put B's reference for D. [ 807.768536] ============================================================================= [ 807.768539] BUG cfq_queue: Objects remaining on kmem_cache_close() [ 807.768541] ----------------------------------------------------------------------------- [ 807.768543] [ 807.768546] INFO: Slab 0xffffea0003e6b4e0 objects=26 used=1 fp=0xffff88011d584fd8 flags=0x200000000004082 [ 807.768550] Pid: 5946, comm: rmmod Tainted: G W 2.6.34-07097-gf4b87de-dirty #724 [ 807.768552] Call Trace: [ 807.768560] [] slab_err+0x8f/0x9d [ 807.768564] [] ? flush_cpu_slab+0x0/0x93 [ 807.768569] [] ? add_preempt_count+0xe/0xca [ 807.768572] [] ? sub_preempt_count+0xe/0xb6 [ 807.768577] [] ? _raw_spin_unlock+0x15/0x30 [ 807.768580] [] ? sub_preempt_count+0xe/0xb6 [ 807.768584] [] list_slab_objects+0x9b/0x19f [ 807.768588] [] ? add_preempt_count+0xc6/0xca [ 807.768591] [] kmem_cache_destroy+0x13f/0x21d [ 807.768597] [] cfq_slab_kill+0x1a/0x43 [cfq_iosched] [ 807.768601] [] cfq_exit+0x93/0x9e [cfq_iosched] [ 807.768606] [] sys_delete_module+0x1b1/0x219 [ 807.768612] [] system_call_fastpath+0x16/0x1b [ 807.768618] INFO: Object 0xffff88011d584618 @offset=1560 [ 807.768622] INFO: Allocated in cfq_get_queue+0x11e/0x274 [cfq_iosched] age=7173 cpu=1 pid=5496 [ 807.768626] ============================================================================= Cc: stable@kernel.org Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c72e5acc573d..5ff4f4850e71 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2560,15 +2560,10 @@ static void cfq_free_io_context(struct io_context *ioc) __call_for_each_cic(ioc, cic_free_func); } -static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_put_cooperator(struct cfq_queue *cfqq) { struct cfq_queue *__cfqq, *next; - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - /* * If this queue was scheduled to merge with another queue, be * sure to drop the reference taken on that queue (and others in @@ -2584,6 +2579,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_put_queue(__cfqq); __cfqq = next; } +} + +static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (unlikely(cfqq == cfqd->active_queue)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); + } + + cfq_put_cooperator(cfqq); cfq_put_queue(cfqq); } @@ -3536,6 +3541,9 @@ split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) } cic_set_cfqq(cic, NULL, 1); + + cfq_put_cooperator(cfqq); + cfq_put_queue(cfqq); return NULL; } -- cgit v1.2.2 From c86d1b8ae622e1ea5d20e98bd72fbd7d9dd69016 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Jun 2010 11:34:52 -0600 Subject: block: avoid unconditionally freeing previously allocated request_queue On blk_init_allocated_queue_node failure, only free the request_queue if it is wasn't previously allocated outside the block layer (e.g. blk_init_queue_node was blk_init_allocated_queue_node caller). This addresses an interface bug introduced by the following commit: 01effb0 block: allow initialization of previously allocated request_queue Otherwise the request_queue may be free'd out from underneath a caller that is managing the request_queue directly (e.g. caller uses blk_alloc_queue + blk_init_allocated_queue_node). Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3bc5579d6f54..826d07078902 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -570,9 +570,17 @@ EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { - struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); + struct request_queue *uninit_q, *q; - return blk_init_allocated_queue_node(q, rfn, lock, node_id); + uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); + if (!uninit_q) + return NULL; + + q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + if (!q) + blk_cleanup_queue(uninit_q); + + return q; } EXPORT_SYMBOL(blk_init_queue_node); @@ -592,10 +600,8 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, return NULL; q->node = node_id; - if (blk_init_free_list(q)) { - kmem_cache_free(blk_requestq_cachep, q); + if (blk_init_free_list(q)) return NULL; - } q->request_fn = rfn; q->prep_rq_fn = NULL; @@ -618,7 +624,6 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, return q; } - blk_put_queue(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue_node); -- cgit v1.2.2 From 1abec4fdbb142e3ccb6ce99832fae42129134a96 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 25 May 2010 13:15:15 -0400 Subject: block: make blk_init_free_list and elevator_init idempotent blk_init_allocated_queue_node may fail and the caller _could_ retry. Accommodate the unlikely event that blk_init_allocated_queue_node is called on an already initialized (possibly partially) request_queue. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ block/elevator.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 826d07078902..f84cce42fc58 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -467,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q) { struct request_list *rl = &q->rq; + if (unlikely(rl->rq_pool)) + return 0; + rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; rl->elvpriv = 0; diff --git a/block/elevator.c b/block/elevator.c index 0abce473d606..923a9139106c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -242,9 +242,11 @@ int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; struct elevator_queue *eq; - int ret = 0; void *data; + if (unlikely(q->elevator)) + return 0; + INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->end_sector = 0; @@ -284,7 +286,7 @@ int elevator_init(struct request_queue *q, char *name) } elevator_attach(q, eq, data); - return ret; + return 0; } EXPORT_SYMBOL(elevator_init); -- cgit v1.2.2