From a1ed5b0cffe4b16a93a6a3390e8cee0fbef94f86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:50:16 +0200 Subject: klist: don't iterate over deleted entries A klist entry is kept on the list till all its current iterations are finished; however, a new iteration after deletion also iterates over deleted entries as long as their reference count stays above zero. This causes problems for cases where there are users which iterate over the list while synchronized against list manipulations and natuarally expect already deleted entries to not show up during iteration. This patch implements dead flag which gets set on deletion so that iteration can skip already deleted entries. The dead flag piggy backs on the lowest bit of knode->n_klist and only visible to klist implementation proper. While at it, drop klist_iter->i_head as it's redundant and doesn't offer anything in semantics or performance wise as klist_iter->i_klist is dereferenced on every iteration anyway. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Jens Axboe Signed-off-by: Jens Axboe --- lib/klist.c | 96 ++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 70 insertions(+), 26 deletions(-) (limited to 'lib') diff --git a/lib/klist.c b/lib/klist.c index cca37f96faa2..bbdd3015c2c7 100644 --- a/lib/klist.c +++ b/lib/klist.c @@ -37,6 +37,37 @@ #include #include +/* + * Use the lowest bit of n_klist to mark deleted nodes and exclude + * dead ones from iteration. + */ +#define KNODE_DEAD 1LU +#define KNODE_KLIST_MASK ~KNODE_DEAD + +static struct klist *knode_klist(struct klist_node *knode) +{ + return (struct klist *) + ((unsigned long)knode->n_klist & KNODE_KLIST_MASK); +} + +static bool knode_dead(struct klist_node *knode) +{ + return (unsigned long)knode->n_klist & KNODE_DEAD; +} + +static void knode_set_klist(struct klist_node *knode, struct klist *klist) +{ + knode->n_klist = klist; + /* no knode deserves to start its life dead */ + WARN_ON(knode_dead(knode)); +} + +static void knode_kill(struct klist_node *knode) +{ + /* and no knode should die twice ever either, see we're very humane */ + WARN_ON(knode_dead(knode)); + *(unsigned long *)&knode->n_klist |= KNODE_DEAD; +} /** * klist_init - Initialize a klist structure. @@ -79,7 +110,7 @@ static void klist_node_init(struct klist *k, struct klist_node *n) INIT_LIST_HEAD(&n->n_node); init_completion(&n->n_removed); kref_init(&n->n_ref); - n->n_klist = k; + knode_set_klist(n, k); if (k->get) k->get(n); } @@ -115,7 +146,7 @@ EXPORT_SYMBOL_GPL(klist_add_tail); */ void klist_add_after(struct klist_node *n, struct klist_node *pos) { - struct klist *k = pos->n_klist; + struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); @@ -131,7 +162,7 @@ EXPORT_SYMBOL_GPL(klist_add_after); */ void klist_add_before(struct klist_node *n, struct klist_node *pos) { - struct klist *k = pos->n_klist; + struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); @@ -144,9 +175,10 @@ static void klist_release(struct kref *kref) { struct klist_node *n = container_of(kref, struct klist_node, n_ref); + WARN_ON(!knode_dead(n)); list_del(&n->n_node); complete(&n->n_removed); - n->n_klist = NULL; + knode_set_klist(n, NULL); } static int klist_dec_and_del(struct klist_node *n) @@ -154,22 +186,29 @@ static int klist_dec_and_del(struct klist_node *n) return kref_put(&n->n_ref, klist_release); } -/** - * klist_del - Decrement the reference count of node and try to remove. - * @n: node we're deleting. - */ -void klist_del(struct klist_node *n) +static void klist_put(struct klist_node *n, bool kill) { - struct klist *k = n->n_klist; + struct klist *k = knode_klist(n); void (*put)(struct klist_node *) = k->put; spin_lock(&k->k_lock); + if (kill) + knode_kill(n); if (!klist_dec_and_del(n)) put = NULL; spin_unlock(&k->k_lock); if (put) put(n); } + +/** + * klist_del - Decrement the reference count of node and try to remove. + * @n: node we're deleting. + */ +void klist_del(struct klist_node *n) +{ + klist_put(n, true); +} EXPORT_SYMBOL_GPL(klist_del); /** @@ -206,7 +245,6 @@ void klist_iter_init_node(struct klist *k, struct klist_iter *i, struct klist_node *n) { i->i_klist = k; - i->i_head = &k->k_list; i->i_cur = n; if (n) kref_get(&n->n_ref); @@ -237,7 +275,7 @@ EXPORT_SYMBOL_GPL(klist_iter_init); void klist_iter_exit(struct klist_iter *i) { if (i->i_cur) { - klist_del(i->i_cur); + klist_put(i->i_cur, false); i->i_cur = NULL; } } @@ -258,27 +296,33 @@ static struct klist_node *to_klist_node(struct list_head *n) */ struct klist_node *klist_next(struct klist_iter *i) { - struct list_head *next; - struct klist_node *lnode = i->i_cur; - struct klist_node *knode = NULL; void (*put)(struct klist_node *) = i->i_klist->put; + struct klist_node *last = i->i_cur; + struct klist_node *next; spin_lock(&i->i_klist->k_lock); - if (lnode) { - next = lnode->n_node.next; - if (!klist_dec_and_del(lnode)) + + if (last) { + next = to_klist_node(last->n_node.next); + if (!klist_dec_and_del(last)) put = NULL; } else - next = i->i_head->next; + next = to_klist_node(i->i_klist->k_list.next); - if (next != i->i_head) { - knode = to_klist_node(next); - kref_get(&knode->n_ref); + i->i_cur = NULL; + while (next != to_klist_node(&i->i_klist->k_list)) { + if (likely(!knode_dead(next))) { + kref_get(&next->n_ref); + i->i_cur = next; + break; + } + next = to_klist_node(next->n_node.next); } - i->i_cur = knode; + spin_unlock(&i->i_klist->k_lock); - if (put && lnode) - put(lnode); - return knode; + + if (put && last) + put(last); + return i->i_cur; } EXPORT_SYMBOL_GPL(klist_next); -- cgit v1.2.2 From 870d6656126add8e383645732b03df2b7ccd4f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:25 +0900 Subject: block: implement CONFIG_DEBUG_BLOCK_EXT_DEVT Extended devt introduces non-contiguos device numbers. This patch implements a debug option which forces most devt allocations to be from the extended area and spreads them out. This is enabled by default if DEBUG_KERNEL is set and achieves... 1. Detects code paths in kernel or userland which expect predetermined consecutive device numbers. 2. When something goes wrong, avoid corruption as adding to the minor of earlier partition won't lead to the wrong but valid device. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0b504814e378..5a536f703a83 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -624,6 +624,22 @@ config BACKTRACE_SELF_TEST Say N if you are unsure. +config DEBUG_BLOCK_EXT_DEVT + bool "Force extended block device numbers and spread them" + depends on DEBUG_KERNEL + depends on BLOCK + default y + help + Conventionally, block device numbers are allocated from + predetermined contiguous area. However, extended block area + may introduce non-contiguous block device numbers. This + option forces most block device numbers to be allocated from + the extended space and spreads them to discover kernel or + userland code paths which assume predetermined contiguous + device number allocation. + + Say N if you are unsure. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL -- cgit v1.2.2 From 759f8ca3048f7438aa3129268d7252552505d662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Aug 2008 09:06:29 +0200 Subject: Change default value of CONFIG_DEBUG_BLOCK_EXT_DEVT to 'n' It's a debug option that you would explicitly enable to test this feature, we should default it to 'n' to prevent accidental surprises for now. Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5a536f703a83..4378d5e923ca 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -628,7 +628,7 @@ config DEBUG_BLOCK_EXT_DEVT bool "Force extended block device numbers and spread them" depends on DEBUG_KERNEL depends on BLOCK - default y + default n help Conventionally, block device numbers are allocated from predetermined contiguous area. However, extended block area -- cgit v1.2.2 From 55dc7db70a73a3809a2334063c9b5b0d8ccebdaa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Sep 2008 13:44:35 +0200 Subject: init: DEBUG_BLOCK_EXT_DEVT requires explicit root= param DEBUG_BLOCK_EXT_DEVT shuffles SCSI and IDE device numbers and root device number set using rdev become meaningless. Root devices should be explicitly specified using textual names. Warn about it if root can't be found and DEBUG_BLOCK_EXT_DEVT is enabled. Also, add warning to the help text. Signed-off-by: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4378d5e923ca..c556896abe57 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -638,6 +638,12 @@ config DEBUG_BLOCK_EXT_DEVT userland code paths which assume predetermined contiguous device number allocation. + Note that turning on this debug option shuffles all the + device numbers for all IDE and SCSI devices including libata + ones, so root partition specified using device number + directly (via rdev or root=MAJ:MIN) won't work anymore. + Textual device names (root=/dev/sdXn) will continue to work. + Say N if you are unsure. config LKDTM -- cgit v1.2.2 From 581d4e28d9195aa8b2231383dbabc288988d615e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 14 Sep 2008 05:56:33 -0700 Subject: block: add fault injection mechanism for faking request timeouts Only works for the generic request timer handling. Allows one to sporadically ignore request completions, thus exercising the timeout handling. Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c556896abe57..7d7a31d0ddeb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -683,10 +683,21 @@ config FAIL_PAGE_ALLOC config FAIL_MAKE_REQUEST bool "Fault-injection capability for disk IO" - depends on FAULT_INJECTION + depends on FAULT_INJECTION && BLOCK help Provide fault-injection capability for disk IO. +config FAIL_IO_TIMEOUT + bool "Faul-injection capability for faking disk interrupts" + depends on FAULT_INJECTION && BLOCK + help + Provide fault-injection capability on end IO handling. This + will make the block layer "forget" an interrupt as configured, + thus exercising the error handling. + + Only works with drivers that use the generic timeout handling, + for others it wont do anything. + config FAULT_INJECTION_DEBUG_FS bool "Debugfs entries for fault-injection capabilities" depends on FAULT_INJECTION && SYSFS && DEBUG_FS -- cgit v1.2.2