From f9abd1ace43d6186268856dbec2ebf411218d6ca Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 5 Aug 2006 12:14:07 -0700 Subject: [PATCH] md: Fix a bug that recently crept into md/linear A recent patch that allowed linear arrays to be reconfigured on-line allowed in a bug which results in divide by zero - not all mddev->array_size were converted to conf->array_size. This patch finished the conversion and fixed the bug. The offending patch was commit 7c7546ccf6463edbeee8d9aac6de7be1cd80d08a. Thanks to Simon Kirby for the bug report. Cc: Simon Kirby Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/linear.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index ff83c9b5979e..b99c19c7eb22 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -162,7 +162,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) goto out; } - min_spacing = mddev->array_size; + min_spacing = conf->array_size; sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); /* min_spacing is the minimum spacing that will fit the hash @@ -171,7 +171,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) * that is larger than min_spacing as use the size of that as * the actual spacing */ - conf->hash_spacing = mddev->array_size; + conf->hash_spacing = conf->array_size; for (i=0; i < cnt-1 ; i++) { sector_t sz = 0; int j; @@ -228,7 +228,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) curr_offset = 0; i = 0; for (curr_offset = 0; - curr_offset < mddev->array_size; + curr_offset < conf->array_size; curr_offset += conf->hash_spacing) { while (i < mddev->raid_disks-1 && -- cgit v1.2.2 From 485311a23c72c87332f9a55ce25e650e40ae3fc7 Mon Sep 17 00:00:00 2001 From: Michal Miroslaw Date: Sun, 13 Aug 2006 23:24:20 -0700 Subject: [PATCH] dm: BUG/OOPS fix Fix BUG I tripped on while testing failover and multipathing. BUG shows up on error path in multipath_ctr() when parse_priority_group() fails after returning at least once without error. The fix is to initialize m->ti early - just after alloc()ing it. BUG: unable to handle kernel NULL pointer dereference at virtual address 00000000 printing eip: c027c3d2 *pde = 00000000 Oops: 0000 [#3] Modules linked in: qla2xxx ext3 jbd mbcache sg ide_cd cdrom floppy CPU: 0 EIP: 0060:[] Not tainted VLI EFLAGS: 00010202 (2.6.17.3 #1) EIP is at dm_put_device+0xf/0x3b eax: 00000001 ebx: ee4fcac0 ecx: 00000000 edx: ee4fcac0 esi: ee4fc4e0 edi: ee4fc4e0 ebp: 00000000 esp: c5db3e78 ds: 007b es: 007b ss: 0068 Process multipathd (pid: 15912, threadinfo=c5db2000 task=ef485a90) Stack: ec4eda40 c02816bd ee4fc4c0 00000000 f7e89498 f883e0bc c02816f6 f7e89480 f7e8948c c0281801 ffffffea f7e89480 f883e080 c0281ffe 00000001 00000000 00000004 dfe9cab8 f7a693c0 f883e080 f883e0c0 ca4b99c0 c027c6ee 01400000 Call Trace: free_pgpaths+0x31/0x45 free_priority_group+0x25/0x2e free_multipath+0x35/0x67 multipath_ctr+0x123/0x12d dm_table_add_target+0x11e/0x18b populate_table+0x8a/0xaf table_load+0x52/0xf9 ctl_ioctl+0xca/0xfc table_load+0x0/0xf9 do_ioctl+0x3e/0x43 vfs_ioctl+0x16c/0x178 sys_ioctl+0x48/0x60 syscall_call+0x7/0xb Code: 97 f0 00 00 00 89 c1 83 c9 01 80 e2 01 0f 44 c1 88 43 14 8b 04 24 59 5b 5e 5f 5d c3 53 89 c1 89 d3 ff 4a 08 0f 94 c0 84 c0 74 2a <8b> 01 8b 10 89 d8 e8 f6 fb ff ff 8b 03 8b 53 04 89 50 04 89 02 EIP: [] dm_put_device+0xf/0x3b SS:ESP 0068:c5db3e78 Signed-off-by: Michal Miroslaw Acked-by: Alasdair G Kergon Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-mpath.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 217615b33223..93f701ea87bc 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -710,6 +710,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, return -EINVAL; } + m->ti = ti; + r = parse_features(&as, m, ti); if (r) goto bad; @@ -751,7 +753,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, } ti->private = m; - m->ti = ti; return 0; -- cgit v1.2.2 From c06aad854fdb9da38fcc22dccfe9d72919453e43 Mon Sep 17 00:00:00 2001 From: Daniel Kobras Date: Sun, 27 Aug 2006 01:23:24 -0700 Subject: [PATCH] dm: Fix deadlock under high i/o load in raid1 setup. On an nForce4-equipped machine with two SATA disk in raid1 setup using dmraid, we experienced frequent deadlock of the system under high i/o load. 'cat /dev/zero > ~/zero' was the most reliable way to reproduce them: Randomly after a few GB, 'cp' would be left in 'D' state along with kjournald and kmirrord. The functions cp and kjournald were blocked in did vary, but kmirrord's wchan always pointed to 'mempool_alloc()'. We've seen this pattern on 2.6.15 and 2.6.17 kernels. http://lkml.org/lkml/2005/4/20/142 indicates that this problem has been around even before. So much for the facts, here's my interpretation: mempool_alloc() first tries to atomically allocate the requested memory, or falls back to hand out preallocated chunks from the mempool. If both fail, it puts the calling process (kmirrord in this case) on a private waitqueue until somebody refills the pool. Where the only 'somebody' is kmirrord itself, so we have a deadlock. I worked around this problem by falling back to a (blocking) kmalloc when before kmirrord would have ended up on the waitqueue. This defeats part of the benefits of using the mempool, but at least keeps the system running. And it could be done with a two-line change. Note that mempool_alloc() clears the GFP_NOIO flag internally, and only uses it to decide whether to wait or return an error if immediate allocation fails, so the attached patch doesn't change behaviour in the non-deadlocking case. Path is against current git (2.6.18-rc4), but should apply to earlier versions as well. I've tested on 2.6.15, where this patch makes the difference between random lockup and a stable system. Signed-off-by: Daniel Kobras Acked-by: Alasdair G Kergon Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-raid1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index be48cedf986b..c54de989eb00 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -255,7 +255,9 @@ static struct region *__rh_alloc(struct region_hash *rh, region_t region) struct region *reg, *nreg; read_unlock(&rh->hash_lock); - nreg = mempool_alloc(rh->region_pool, GFP_NOIO); + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); + if (unlikely(!nreg)) + nreg = kmalloc(sizeof(struct region), GFP_NOIO); nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? RH_CLEAN : RH_NOSYNC; nreg->rh = rh; -- cgit v1.2.2 From 84692195969b83f0ba57dc33ecf73e6c124dd186 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 27 Aug 2006 01:23:49 -0700 Subject: [PATCH] md: avoid backward event updates in md superblock when degraded. If we - shut down a clean array, - restart with one (or more) drive(s) missing - make some changes - pause, so that they array gets marked 'clean', the event count on the superblock of included drives will be the same as that of the removed drives. So adding the removed drive back in will cause it to be included with no resync. To avoid this, we only update the eventcount backwards when the array is not degraded. In this case there can (should) be no non-connected drives that we can get confused with, and this is the particular case where updating-backwards is valuable. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index b6d16022a53e..8dbab2ef3885 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1597,6 +1597,19 @@ void md_update_sb(mddev_t * mddev) repeat: spin_lock_irq(&mddev->write_lock); + + if (mddev->degraded && mddev->sb_dirty == 3) + /* If the array is degraded, then skipping spares is both + * dangerous and fairly pointless. + * Dangerous because a device that was removed from the array + * might have a event_count that still looks up-to-date, + * so it can be re-added without a resync. + * Pointless because if there are any spares to skip, + * then a recovery will happen and soon that array won't + * be degraded any more and the spare can go back to sleep then. + */ + mddev->sb_dirty = 1; + sync_req = mddev->in_sync; mddev->utime = get_seconds(); if (mddev->sb_dirty == 3) -- cgit v1.2.2 From 6394cca54894f6a9bcf927ab78d28985944298ff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 27 Aug 2006 01:23:50 -0700 Subject: [PATCH] md: fix recent breakage of md/raid1 array checking A recent patch broke the ability to do a user-request check of a raid1. This patch fixes the breakage and also moves a comment that was dislocated by the same patch. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid1.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1efe22a2d041..87bfe9e7d8ca 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1625,15 +1625,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } - /* before building a request, check if we can skip these blocks.. - * This call the bitmap_start_sync doesn't actually record anything - */ if (mddev->bitmap == NULL && mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; return max_sector - sector_nr; } + /* before building a request, check if we can skip these blocks.. + * This call the bitmap_start_sync doesn't actually record anything + */ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ -- cgit v1.2.2 From ddac7c7e3a0fe9cfdcef0de24476b8d69f8cf3e7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 Aug 2006 21:27:36 -0700 Subject: [PATCH] md: Fix issues with referencing rdev in md/raid1 We need to be careful when referencing mirrors[i].rdev. It can disappear under us at various times. So: fix a couple of problem places. comment a couple of non-problem places move an 'atomic_add' which deferences rdev down a little way to some where where it is sure to not be NULL. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid1.c | 57 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 87bfe9e7d8ca..3b4d69c05623 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -930,10 +930,13 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); seq_printf(seq, "%s", - conf->mirrors[i].rdev && - test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); + rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf(seq, "]"); } @@ -975,7 +978,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) static void print_conf(conf_t *conf) { int i; - mirror_info_t *tmp; printk("RAID1 conf printout:\n"); if (!conf) { @@ -985,14 +987,17 @@ static void print_conf(conf_t *conf) printk(" --- wd:%d rd:%d\n", conf->working_disks, conf->raid_disks); + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } + rcu_read_unlock(); } static void close_sync(conf_t *conf) @@ -1008,20 +1013,20 @@ static int raid1_spare_active(mddev_t *mddev) { int i; conf_t *conf = mddev->private; - mirror_info_t *tmp; /* * Find all failed disks within the RAID1 configuration - * and mark them readable + * and mark them readable. + * Called under mddev lock, so rcu protection not needed. */ for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags)) { conf->working_disks++; mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + set_bit(In_sync, &rdev->flags); } } @@ -1237,7 +1242,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) /* ouch - failed to read all of that. * Try some synchronous reads of other devices to get * good data, much like with normal read errors. Only - * read into the pages we already have so they we don't + * read into the pages we already have so we don't * need to re-issue the read request. * We don't need to freeze the array, because being in an * active sync request, there is no normal IO, and @@ -1257,6 +1262,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) s = PAGE_SIZE >> 9; do { if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev->bdev, sect + rdev->data_offset, @@ -1463,6 +1472,11 @@ static void raid1d(mddev_t *mddev) s = PAGE_SIZE >> 9; do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... + */ rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && @@ -1486,7 +1500,6 @@ static void raid1d(mddev_t *mddev) d = conf->raid_disks; d--; rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); if (rdev && test_bit(In_sync, &rdev->flags)) { if (sync_page_io(rdev->bdev, @@ -1509,9 +1522,11 @@ static void raid1d(mddev_t *mddev) s<<9, conf->tmppage, READ) == 0) /* Well, this device is dead */ md_error(mddev, rdev); - else + else { + atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b)); + } } } } else { @@ -1787,19 +1802,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0; iraid_disks; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { - md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } } } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, - nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } - return nr_sectors; } -- cgit v1.2.2