Diffstat (limited to 'drivers/md/md.c')
| -rw-r--r-- | drivers/md/md.c | 528 |
1 file changed, 394 insertions, 134 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d899204d3743..3802f7a17f16 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
| @@ -19,6 +19,9 @@ | |||
| 19 | 19 | ||
| 20 | Neil Brown <neilb@cse.unsw.edu.au>. | 20 | Neil Brown <neilb@cse.unsw.edu.au>. |
| 21 | 21 | ||
| 22 | - persistent bitmap code | ||
| 23 | Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. | ||
| 24 | |||
| 22 | This program is free software; you can redistribute it and/or modify | 25 | This program is free software; you can redistribute it and/or modify |
| 23 | it under the terms of the GNU General Public License as published by | 26 | it under the terms of the GNU General Public License as published by |
| 24 | the Free Software Foundation; either version 2, or (at your option) | 27 | the Free Software Foundation; either version 2, or (at your option) |
| @@ -33,6 +36,7 @@ | |||
| 33 | #include <linux/config.h> | 36 | #include <linux/config.h> |
| 34 | #include <linux/linkage.h> | 37 | #include <linux/linkage.h> |
| 35 | #include <linux/raid/md.h> | 38 | #include <linux/raid/md.h> |
| 39 | #include <linux/raid/bitmap.h> | ||
| 36 | #include <linux/sysctl.h> | 40 | #include <linux/sysctl.h> |
| 37 | #include <linux/devfs_fs_kernel.h> | 41 | #include <linux/devfs_fs_kernel.h> |
| 38 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | 42 | #include <linux/buffer_head.h> /* for invalidate_bdev */ |
| @@ -40,6 +44,8 @@ | |||
| 40 | 44 | ||
| 41 | #include <linux/init.h> | 45 | #include <linux/init.h> |
| 42 | 46 | ||
| 47 | #include <linux/file.h> | ||
| 48 | |||
| 43 | #ifdef CONFIG_KMOD | 49 | #ifdef CONFIG_KMOD |
| 44 | #include <linux/kmod.h> | 50 | #include <linux/kmod.h> |
| 45 | #endif | 51 | #endif |
| @@ -189,8 +195,7 @@ static mddev_t * mddev_find(dev_t unit) | |||
| 189 | if (mddev->unit == unit) { | 195 | if (mddev->unit == unit) { |
| 190 | mddev_get(mddev); | 196 | mddev_get(mddev); |
| 191 | spin_unlock(&all_mddevs_lock); | 197 | spin_unlock(&all_mddevs_lock); |
| 192 | if (new) | 198 | kfree(new); |
| 193 | kfree(new); | ||
| 194 | return mddev; | 199 | return mddev; |
| 195 | } | 200 | } |
| 196 | 201 | ||
| @@ -218,6 +223,8 @@ static mddev_t * mddev_find(dev_t unit) | |||
| 218 | INIT_LIST_HEAD(&new->all_mddevs); | 223 | INIT_LIST_HEAD(&new->all_mddevs); |
| 219 | init_timer(&new->safemode_timer); | 224 | init_timer(&new->safemode_timer); |
| 220 | atomic_set(&new->active, 1); | 225 | atomic_set(&new->active, 1); |
| 226 | spin_lock_init(&new->write_lock); | ||
| 227 | init_waitqueue_head(&new->sb_wait); | ||
| 221 | 228 | ||
| 222 | new->queue = blk_alloc_queue(GFP_KERNEL); | 229 | new->queue = blk_alloc_queue(GFP_KERNEL); |
| 223 | if (!new->queue) { | 230 | if (!new->queue) { |
| @@ -320,6 +327,40 @@ static void free_disk_sb(mdk_rdev_t * rdev) | |||
| 320 | } | 327 | } |
| 321 | 328 | ||
| 322 | 329 | ||
| 330 | static int super_written(struct bio *bio, unsigned int bytes_done, int error) | ||
| 331 | { | ||
| 332 | mdk_rdev_t *rdev = bio->bi_private; | ||
| 333 | if (bio->bi_size) | ||
| 334 | return 1; | ||
| 335 | |||
| 336 | if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
| 337 | md_error(rdev->mddev, rdev); | ||
| 338 | |||
| 339 | if (atomic_dec_and_test(&rdev->mddev->pending_writes)) | ||
| 340 | wake_up(&rdev->mddev->sb_wait); | ||
| 341 | return 0; | ||
| 342 | } | ||
| 343 | |||
| 344 | void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | ||
| 345 | sector_t sector, int size, struct page *page) | ||
| 346 | { | ||
| 347 | /* write first size bytes of page to sector of rdev | ||
| 348 | * Increment mddev->pending_writes before returning | ||
| 349 | * and decrement it on completion, waking up sb_wait | ||
| 350 | * if zero is reached. | ||
| 351 | * If an error occurred, call md_error | ||
| 352 | */ | ||
| 353 | struct bio *bio = bio_alloc(GFP_NOIO, 1); | ||
| 354 | |||
| 355 | bio->bi_bdev = rdev->bdev; | ||
| 356 | bio->bi_sector = sector; | ||
| 357 | bio_add_page(bio, page, size, 0); | ||
| 358 | bio->bi_private = rdev; | ||
| 359 | bio->bi_end_io = super_written; | ||
| 360 | atomic_inc(&mddev->pending_writes); | ||
| 361 | submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); | ||
| 362 | } | ||
| 363 | |||
| 323 | static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) | 364 | static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) |
| 324 | { | 365 | { |
| 325 | if (bio->bi_size) | 366 | if (bio->bi_size) |
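The hunk above replaces the old synchronous write_disk_sb() path with asynchronous superblock writes: md_super_write() queues one bio per member device and bumps mddev->pending_writes, and super_written() decrements the count on completion, calling md_error() on failure and waking mddev->sb_wait when the last write finishes. The program below is a minimal userspace analogue of that completion-counting pattern, not kernel code; the struct and helper names are invented, and a pthread mutex/condvar stands in for the atomic counter and wait queue. Build with cc -pthread to try it.

#include <pthread.h>
#include <stdio.h>

struct sb_sync {
	pthread_mutex_t lock;
	pthread_cond_t  sb_wait;	/* stands in for mddev->sb_wait */
	int             pending;	/* stands in for mddev->pending_writes */
};

static void submit_write(struct sb_sync *s)	/* md_super_write() */
{
	pthread_mutex_lock(&s->lock);
	s->pending++;				/* atomic_inc(&pending_writes) */
	pthread_mutex_unlock(&s->lock);
	/* the real code submits a bio here and returns immediately */
}

static void write_done(struct sb_sync *s)	/* super_written() */
{
	pthread_mutex_lock(&s->lock);
	if (--s->pending == 0)
		pthread_cond_broadcast(&s->sb_wait);	/* wake_up(&sb_wait) */
	pthread_mutex_unlock(&s->lock);
}

static void wait_for_writes(struct sb_sync *s)	/* wait_event() in md_update_sb() */
{
	pthread_mutex_lock(&s->lock);
	while (s->pending)
		pthread_cond_wait(&s->sb_wait, &s->lock);
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	static struct sb_sync s = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	int i;

	for (i = 0; i < 3; i++)
		submit_write(&s);	/* one superblock write per member device */
	for (i = 0; i < 3; i++)
		write_done(&s);		/* completions may arrive in any order */
	wait_for_writes(&s);
	puts("superblocks written");
	return 0;
}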
| @@ -329,7 +370,7 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) | |||
| 329 | return 0; | 370 | return 0; |
| 330 | } | 371 | } |
| 331 | 372 | ||
| 332 | static int sync_page_io(struct block_device *bdev, sector_t sector, int size, | 373 | int sync_page_io(struct block_device *bdev, sector_t sector, int size, |
| 333 | struct page *page, int rw) | 374 | struct page *page, int rw) |
| 334 | { | 375 | { |
| 335 | struct bio *bio = bio_alloc(GFP_NOIO, 1); | 376 | struct bio *bio = bio_alloc(GFP_NOIO, 1); |
| @@ -416,11 +457,8 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | |||
| 416 | ret = 1; | 457 | ret = 1; |
| 417 | 458 | ||
| 418 | abort: | 459 | abort: |
| 419 | if (tmp1) | 460 | kfree(tmp1); |
| 420 | kfree(tmp1); | 461 | kfree(tmp2); |
| 421 | if (tmp2) | ||
| 422 | kfree(tmp2); | ||
| 423 | |||
| 424 | return ret; | 462 | return ret; |
| 425 | } | 463 | } |
| 426 | 464 | ||
| @@ -569,6 +607,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 569 | mdp_disk_t *desc; | 607 | mdp_disk_t *desc; |
| 570 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); | 608 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); |
| 571 | 609 | ||
| 610 | rdev->raid_disk = -1; | ||
| 611 | rdev->in_sync = 0; | ||
| 572 | if (mddev->raid_disks == 0) { | 612 | if (mddev->raid_disks == 0) { |
| 573 | mddev->major_version = 0; | 613 | mddev->major_version = 0; |
| 574 | mddev->minor_version = sb->minor_version; | 614 | mddev->minor_version = sb->minor_version; |
| @@ -599,16 +639,35 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 599 | memcpy(mddev->uuid+12,&sb->set_uuid3, 4); | 639 | memcpy(mddev->uuid+12,&sb->set_uuid3, 4); |
| 600 | 640 | ||
| 601 | mddev->max_disks = MD_SB_DISKS; | 641 | mddev->max_disks = MD_SB_DISKS; |
| 602 | } else { | 642 | |
| 603 | __u64 ev1; | 643 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
| 604 | ev1 = md_event(sb); | 644 | mddev->bitmap_file == NULL) { |
| 645 | if (mddev->level != 1) { | ||
| 646 | /* FIXME use a better test */ | ||
| 647 | printk(KERN_WARNING "md: bitmaps only support for raid1\n"); | ||
| 648 | return -EINVAL; | ||
| 649 | } | ||
| 650 | mddev->bitmap_offset = (MD_SB_BYTES >> 9); | ||
| 651 | } | ||
| 652 | |||
| 653 | } else if (mddev->pers == NULL) { | ||
| 654 | /* Insist on good event counter while assembling */ | ||
| 655 | __u64 ev1 = md_event(sb); | ||
| 605 | ++ev1; | 656 | ++ev1; |
| 606 | if (ev1 < mddev->events) | 657 | if (ev1 < mddev->events) |
| 607 | return -EINVAL; | 658 | return -EINVAL; |
| 608 | } | 659 | } else if (mddev->bitmap) { |
| 660 | /* if adding to array with a bitmap, then we can accept an | ||
| 661 | * older device ... but not too old. | ||
| 662 | */ | ||
| 663 | __u64 ev1 = md_event(sb); | ||
| 664 | if (ev1 < mddev->bitmap->events_cleared) | ||
| 665 | return 0; | ||
| 666 | } else /* just a hot-add of a new device, leave raid_disk at -1 */ | ||
| 667 | return 0; | ||
| 668 | |||
| 609 | if (mddev->level != LEVEL_MULTIPATH) { | 669 | if (mddev->level != LEVEL_MULTIPATH) { |
| 610 | rdev->raid_disk = -1; | 670 | rdev->faulty = 0; |
| 611 | rdev->in_sync = rdev->faulty = 0; | ||
| 612 | desc = sb->disks + rdev->desc_nr; | 671 | desc = sb->disks + rdev->desc_nr; |
| 613 | 672 | ||
| 614 | if (desc->state & (1<<MD_DISK_FAULTY)) | 673 | if (desc->state & (1<<MD_DISK_FAULTY)) |
| @@ -618,7 +677,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 618 | rdev->in_sync = 1; | 677 | rdev->in_sync = 1; |
| 619 | rdev->raid_disk = desc->raid_disk; | 678 | rdev->raid_disk = desc->raid_disk; |
| 620 | } | 679 | } |
| 621 | } | 680 | } else /* MULTIPATH are always insync */ |
| 681 | rdev->in_sync = 1; | ||
| 622 | return 0; | 682 | return 0; |
| 623 | } | 683 | } |
| 624 | 684 | ||
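The reworked validate_super logic above encodes the central bitmap invariant: a device re-added to a running array may lag the array's event count, but not fall behind bitmap->events_cleared, the point up to which bitmap bits have already been cleared; anything older is left with raid_disk == -1 and gets a full recovery instead. A standalone sketch of that acceptance test follows (the function name and sample numbers are made up):

#include <stdio.h>

/* Mirrors the "ev1 < mddev->bitmap->events_cleared" check added above:
 * a device older than events_cleared would miss writes the bitmap no
 * longer remembers, so only the bitmap-covered gap can be resynced. */
static int bitmap_can_cover(unsigned long long dev_events,
			    unsigned long long events_cleared)
{
	return dev_events >= events_cleared;
}

int main(void)
{
	printf("device at event 90, cleared up to 80: %s\n",
	       bitmap_can_cover(90, 80) ? "partial (bitmap) resync" : "full recovery");
	printf("device at event 50, cleared up to 80: %s\n",
	       bitmap_can_cover(50, 80) ? "partial (bitmap) resync" : "full recovery");
	return 0;
}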
| @@ -683,6 +743,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 683 | sb->layout = mddev->layout; | 743 | sb->layout = mddev->layout; |
| 684 | sb->chunk_size = mddev->chunk_size; | 744 | sb->chunk_size = mddev->chunk_size; |
| 685 | 745 | ||
| 746 | if (mddev->bitmap && mddev->bitmap_file == NULL) | ||
| 747 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); | ||
| 748 | |||
| 686 | sb->disks[0].state = (1<<MD_DISK_REMOVED); | 749 | sb->disks[0].state = (1<<MD_DISK_REMOVED); |
| 687 | ITERATE_RDEV(mddev,rdev2,tmp) { | 750 | ITERATE_RDEV(mddev,rdev2,tmp) { |
| 688 | mdp_disk_t *d; | 751 | mdp_disk_t *d; |
| @@ -780,7 +843,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 780 | case 0: | 843 | case 0: |
| 781 | sb_offset = rdev->bdev->bd_inode->i_size >> 9; | 844 | sb_offset = rdev->bdev->bd_inode->i_size >> 9; |
| 782 | sb_offset -= 8*2; | 845 | sb_offset -= 8*2; |
| 783 | sb_offset &= ~(4*2-1); | 846 | sb_offset &= ~(sector_t)(4*2-1); |
| 784 | /* convert from sectors to K */ | 847 | /* convert from sectors to K */ |
| 785 | sb_offset /= 2; | 848 | sb_offset /= 2; |
| 786 | break; | 849 | break; |
| @@ -860,6 +923,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 860 | { | 923 | { |
| 861 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 924 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); |
| 862 | 925 | ||
| 926 | rdev->raid_disk = -1; | ||
| 927 | rdev->in_sync = 0; | ||
| 863 | if (mddev->raid_disks == 0) { | 928 | if (mddev->raid_disks == 0) { |
| 864 | mddev->major_version = 1; | 929 | mddev->major_version = 1; |
| 865 | mddev->patch_version = 0; | 930 | mddev->patch_version = 0; |
| @@ -877,13 +942,30 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 877 | memcpy(mddev->uuid, sb->set_uuid, 16); | 942 | memcpy(mddev->uuid, sb->set_uuid, 16); |
| 878 | 943 | ||
| 879 | mddev->max_disks = (4096-256)/2; | 944 | mddev->max_disks = (4096-256)/2; |
| 880 | } else { | 945 | |
| 881 | __u64 ev1; | 946 | if ((le32_to_cpu(sb->feature_map) & 1) && |
| 882 | ev1 = le64_to_cpu(sb->events); | 947 | mddev->bitmap_file == NULL ) { |
| 948 | if (mddev->level != 1) { | ||
| 949 | printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); | ||
| 950 | return -EINVAL; | ||
| 951 | } | ||
| 952 | mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); | ||
| 953 | } | ||
| 954 | } else if (mddev->pers == NULL) { | ||
| 955 | /* Insist of good event counter while assembling */ | ||
| 956 | __u64 ev1 = le64_to_cpu(sb->events); | ||
| 883 | ++ev1; | 957 | ++ev1; |
| 884 | if (ev1 < mddev->events) | 958 | if (ev1 < mddev->events) |
| 885 | return -EINVAL; | 959 | return -EINVAL; |
| 886 | } | 960 | } else if (mddev->bitmap) { |
| 961 | /* If adding to array with a bitmap, then we can accept an | ||
| 962 | * older device, but not too old. | ||
| 963 | */ | ||
| 964 | __u64 ev1 = le64_to_cpu(sb->events); | ||
| 965 | if (ev1 < mddev->bitmap->events_cleared) | ||
| 966 | return 0; | ||
| 967 | } else /* just a hot-add of a new device, leave raid_disk at -1 */ | ||
| 968 | return 0; | ||
| 887 | 969 | ||
| 888 | if (mddev->level != LEVEL_MULTIPATH) { | 970 | if (mddev->level != LEVEL_MULTIPATH) { |
| 889 | int role; | 971 | int role; |
| @@ -891,14 +973,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 891 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); | 973 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
| 892 | switch(role) { | 974 | switch(role) { |
| 893 | case 0xffff: /* spare */ | 975 | case 0xffff: /* spare */ |
| 894 | rdev->in_sync = 0; | ||
| 895 | rdev->faulty = 0; | 976 | rdev->faulty = 0; |
| 896 | rdev->raid_disk = -1; | ||
| 897 | break; | 977 | break; |
| 898 | case 0xfffe: /* faulty */ | 978 | case 0xfffe: /* faulty */ |
| 899 | rdev->in_sync = 0; | ||
| 900 | rdev->faulty = 1; | 979 | rdev->faulty = 1; |
| 901 | rdev->raid_disk = -1; | ||
| 902 | break; | 980 | break; |
| 903 | default: | 981 | default: |
| 904 | rdev->in_sync = 1; | 982 | rdev->in_sync = 1; |
| @@ -906,7 +984,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 906 | rdev->raid_disk = role; | 984 | rdev->raid_disk = role; |
| 907 | break; | 985 | break; |
| 908 | } | 986 | } |
| 909 | } | 987 | } else /* MULTIPATH are always insync */ |
| 988 | rdev->in_sync = 1; | ||
| 989 | |||
| 910 | return 0; | 990 | return 0; |
| 911 | } | 991 | } |
| 912 | 992 | ||
| @@ -933,6 +1013,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 933 | else | 1013 | else |
| 934 | sb->resync_offset = cpu_to_le64(0); | 1014 | sb->resync_offset = cpu_to_le64(0); |
| 935 | 1015 | ||
| 1016 | if (mddev->bitmap && mddev->bitmap_file == NULL) { | ||
| 1017 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); | ||
| 1018 | sb->feature_map = cpu_to_le32(1); | ||
| 1019 | } | ||
| 1020 | |||
| 936 | max_dev = 0; | 1021 | max_dev = 0; |
| 937 | ITERATE_RDEV(mddev,rdev2,tmp) | 1022 | ITERATE_RDEV(mddev,rdev2,tmp) |
| 938 | if (rdev2->desc_nr+1 > max_dev) | 1023 | if (rdev2->desc_nr+1 > max_dev) |
| @@ -1196,8 +1281,11 @@ void md_print_devices(void) | |||
| 1196 | printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); | 1281 | printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); |
| 1197 | printk("md: **********************************\n"); | 1282 | printk("md: **********************************\n"); |
| 1198 | ITERATE_MDDEV(mddev,tmp) { | 1283 | ITERATE_MDDEV(mddev,tmp) { |
| 1199 | printk("%s: ", mdname(mddev)); | ||
| 1200 | 1284 | ||
| 1285 | if (mddev->bitmap) | ||
| 1286 | bitmap_print_sb(mddev->bitmap); | ||
| 1287 | else | ||
| 1288 | printk("%s: ", mdname(mddev)); | ||
| 1201 | ITERATE_RDEV(mddev,rdev,tmp2) | 1289 | ITERATE_RDEV(mddev,rdev,tmp2) |
| 1202 | printk("<%s>", bdevname(rdev->bdev,b)); | 1290 | printk("<%s>", bdevname(rdev->bdev,b)); |
| 1203 | printk("\n"); | 1291 | printk("\n"); |
| @@ -1210,30 +1298,6 @@ void md_print_devices(void) | |||
| 1210 | } | 1298 | } |
| 1211 | 1299 | ||
| 1212 | 1300 | ||
| 1213 | static int write_disk_sb(mdk_rdev_t * rdev) | ||
| 1214 | { | ||
| 1215 | char b[BDEVNAME_SIZE]; | ||
| 1216 | if (!rdev->sb_loaded) { | ||
| 1217 | MD_BUG(); | ||
| 1218 | return 1; | ||
| 1219 | } | ||
| 1220 | if (rdev->faulty) { | ||
| 1221 | MD_BUG(); | ||
| 1222 | return 1; | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", | ||
| 1226 | bdevname(rdev->bdev,b), | ||
| 1227 | (unsigned long long)rdev->sb_offset); | ||
| 1228 | |||
| 1229 | if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) | ||
| 1230 | return 0; | ||
| 1231 | |||
| 1232 | printk("md: write_disk_sb failed for device %s\n", | ||
| 1233 | bdevname(rdev->bdev,b)); | ||
| 1234 | return 1; | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | static void sync_sbs(mddev_t * mddev) | 1301 | static void sync_sbs(mddev_t * mddev) |
| 1238 | { | 1302 | { |
| 1239 | mdk_rdev_t *rdev; | 1303 | mdk_rdev_t *rdev; |
| @@ -1248,12 +1312,14 @@ static void sync_sbs(mddev_t * mddev) | |||
| 1248 | 1312 | ||
| 1249 | static void md_update_sb(mddev_t * mddev) | 1313 | static void md_update_sb(mddev_t * mddev) |
| 1250 | { | 1314 | { |
| 1251 | int err, count = 100; | 1315 | int err; |
| 1252 | struct list_head *tmp; | 1316 | struct list_head *tmp; |
| 1253 | mdk_rdev_t *rdev; | 1317 | mdk_rdev_t *rdev; |
| 1318 | int sync_req; | ||
| 1254 | 1319 | ||
| 1255 | mddev->sb_dirty = 0; | ||
| 1256 | repeat: | 1320 | repeat: |
| 1321 | spin_lock(&mddev->write_lock); | ||
| 1322 | sync_req = mddev->in_sync; | ||
| 1257 | mddev->utime = get_seconds(); | 1323 | mddev->utime = get_seconds(); |
| 1258 | mddev->events ++; | 1324 | mddev->events ++; |
| 1259 | 1325 | ||
| @@ -1266,20 +1332,26 @@ repeat: | |||
| 1266 | MD_BUG(); | 1332 | MD_BUG(); |
| 1267 | mddev->events --; | 1333 | mddev->events --; |
| 1268 | } | 1334 | } |
| 1335 | mddev->sb_dirty = 2; | ||
| 1269 | sync_sbs(mddev); | 1336 | sync_sbs(mddev); |
| 1270 | 1337 | ||
| 1271 | /* | 1338 | /* |
| 1272 | * do not write anything to disk if using | 1339 | * do not write anything to disk if using |
| 1273 | * nonpersistent superblocks | 1340 | * nonpersistent superblocks |
| 1274 | */ | 1341 | */ |
| 1275 | if (!mddev->persistent) | 1342 | if (!mddev->persistent) { |
| 1343 | mddev->sb_dirty = 0; | ||
| 1344 | spin_unlock(&mddev->write_lock); | ||
| 1345 | wake_up(&mddev->sb_wait); | ||
| 1276 | return; | 1346 | return; |
| 1347 | } | ||
| 1348 | spin_unlock(&mddev->write_lock); | ||
| 1277 | 1349 | ||
| 1278 | dprintk(KERN_INFO | 1350 | dprintk(KERN_INFO |
| 1279 | "md: updating %s RAID superblock on device (in sync %d)\n", | 1351 | "md: updating %s RAID superblock on device (in sync %d)\n", |
| 1280 | mdname(mddev),mddev->in_sync); | 1352 | mdname(mddev),mddev->in_sync); |
| 1281 | 1353 | ||
| 1282 | err = 0; | 1354 | err = bitmap_update_sb(mddev->bitmap); |
| 1283 | ITERATE_RDEV(mddev,rdev,tmp) { | 1355 | ITERATE_RDEV(mddev,rdev,tmp) { |
| 1284 | char b[BDEVNAME_SIZE]; | 1356 | char b[BDEVNAME_SIZE]; |
| 1285 | dprintk(KERN_INFO "md: "); | 1357 | dprintk(KERN_INFO "md: "); |
| @@ -1288,22 +1360,32 @@ repeat: | |||
| 1288 | 1360 | ||
| 1289 | dprintk("%s ", bdevname(rdev->bdev,b)); | 1361 | dprintk("%s ", bdevname(rdev->bdev,b)); |
| 1290 | if (!rdev->faulty) { | 1362 | if (!rdev->faulty) { |
| 1291 | err += write_disk_sb(rdev); | 1363 | md_super_write(mddev,rdev, |
| 1364 | rdev->sb_offset<<1, MD_SB_BYTES, | ||
| 1365 | rdev->sb_page); | ||
| 1366 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", | ||
| 1367 | bdevname(rdev->bdev,b), | ||
| 1368 | (unsigned long long)rdev->sb_offset); | ||
| 1369 | |||
| 1292 | } else | 1370 | } else |
| 1293 | dprintk(")\n"); | 1371 | dprintk(")\n"); |
| 1294 | if (!err && mddev->level == LEVEL_MULTIPATH) | 1372 | if (mddev->level == LEVEL_MULTIPATH) |
| 1295 | /* only need to write one superblock... */ | 1373 | /* only need to write one superblock... */ |
| 1296 | break; | 1374 | break; |
| 1297 | } | 1375 | } |
| 1298 | if (err) { | 1376 | wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); |
| 1299 | if (--count) { | 1377 | /* if there was a failure, sb_dirty was set to 1, and we re-write super */ |
| 1300 | printk(KERN_ERR "md: errors occurred during superblock" | 1378 | |
| 1301 | " update, repeating\n"); | 1379 | spin_lock(&mddev->write_lock); |
| 1302 | goto repeat; | 1380 | if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { |
| 1303 | } | 1381 | /* have to write it out again */ |
| 1304 | printk(KERN_ERR \ | 1382 | spin_unlock(&mddev->write_lock); |
| 1305 | "md: excessive errors occurred during superblock update, exiting\n"); | 1383 | goto repeat; |
| 1306 | } | 1384 | } |
| 1385 | mddev->sb_dirty = 0; | ||
| 1386 | spin_unlock(&mddev->write_lock); | ||
| 1387 | wake_up(&mddev->sb_wait); | ||
| 1388 | |||
| 1307 | } | 1389 | } |
| 1308 | 1390 | ||
| 1309 | /* | 1391 | /* |
| @@ -1607,12 +1689,19 @@ static int do_md_run(mddev_t * mddev) | |||
| 1607 | 1689 | ||
| 1608 | mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ | 1690 | mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ |
| 1609 | 1691 | ||
| 1610 | err = mddev->pers->run(mddev); | 1692 | /* before we start the array running, initialise the bitmap */ |
| 1693 | err = bitmap_create(mddev); | ||
| 1694 | if (err) | ||
| 1695 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | ||
| 1696 | mdname(mddev), err); | ||
| 1697 | else | ||
| 1698 | err = mddev->pers->run(mddev); | ||
| 1611 | if (err) { | 1699 | if (err) { |
| 1612 | printk(KERN_ERR "md: pers->run() failed ...\n"); | 1700 | printk(KERN_ERR "md: pers->run() failed ...\n"); |
| 1613 | module_put(mddev->pers->owner); | 1701 | module_put(mddev->pers->owner); |
| 1614 | mddev->pers = NULL; | 1702 | mddev->pers = NULL; |
| 1615 | return -EINVAL; | 1703 | bitmap_destroy(mddev); |
| 1704 | return err; | ||
| 1616 | } | 1705 | } |
| 1617 | atomic_set(&mddev->writes_pending,0); | 1706 | atomic_set(&mddev->writes_pending,0); |
| 1618 | mddev->safemode = 0; | 1707 | mddev->safemode = 0; |
| @@ -1725,6 +1814,14 @@ static int do_md_stop(mddev_t * mddev, int ro) | |||
| 1725 | if (ro) | 1814 | if (ro) |
| 1726 | set_disk_ro(disk, 1); | 1815 | set_disk_ro(disk, 1); |
| 1727 | } | 1816 | } |
| 1817 | |||
| 1818 | bitmap_destroy(mddev); | ||
| 1819 | if (mddev->bitmap_file) { | ||
| 1820 | atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); | ||
| 1821 | fput(mddev->bitmap_file); | ||
| 1822 | mddev->bitmap_file = NULL; | ||
| 1823 | } | ||
| 1824 | |||
| 1728 | /* | 1825 | /* |
| 1729 | * Free resources if final stop | 1826 | * Free resources if final stop |
| 1730 | */ | 1827 | */ |
| @@ -1983,6 +2080,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
| 1983 | return 0; | 2080 | return 0; |
| 1984 | } | 2081 | } |
| 1985 | 2082 | ||
| 2083 | static int get_bitmap_file(mddev_t * mddev, void * arg) | ||
| 2084 | { | ||
| 2085 | mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ | ||
| 2086 | char *ptr, *buf = NULL; | ||
| 2087 | int err = -ENOMEM; | ||
| 2088 | |||
| 2089 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
| 2090 | if (!file) | ||
| 2091 | goto out; | ||
| 2092 | |||
| 2093 | /* bitmap disabled, zero the first byte and copy out */ | ||
| 2094 | if (!mddev->bitmap || !mddev->bitmap->file) { | ||
| 2095 | file->pathname[0] = '\0'; | ||
| 2096 | goto copy_out; | ||
| 2097 | } | ||
| 2098 | |||
| 2099 | buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); | ||
| 2100 | if (!buf) | ||
| 2101 | goto out; | ||
| 2102 | |||
| 2103 | ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); | ||
| 2104 | if (!ptr) | ||
| 2105 | goto out; | ||
| 2106 | |||
| 2107 | strcpy(file->pathname, ptr); | ||
| 2108 | |||
| 2109 | copy_out: | ||
| 2110 | err = 0; | ||
| 2111 | if (copy_to_user(arg, file, sizeof(*file))) | ||
| 2112 | err = -EFAULT; | ||
| 2113 | out: | ||
| 2114 | kfree(buf); | ||
| 2115 | kfree(file); | ||
| 2116 | return err; | ||
| 2117 | } | ||
| 2118 | |||
| 1986 | static int get_disk_info(mddev_t * mddev, void __user * arg) | 2119 | static int get_disk_info(mddev_t * mddev, void __user * arg) |
| 1987 | { | 2120 | { |
| 1988 | mdu_disk_info_t info; | 2121 | mdu_disk_info_t info; |
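get_bitmap_file() above backs a new ioctl that reports the path of an externally stored bitmap (or an empty string when there is none, or the bitmap is internal). A userspace tool could query it roughly as follows; this is a hedged sketch rather than mdadm source, and it assumes GET_BITMAP_FILE and mdu_bitmap_file_t are exported through <linux/raid/md_u.h> on a kernel carrying this patch, with /dev/md0 only as an example device.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/raid/md_u.h>

int main(int argc, char **argv)
{
	mdu_bitmap_file_t bmf;
	int fd = open(argc > 1 ? argv[1] : "/dev/md0", O_RDONLY);

	if (fd < 0 || ioctl(fd, GET_BITMAP_FILE, &bmf) < 0) {
		perror("GET_BITMAP_FILE");
		return 1;
	}
	printf("bitmap file: %s\n",
	       bmf.pathname[0] ? bmf.pathname : "(none or internal)");
	close(fd);
	return 0;
}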
| @@ -2078,11 +2211,25 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
| 2078 | PTR_ERR(rdev)); | 2211 | PTR_ERR(rdev)); |
| 2079 | return PTR_ERR(rdev); | 2212 | return PTR_ERR(rdev); |
| 2080 | } | 2213 | } |
| 2214 | /* set save_raid_disk if appropriate */ | ||
| 2215 | if (!mddev->persistent) { | ||
| 2216 | if (info->state & (1<<MD_DISK_SYNC) && | ||
| 2217 | info->raid_disk < mddev->raid_disks) | ||
| 2218 | rdev->raid_disk = info->raid_disk; | ||
| 2219 | else | ||
| 2220 | rdev->raid_disk = -1; | ||
| 2221 | } else | ||
| 2222 | super_types[mddev->major_version]. | ||
| 2223 | validate_super(mddev, rdev); | ||
| 2224 | rdev->saved_raid_disk = rdev->raid_disk; | ||
| 2225 | |||
| 2081 | rdev->in_sync = 0; /* just to be sure */ | 2226 | rdev->in_sync = 0; /* just to be sure */ |
| 2082 | rdev->raid_disk = -1; | 2227 | rdev->raid_disk = -1; |
| 2083 | err = bind_rdev_to_array(rdev, mddev); | 2228 | err = bind_rdev_to_array(rdev, mddev); |
| 2084 | if (err) | 2229 | if (err) |
| 2085 | export_rdev(rdev); | 2230 | export_rdev(rdev); |
| 2231 | |||
| 2232 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
| 2086 | if (mddev->thread) | 2233 | if (mddev->thread) |
| 2087 | md_wakeup_thread(mddev->thread); | 2234 | md_wakeup_thread(mddev->thread); |
| 2088 | return err; | 2235 | return err; |
| @@ -2256,6 +2403,49 @@ abort_export: | |||
| 2256 | return err; | 2403 | return err; |
| 2257 | } | 2404 | } |
| 2258 | 2405 | ||
| 2406 | /* similar to deny_write_access, but accounts for our holding a reference | ||
| 2407 | * to the file ourselves */ | ||
| 2408 | static int deny_bitmap_write_access(struct file * file) | ||
| 2409 | { | ||
| 2410 | struct inode *inode = file->f_mapping->host; | ||
| 2411 | |||
| 2412 | spin_lock(&inode->i_lock); | ||
| 2413 | if (atomic_read(&inode->i_writecount) > 1) { | ||
| 2414 | spin_unlock(&inode->i_lock); | ||
| 2415 | return -ETXTBSY; | ||
| 2416 | } | ||
| 2417 | atomic_set(&inode->i_writecount, -1); | ||
| 2418 | spin_unlock(&inode->i_lock); | ||
| 2419 | |||
| 2420 | return 0; | ||
| 2421 | } | ||
| 2422 | |||
| 2423 | static int set_bitmap_file(mddev_t *mddev, int fd) | ||
| 2424 | { | ||
| 2425 | int err; | ||
| 2426 | |||
| 2427 | if (mddev->pers) | ||
| 2428 | return -EBUSY; | ||
| 2429 | |||
| 2430 | mddev->bitmap_file = fget(fd); | ||
| 2431 | |||
| 2432 | if (mddev->bitmap_file == NULL) { | ||
| 2433 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", | ||
| 2434 | mdname(mddev)); | ||
| 2435 | return -EBADF; | ||
| 2436 | } | ||
| 2437 | |||
| 2438 | err = deny_bitmap_write_access(mddev->bitmap_file); | ||
| 2439 | if (err) { | ||
| 2440 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | ||
| 2441 | mdname(mddev)); | ||
| 2442 | fput(mddev->bitmap_file); | ||
| 2443 | mddev->bitmap_file = NULL; | ||
| 2444 | } else | ||
| 2445 | mddev->bitmap_offset = 0; /* file overrides offset */ | ||
| 2446 | return err; | ||
| 2447 | } | ||
| 2448 | |||
| 2259 | /* | 2449 | /* |
| 2260 | * set_array_info is used two different ways | 2450 | * set_array_info is used two different ways |
| 2261 | * The original usage is when creating a new array. | 2451 | * The original usage is when creating a new array. |
| @@ -2567,8 +2757,10 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
| 2567 | /* | 2757 | /* |
| 2568 | * Commands querying/configuring an existing array: | 2758 | * Commands querying/configuring an existing array: |
| 2569 | */ | 2759 | */ |
| 2570 | /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ | 2760 | /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, |
| 2571 | if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { | 2761 | * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ |
| 2762 | if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY | ||
| 2763 | && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { | ||
| 2572 | err = -ENODEV; | 2764 | err = -ENODEV; |
| 2573 | goto abort_unlock; | 2765 | goto abort_unlock; |
| 2574 | } | 2766 | } |
| @@ -2582,6 +2774,10 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
| 2582 | err = get_array_info(mddev, argp); | 2774 | err = get_array_info(mddev, argp); |
| 2583 | goto done_unlock; | 2775 | goto done_unlock; |
| 2584 | 2776 | ||
| 2777 | case GET_BITMAP_FILE: | ||
| 2778 | err = get_bitmap_file(mddev, (void *)arg); | ||
| 2779 | goto done_unlock; | ||
| 2780 | |||
| 2585 | case GET_DISK_INFO: | 2781 | case GET_DISK_INFO: |
| 2586 | err = get_disk_info(mddev, argp); | 2782 | err = get_disk_info(mddev, argp); |
| 2587 | goto done_unlock; | 2783 | goto done_unlock; |
| @@ -2662,6 +2858,10 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
| 2662 | err = do_md_run (mddev); | 2858 | err = do_md_run (mddev); |
| 2663 | goto done_unlock; | 2859 | goto done_unlock; |
| 2664 | 2860 | ||
| 2861 | case SET_BITMAP_FILE: | ||
| 2862 | err = set_bitmap_file(mddev, (int)arg); | ||
| 2863 | goto done_unlock; | ||
| 2864 | |||
| 2665 | default: | 2865 | default: |
| 2666 | if (_IOC_TYPE(cmd) == MD_MAJOR) | 2866 | if (_IOC_TYPE(cmd) == MD_MAJOR) |
| 2667 | printk(KERN_WARNING "md: %s(pid %d) used" | 2867 | printk(KERN_WARNING "md: %s(pid %d) used" |
| @@ -2773,10 +2973,10 @@ static int md_thread(void * arg) | |||
| 2773 | while (thread->run) { | 2973 | while (thread->run) { |
| 2774 | void (*run)(mddev_t *); | 2974 | void (*run)(mddev_t *); |
| 2775 | 2975 | ||
| 2776 | wait_event_interruptible(thread->wqueue, | 2976 | wait_event_interruptible_timeout(thread->wqueue, |
| 2777 | test_bit(THREAD_WAKEUP, &thread->flags)); | 2977 | test_bit(THREAD_WAKEUP, &thread->flags), |
| 2778 | if (current->flags & PF_FREEZE) | 2978 | thread->timeout); |
| 2779 | refrigerator(PF_FREEZE); | 2979 | try_to_freeze(); |
| 2780 | 2980 | ||
| 2781 | clear_bit(THREAD_WAKEUP, &thread->flags); | 2981 | clear_bit(THREAD_WAKEUP, &thread->flags); |
| 2782 | 2982 | ||
| @@ -2820,6 +3020,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
| 2820 | thread->run = run; | 3020 | thread->run = run; |
| 2821 | thread->mddev = mddev; | 3021 | thread->mddev = mddev; |
| 2822 | thread->name = name; | 3022 | thread->name = name; |
| 3023 | thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 2823 | ret = kernel_thread(md_thread, thread, 0); | 3024 | ret = kernel_thread(md_thread, thread, 0); |
| 2824 | if (ret < 0) { | 3025 | if (ret < 0) { |
| 2825 | kfree(thread); | 3026 | kfree(thread); |
| @@ -2858,13 +3059,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 2858 | 3059 | ||
| 2859 | if (!rdev || rdev->faulty) | 3060 | if (!rdev || rdev->faulty) |
| 2860 | return; | 3061 | return; |
| 2861 | 3062 | /* | |
| 2862 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | 3063 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", |
| 2863 | mdname(mddev), | 3064 | mdname(mddev), |
| 2864 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | 3065 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), |
| 2865 | __builtin_return_address(0),__builtin_return_address(1), | 3066 | __builtin_return_address(0),__builtin_return_address(1), |
| 2866 | __builtin_return_address(2),__builtin_return_address(3)); | 3067 | __builtin_return_address(2),__builtin_return_address(3)); |
| 2867 | 3068 | */ | |
| 2868 | if (!mddev->pers->error_handler) | 3069 | if (!mddev->pers->error_handler) |
| 2869 | return; | 3070 | return; |
| 2870 | mddev->pers->error_handler(mddev,rdev); | 3071 | mddev->pers->error_handler(mddev,rdev); |
| @@ -3018,6 +3219,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
| 3018 | struct list_head *tmp2; | 3219 | struct list_head *tmp2; |
| 3019 | mdk_rdev_t *rdev; | 3220 | mdk_rdev_t *rdev; |
| 3020 | int i; | 3221 | int i; |
| 3222 | struct bitmap *bitmap; | ||
| 3021 | 3223 | ||
| 3022 | if (v == (void*)1) { | 3224 | if (v == (void*)1) { |
| 3023 | seq_printf(seq, "Personalities : "); | 3225 | seq_printf(seq, "Personalities : "); |
| @@ -3070,10 +3272,35 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
| 3070 | if (mddev->pers) { | 3272 | if (mddev->pers) { |
| 3071 | mddev->pers->status (seq, mddev); | 3273 | mddev->pers->status (seq, mddev); |
| 3072 | seq_printf(seq, "\n "); | 3274 | seq_printf(seq, "\n "); |
| 3073 | if (mddev->curr_resync > 2) | 3275 | if (mddev->curr_resync > 2) { |
| 3074 | status_resync (seq, mddev); | 3276 | status_resync (seq, mddev); |
| 3075 | else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) | 3277 | seq_printf(seq, "\n "); |
| 3076 | seq_printf(seq, " resync=DELAYED"); | 3278 | } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) |
| 3279 | seq_printf(seq, " resync=DELAYED\n "); | ||
| 3280 | } else | ||
| 3281 | seq_printf(seq, "\n "); | ||
| 3282 | |||
| 3283 | if ((bitmap = mddev->bitmap)) { | ||
| 3284 | unsigned long chunk_kb; | ||
| 3285 | unsigned long flags; | ||
| 3286 | spin_lock_irqsave(&bitmap->lock, flags); | ||
| 3287 | chunk_kb = bitmap->chunksize >> 10; | ||
| 3288 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | ||
| 3289 | "%lu%s chunk", | ||
| 3290 | bitmap->pages - bitmap->missing_pages, | ||
| 3291 | bitmap->pages, | ||
| 3292 | (bitmap->pages - bitmap->missing_pages) | ||
| 3293 | << (PAGE_SHIFT - 10), | ||
| 3294 | chunk_kb ? chunk_kb : bitmap->chunksize, | ||
| 3295 | chunk_kb ? "KB" : "B"); | ||
| 3296 | if (bitmap->file) { | ||
| 3297 | seq_printf(seq, ", file: "); | ||
| 3298 | seq_path(seq, bitmap->file->f_vfsmnt, | ||
| 3299 | bitmap->file->f_dentry," \t\n"); | ||
| 3300 | } | ||
| 3301 | |||
| 3302 | seq_printf(seq, "\n"); | ||
| 3303 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
| 3077 | } | 3304 | } |
| 3078 | 3305 | ||
| 3079 | seq_printf(seq, "\n"); | 3306 | seq_printf(seq, "\n"); |
| @@ -3176,19 +3403,28 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) | |||
| 3176 | } | 3403 | } |
| 3177 | 3404 | ||
| 3178 | 3405 | ||
| 3179 | void md_write_start(mddev_t *mddev) | 3406 | /* md_write_start(mddev, bi) |
| 3407 | * If we need to update some array metadata (e.g. 'active' flag | ||
| 3408 | * in superblock) before writing, schedule a superblock update | ||
| 3409 | * and wait for it to complete. | ||
| 3410 | */ | ||
| 3411 | void md_write_start(mddev_t *mddev, struct bio *bi) | ||
| 3180 | { | 3412 | { |
| 3181 | if (!atomic_read(&mddev->writes_pending)) { | 3413 | DEFINE_WAIT(w); |
| 3182 | mddev_lock_uninterruptible(mddev); | 3414 | if (bio_data_dir(bi) != WRITE) |
| 3415 | return; | ||
| 3416 | |||
| 3417 | atomic_inc(&mddev->writes_pending); | ||
| 3418 | if (mddev->in_sync) { | ||
| 3419 | spin_lock(&mddev->write_lock); | ||
| 3183 | if (mddev->in_sync) { | 3420 | if (mddev->in_sync) { |
| 3184 | mddev->in_sync = 0; | 3421 | mddev->in_sync = 0; |
| 3185 | del_timer(&mddev->safemode_timer); | 3422 | mddev->sb_dirty = 1; |
| 3186 | md_update_sb(mddev); | 3423 | md_wakeup_thread(mddev->thread); |
| 3187 | } | 3424 | } |
| 3188 | atomic_inc(&mddev->writes_pending); | 3425 | spin_unlock(&mddev->write_lock); |
| 3189 | mddev_unlock(mddev); | 3426 | } |
| 3190 | } else | 3427 | wait_event(mddev->sb_wait, mddev->sb_dirty==0); |
| 3191 | atomic_inc(&mddev->writes_pending); | ||
| 3192 | } | 3428 | } |
| 3193 | 3429 | ||
| 3194 | void md_write_end(mddev_t *mddev) | 3430 | void md_write_end(mddev_t *mddev) |
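md_write_start() now takes the bio and no longer writes the superblock itself: for a write it flags the array active (in_sync = 0, sb_dirty = 1) under write_lock, wakes the md thread, and sleeps on sb_wait until md_update_sb() has cleared sb_dirty. The toy program below reproduces only that flag handshake in userspace; every identifier is illustrative, and one condition variable stands in for both md_wakeup_thread() and the sb_wait queue. Build with cc -pthread.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sb_wait = PTHREAD_COND_INITIALIZER;
static int sb_dirty;

static void *md_thread(void *unused)	/* plays md_check_recovery()/md_update_sb() */
{
	(void)unused;
	pthread_mutex_lock(&lock);
	while (!sb_dirty)
		pthread_cond_wait(&sb_wait, &lock);
	/* ... superblocks would be written out here ... */
	sb_dirty = 0;
	pthread_cond_broadcast(&sb_wait);
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void write_start(void)		/* plays md_write_start() for a WRITE bio */
{
	pthread_mutex_lock(&lock);
	sb_dirty = 1;				/* array is about to go "active" */
	pthread_cond_broadcast(&sb_wait);	/* md_wakeup_thread() */
	while (sb_dirty)			/* wait_event(sb_wait, sb_dirty == 0) */
		pthread_cond_wait(&sb_wait, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, md_thread, NULL);
	write_start();			/* returns once the metadata marks us active */
	puts("write may proceed");
	pthread_join(t, NULL);
	return 0;
}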
| @@ -3201,37 +3437,6 @@ void md_write_end(mddev_t *mddev) | |||
| 3201 | } | 3437 | } |
| 3202 | } | 3438 | } |
| 3203 | 3439 | ||
| 3204 | static inline void md_enter_safemode(mddev_t *mddev) | ||
| 3205 | { | ||
| 3206 | if (!mddev->safemode) return; | ||
| 3207 | if (mddev->safemode == 2 && | ||
| 3208 | (atomic_read(&mddev->writes_pending) || mddev->in_sync || | ||
| 3209 | mddev->recovery_cp != MaxSector)) | ||
| 3210 | return; /* avoid the lock */ | ||
| 3211 | mddev_lock_uninterruptible(mddev); | ||
| 3212 | if (mddev->safemode && !atomic_read(&mddev->writes_pending) && | ||
| 3213 | !mddev->in_sync && mddev->recovery_cp == MaxSector) { | ||
| 3214 | mddev->in_sync = 1; | ||
| 3215 | md_update_sb(mddev); | ||
| 3216 | } | ||
| 3217 | mddev_unlock(mddev); | ||
| 3218 | |||
| 3219 | if (mddev->safemode == 1) | ||
| 3220 | mddev->safemode = 0; | ||
| 3221 | } | ||
| 3222 | |||
| 3223 | void md_handle_safemode(mddev_t *mddev) | ||
| 3224 | { | ||
| 3225 | if (signal_pending(current)) { | ||
| 3226 | printk(KERN_INFO "md: %s in immediate safe mode\n", | ||
| 3227 | mdname(mddev)); | ||
| 3228 | mddev->safemode = 2; | ||
| 3229 | flush_signals(current); | ||
| 3230 | } | ||
| 3231 | md_enter_safemode(mddev); | ||
| 3232 | } | ||
| 3233 | |||
| 3234 | |||
| 3235 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | 3440 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
| 3236 | 3441 | ||
| 3237 | #define SYNC_MARKS 10 | 3442 | #define SYNC_MARKS 10 |
| @@ -3241,12 +3446,13 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3241 | mddev_t *mddev2; | 3446 | mddev_t *mddev2; |
| 3242 | unsigned int currspeed = 0, | 3447 | unsigned int currspeed = 0, |
| 3243 | window; | 3448 | window; |
| 3244 | sector_t max_sectors,j; | 3449 | sector_t max_sectors,j, io_sectors; |
| 3245 | unsigned long mark[SYNC_MARKS]; | 3450 | unsigned long mark[SYNC_MARKS]; |
| 3246 | sector_t mark_cnt[SYNC_MARKS]; | 3451 | sector_t mark_cnt[SYNC_MARKS]; |
| 3247 | int last_mark,m; | 3452 | int last_mark,m; |
| 3248 | struct list_head *tmp; | 3453 | struct list_head *tmp; |
| 3249 | sector_t last_check; | 3454 | sector_t last_check; |
| 3455 | int skipped = 0; | ||
| 3250 | 3456 | ||
| 3251 | /* just incase thread restarts... */ | 3457 | /* just incase thread restarts... */ |
| 3252 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) | 3458 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) |
| @@ -3312,7 +3518,7 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3312 | 3518 | ||
| 3313 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 3519 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
| 3314 | /* resync follows the size requested by the personality, | 3520 | /* resync follows the size requested by the personality, |
| 3315 | * which default to physical size, but can be virtual size | 3521 | * which defaults to physical size, but can be virtual size |
| 3316 | */ | 3522 | */ |
| 3317 | max_sectors = mddev->resync_max_sectors; | 3523 | max_sectors = mddev->resync_max_sectors; |
| 3318 | else | 3524 | else |
| @@ -3327,13 +3533,15 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3327 | sysctl_speed_limit_max); | 3533 | sysctl_speed_limit_max); |
| 3328 | 3534 | ||
| 3329 | is_mddev_idle(mddev); /* this also initializes IO event counters */ | 3535 | is_mddev_idle(mddev); /* this also initializes IO event counters */ |
| 3330 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 3536 | /* we don't use the checkpoint if there's a bitmap */ |
| 3537 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap) | ||
| 3331 | j = mddev->recovery_cp; | 3538 | j = mddev->recovery_cp; |
| 3332 | else | 3539 | else |
| 3333 | j = 0; | 3540 | j = 0; |
| 3541 | io_sectors = 0; | ||
| 3334 | for (m = 0; m < SYNC_MARKS; m++) { | 3542 | for (m = 0; m < SYNC_MARKS; m++) { |
| 3335 | mark[m] = jiffies; | 3543 | mark[m] = jiffies; |
| 3336 | mark_cnt[m] = j; | 3544 | mark_cnt[m] = io_sectors; |
| 3337 | } | 3545 | } |
| 3338 | last_mark = 0; | 3546 | last_mark = 0; |
| 3339 | mddev->resync_mark = mark[last_mark]; | 3547 | mddev->resync_mark = mark[last_mark]; |
| @@ -3358,21 +3566,29 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3358 | } | 3566 | } |
| 3359 | 3567 | ||
| 3360 | while (j < max_sectors) { | 3568 | while (j < max_sectors) { |
| 3361 | int sectors; | 3569 | sector_t sectors; |
| 3362 | 3570 | ||
| 3363 | sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); | 3571 | skipped = 0; |
| 3364 | if (sectors < 0) { | 3572 | sectors = mddev->pers->sync_request(mddev, j, &skipped, |
| 3573 | currspeed < sysctl_speed_limit_min); | ||
| 3574 | if (sectors == 0) { | ||
| 3365 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | 3575 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); |
| 3366 | goto out; | 3576 | goto out; |
| 3367 | } | 3577 | } |
| 3368 | atomic_add(sectors, &mddev->recovery_active); | 3578 | |
| 3579 | if (!skipped) { /* actual IO requested */ | ||
| 3580 | io_sectors += sectors; | ||
| 3581 | atomic_add(sectors, &mddev->recovery_active); | ||
| 3582 | } | ||
| 3583 | |||
| 3369 | j += sectors; | 3584 | j += sectors; |
| 3370 | if (j>1) mddev->curr_resync = j; | 3585 | if (j>1) mddev->curr_resync = j; |
| 3371 | 3586 | ||
| 3372 | if (last_check + window > j || j == max_sectors) | 3587 | |
| 3588 | if (last_check + window > io_sectors || j == max_sectors) | ||
| 3373 | continue; | 3589 | continue; |
| 3374 | 3590 | ||
| 3375 | last_check = j; | 3591 | last_check = io_sectors; |
| 3376 | 3592 | ||
| 3377 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || | 3593 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || |
| 3378 | test_bit(MD_RECOVERY_ERR, &mddev->recovery)) | 3594 | test_bit(MD_RECOVERY_ERR, &mddev->recovery)) |
| @@ -3386,7 +3602,7 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3386 | mddev->resync_mark = mark[next]; | 3602 | mddev->resync_mark = mark[next]; |
| 3387 | mddev->resync_mark_cnt = mark_cnt[next]; | 3603 | mddev->resync_mark_cnt = mark_cnt[next]; |
| 3388 | mark[next] = jiffies; | 3604 | mark[next] = jiffies; |
| 3389 | mark_cnt[next] = j - atomic_read(&mddev->recovery_active); | 3605 | mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); |
| 3390 | last_mark = next; | 3606 | last_mark = next; |
| 3391 | } | 3607 | } |
| 3392 | 3608 | ||
| @@ -3413,7 +3629,8 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3413 | mddev->queue->unplug_fn(mddev->queue); | 3629 | mddev->queue->unplug_fn(mddev->queue); |
| 3414 | cond_resched(); | 3630 | cond_resched(); |
| 3415 | 3631 | ||
| 3416 | currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; | 3632 | currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 |
| 3633 | /((jiffies-mddev->resync_mark)/HZ +1) +1; | ||
| 3417 | 3634 | ||
| 3418 | if (currspeed > sysctl_speed_limit_min) { | 3635 | if (currspeed > sysctl_speed_limit_min) { |
| 3419 | if ((currspeed > sysctl_speed_limit_max) || | 3636 | if ((currspeed > sysctl_speed_limit_max) || |
| @@ -3433,7 +3650,7 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3433 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 3650 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
| 3434 | 3651 | ||
| 3435 | /* tell personality that we are finished */ | 3652 | /* tell personality that we are finished */ |
| 3436 | mddev->pers->sync_request(mddev, max_sectors, 1); | 3653 | mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); |
| 3437 | 3654 | ||
| 3438 | if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && | 3655 | if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && |
| 3439 | mddev->curr_resync > 2 && | 3656 | mddev->curr_resync > 2 && |
| @@ -3447,7 +3664,6 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3447 | mddev->recovery_cp = MaxSector; | 3664 | mddev->recovery_cp = MaxSector; |
| 3448 | } | 3665 | } |
| 3449 | 3666 | ||
| 3450 | md_enter_safemode(mddev); | ||
| 3451 | skip: | 3667 | skip: |
| 3452 | mddev->curr_resync = 0; | 3668 | mddev->curr_resync = 0; |
| 3453 | wake_up(&resync_wait); | 3669 | wake_up(&resync_wait); |
| @@ -3484,20 +3700,48 @@ void md_check_recovery(mddev_t *mddev) | |||
| 3484 | struct list_head *rtmp; | 3700 | struct list_head *rtmp; |
| 3485 | 3701 | ||
| 3486 | 3702 | ||
| 3487 | dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); | 3703 | if (mddev->bitmap) |
| 3704 | bitmap_daemon_work(mddev->bitmap); | ||
| 3488 | 3705 | ||
| 3489 | if (mddev->ro) | 3706 | if (mddev->ro) |
| 3490 | return; | 3707 | return; |
| 3708 | |||
| 3709 | if (signal_pending(current)) { | ||
| 3710 | if (mddev->pers->sync_request) { | ||
| 3711 | printk(KERN_INFO "md: %s in immediate safe mode\n", | ||
| 3712 | mdname(mddev)); | ||
| 3713 | mddev->safemode = 2; | ||
| 3714 | } | ||
| 3715 | flush_signals(current); | ||
| 3716 | } | ||
| 3717 | |||
| 3491 | if ( ! ( | 3718 | if ( ! ( |
| 3492 | mddev->sb_dirty || | 3719 | mddev->sb_dirty || |
| 3493 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || | 3720 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || |
| 3494 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) | 3721 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || |
| 3722 | (mddev->safemode == 1) || | ||
| 3723 | (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) | ||
| 3724 | && !mddev->in_sync && mddev->recovery_cp == MaxSector) | ||
| 3495 | )) | 3725 | )) |
| 3496 | return; | 3726 | return; |
| 3727 | |||
| 3497 | if (mddev_trylock(mddev)==0) { | 3728 | if (mddev_trylock(mddev)==0) { |
| 3498 | int spares =0; | 3729 | int spares =0; |
| 3730 | |||
| 3731 | spin_lock(&mddev->write_lock); | ||
| 3732 | if (mddev->safemode && !atomic_read(&mddev->writes_pending) && | ||
| 3733 | !mddev->in_sync && mddev->recovery_cp == MaxSector) { | ||
| 3734 | mddev->in_sync = 1; | ||
| 3735 | mddev->sb_dirty = 1; | ||
| 3736 | } | ||
| 3737 | if (mddev->safemode == 1) | ||
| 3738 | mddev->safemode = 0; | ||
| 3739 | spin_unlock(&mddev->write_lock); | ||
| 3740 | |||
| 3499 | if (mddev->sb_dirty) | 3741 | if (mddev->sb_dirty) |
| 3500 | md_update_sb(mddev); | 3742 | md_update_sb(mddev); |
| 3743 | |||
| 3744 | |||
| 3501 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 3745 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
| 3502 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | 3746 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
| 3503 | /* resync/recovery still happening */ | 3747 | /* resync/recovery still happening */ |
| @@ -3515,6 +3759,14 @@ void md_check_recovery(mddev_t *mddev) | |||
| 3515 | mddev->pers->spare_active(mddev); | 3759 | mddev->pers->spare_active(mddev); |
| 3516 | } | 3760 | } |
| 3517 | md_update_sb(mddev); | 3761 | md_update_sb(mddev); |
| 3762 | |||
| 3763 | /* if array is no-longer degraded, then any saved_raid_disk | ||
| 3764 | * information must be scrapped | ||
| 3765 | */ | ||
| 3766 | if (!mddev->degraded) | ||
| 3767 | ITERATE_RDEV(mddev,rdev,rtmp) | ||
| 3768 | rdev->saved_raid_disk = -1; | ||
| 3769 | |||
| 3518 | mddev->recovery = 0; | 3770 | mddev->recovery = 0; |
| 3519 | /* flag recovery needed just to double check */ | 3771 | /* flag recovery needed just to double check */ |
| 3520 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 3772 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| @@ -3557,6 +3809,13 @@ void md_check_recovery(mddev_t *mddev) | |||
| 3557 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | 3809 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
| 3558 | if (!spares) | 3810 | if (!spares) |
| 3559 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 3811 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
| 3812 | if (spares && mddev->bitmap && ! mddev->bitmap->file) { | ||
| 3813 | /* We are adding a device or devices to an array | ||
| 3814 | * which has the bitmap stored on all devices. | ||
| 3815 | * So make sure all bitmap pages get written | ||
| 3816 | */ | ||
| 3817 | bitmap_write_all(mddev->bitmap); | ||
| 3818 | } | ||
| 3560 | mddev->sync_thread = md_register_thread(md_do_sync, | 3819 | mddev->sync_thread = md_register_thread(md_do_sync, |
| 3561 | mddev, | 3820 | mddev, |
| 3562 | "%s_resync"); | 3821 | "%s_resync"); |
| @@ -3624,6 +3883,8 @@ static int __init md_init(void) | |||
| 3624 | " MD_SB_DISKS=%d\n", | 3883 | " MD_SB_DISKS=%d\n", |
| 3625 | MD_MAJOR_VERSION, MD_MINOR_VERSION, | 3884 | MD_MAJOR_VERSION, MD_MINOR_VERSION, |
| 3626 | MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); | 3885 | MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); |
| 3886 | printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR, | ||
| 3887 | BITMAP_MINOR); | ||
| 3627 | 3888 | ||
| 3628 | if (register_blkdev(MAJOR_NR, "md")) | 3889 | if (register_blkdev(MAJOR_NR, "md")) |
| 3629 | return -1; | 3890 | return -1; |
| @@ -3739,7 +4000,6 @@ EXPORT_SYMBOL(md_error); | |||
| 3739 | EXPORT_SYMBOL(md_done_sync); | 4000 | EXPORT_SYMBOL(md_done_sync); |
| 3740 | EXPORT_SYMBOL(md_write_start); | 4001 | EXPORT_SYMBOL(md_write_start); |
| 3741 | EXPORT_SYMBOL(md_write_end); | 4002 | EXPORT_SYMBOL(md_write_end); |
| 3742 | EXPORT_SYMBOL(md_handle_safemode); | ||
| 3743 | EXPORT_SYMBOL(md_register_thread); | 4003 | EXPORT_SYMBOL(md_register_thread); |
| 3744 | EXPORT_SYMBOL(md_unregister_thread); | 4004 | EXPORT_SYMBOL(md_unregister_thread); |
| 3745 | EXPORT_SYMBOL(md_wakeup_thread); | 4005 | EXPORT_SYMBOL(md_wakeup_thread); |
