diff options
author | NeilBrown <neilb@cse.unsw.edu.au> | 2005-06-21 20:17:14 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-06-21 22:07:43 -0400 |
commit | 32a7627cf3a35396a8e834faf34e38ae9f3b1309 (patch) | |
tree | 3fe7764f5d8e39d835a397e1099358d924b02981 /drivers/md/md.c | |
parent | 57afd89f98a990747445f01c458ecae64263b2f8 (diff) |
[PATCH] md: optimised resync using Bitmap based intent logging
With this patch, the intent to write to some block in the array can be logged
to a bitmap file. Each bit represents some number of sectors and is set
before any update happens, and is only cleared when all writes relating to all
of the sectors it covers are complete.
After an unclean shutdown, information in this bitmap can be used to optimise
resync - only sectors which could be out-of-sync need to be updated.
Also if a drive is removed and then added back into an array, the recovery can
make use of the bitmap to optimise reconstruction. This is not implemented in
this patch.
Currently the bitmap is stored in a file which must (obviously) be stored on a
separate device.
This patch only provides infrastructure. It does not update any personalities
to use bitmap intent logging.
Md arrays can still be used with no bitmap file. This patch has minimal
impact on such arrays.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 172 |
1 files changed, 158 insertions, 14 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index fa608a1a5c20..c402f6cc7047 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -19,6 +19,9 @@ | |||
19 | 19 | ||
20 | Neil Brown <neilb@cse.unsw.edu.au>. | 20 | Neil Brown <neilb@cse.unsw.edu.au>. |
21 | 21 | ||
22 | - persistent bitmap code | ||
23 | Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. | ||
24 | |||
22 | This program is free software; you can redistribute it and/or modify | 25 | This program is free software; you can redistribute it and/or modify |
23 | it under the terms of the GNU General Public License as published by | 26 | it under the terms of the GNU General Public License as published by |
24 | the Free Software Foundation; either version 2, or (at your option) | 27 | the Free Software Foundation; either version 2, or (at your option) |
@@ -33,6 +36,7 @@ | |||
33 | #include <linux/config.h> | 36 | #include <linux/config.h> |
34 | #include <linux/linkage.h> | 37 | #include <linux/linkage.h> |
35 | #include <linux/raid/md.h> | 38 | #include <linux/raid/md.h> |
39 | #include <linux/raid/bitmap.h> | ||
36 | #include <linux/sysctl.h> | 40 | #include <linux/sysctl.h> |
37 | #include <linux/devfs_fs_kernel.h> | 41 | #include <linux/devfs_fs_kernel.h> |
38 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | 42 | #include <linux/buffer_head.h> /* for invalidate_bdev */ |
@@ -40,6 +44,8 @@ | |||
40 | 44 | ||
41 | #include <linux/init.h> | 45 | #include <linux/init.h> |
42 | 46 | ||
47 | #include <linux/file.h> | ||
48 | |||
43 | #ifdef CONFIG_KMOD | 49 | #ifdef CONFIG_KMOD |
44 | #include <linux/kmod.h> | 50 | #include <linux/kmod.h> |
45 | #endif | 51 | #endif |
@@ -1198,8 +1204,11 @@ void md_print_devices(void) | |||
1198 | printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); | 1204 | printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); |
1199 | printk("md: **********************************\n"); | 1205 | printk("md: **********************************\n"); |
1200 | ITERATE_MDDEV(mddev,tmp) { | 1206 | ITERATE_MDDEV(mddev,tmp) { |
1201 | printk("%s: ", mdname(mddev)); | ||
1202 | 1207 | ||
1208 | if (mddev->bitmap) | ||
1209 | bitmap_print_sb(mddev->bitmap); | ||
1210 | else | ||
1211 | printk("%s: ", mdname(mddev)); | ||
1203 | ITERATE_RDEV(mddev,rdev,tmp2) | 1212 | ITERATE_RDEV(mddev,rdev,tmp2) |
1204 | printk("<%s>", bdevname(rdev->bdev,b)); | 1213 | printk("<%s>", bdevname(rdev->bdev,b)); |
1205 | printk("\n"); | 1214 | printk("\n"); |
@@ -1287,7 +1296,7 @@ repeat: | |||
1287 | "md: updating %s RAID superblock on device (in sync %d)\n", | 1296 | "md: updating %s RAID superblock on device (in sync %d)\n", |
1288 | mdname(mddev),mddev->in_sync); | 1297 | mdname(mddev),mddev->in_sync); |
1289 | 1298 | ||
1290 | err = 0; | 1299 | err = bitmap_update_sb(mddev->bitmap); |
1291 | ITERATE_RDEV(mddev,rdev,tmp) { | 1300 | ITERATE_RDEV(mddev,rdev,tmp) { |
1292 | char b[BDEVNAME_SIZE]; | 1301 | char b[BDEVNAME_SIZE]; |
1293 | dprintk(KERN_INFO "md: "); | 1302 | dprintk(KERN_INFO "md: "); |
@@ -1624,12 +1633,19 @@ static int do_md_run(mddev_t * mddev) | |||
1624 | 1633 | ||
1625 | mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ | 1634 | mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ |
1626 | 1635 | ||
1627 | err = mddev->pers->run(mddev); | 1636 | /* before we start the array running, initialise the bitmap */ |
1637 | err = bitmap_create(mddev); | ||
1638 | if (err) | ||
1639 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | ||
1640 | mdname(mddev), err); | ||
1641 | else | ||
1642 | err = mddev->pers->run(mddev); | ||
1628 | if (err) { | 1643 | if (err) { |
1629 | printk(KERN_ERR "md: pers->run() failed ...\n"); | 1644 | printk(KERN_ERR "md: pers->run() failed ...\n"); |
1630 | module_put(mddev->pers->owner); | 1645 | module_put(mddev->pers->owner); |
1631 | mddev->pers = NULL; | 1646 | mddev->pers = NULL; |
1632 | return -EINVAL; | 1647 | bitmap_destroy(mddev); |
1648 | return err; | ||
1633 | } | 1649 | } |
1634 | atomic_set(&mddev->writes_pending,0); | 1650 | atomic_set(&mddev->writes_pending,0); |
1635 | mddev->safemode = 0; | 1651 | mddev->safemode = 0; |
@@ -1742,6 +1758,14 @@ static int do_md_stop(mddev_t * mddev, int ro) | |||
1742 | if (ro) | 1758 | if (ro) |
1743 | set_disk_ro(disk, 1); | 1759 | set_disk_ro(disk, 1); |
1744 | } | 1760 | } |
1761 | |||
1762 | bitmap_destroy(mddev); | ||
1763 | if (mddev->bitmap_file) { | ||
1764 | atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); | ||
1765 | fput(mddev->bitmap_file); | ||
1766 | mddev->bitmap_file = NULL; | ||
1767 | } | ||
1768 | |||
1745 | /* | 1769 | /* |
1746 | * Free resources if final stop | 1770 | * Free resources if final stop |
1747 | */ | 1771 | */ |
@@ -2000,6 +2024,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
2000 | return 0; | 2024 | return 0; |
2001 | } | 2025 | } |
2002 | 2026 | ||
2027 | static int get_bitmap_file(mddev_t * mddev, void * arg) | ||
2028 | { | ||
2029 | mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ | ||
2030 | char *ptr, *buf = NULL; | ||
2031 | int err = -ENOMEM; | ||
2032 | |||
2033 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
2034 | if (!file) | ||
2035 | goto out; | ||
2036 | |||
2037 | /* bitmap disabled, zero the first byte and copy out */ | ||
2038 | if (!mddev->bitmap || !mddev->bitmap->file) { | ||
2039 | file->pathname[0] = '\0'; | ||
2040 | goto copy_out; | ||
2041 | } | ||
2042 | |||
2043 | buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); | ||
2044 | if (!buf) | ||
2045 | goto out; | ||
2046 | |||
2047 | ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); | ||
2048 | if (!ptr) | ||
2049 | goto out; | ||
2050 | |||
2051 | strcpy(file->pathname, ptr); | ||
2052 | |||
2053 | copy_out: | ||
2054 | err = 0; | ||
2055 | if (copy_to_user(arg, file, sizeof(*file))) | ||
2056 | err = -EFAULT; | ||
2057 | out: | ||
2058 | kfree(buf); | ||
2059 | kfree(file); | ||
2060 | return err; | ||
2061 | } | ||
2062 | |||
2003 | static int get_disk_info(mddev_t * mddev, void __user * arg) | 2063 | static int get_disk_info(mddev_t * mddev, void __user * arg) |
2004 | { | 2064 | { |
2005 | mdu_disk_info_t info; | 2065 | mdu_disk_info_t info; |
@@ -2275,6 +2335,48 @@ abort_export: | |||
2275 | return err; | 2335 | return err; |
2276 | } | 2336 | } |
2277 | 2337 | ||
2338 | /* similar to deny_write_access, but accounts for our holding a reference | ||
2339 | * to the file ourselves */ | ||
2340 | static int deny_bitmap_write_access(struct file * file) | ||
2341 | { | ||
2342 | struct inode *inode = file->f_mapping->host; | ||
2343 | |||
2344 | spin_lock(&inode->i_lock); | ||
2345 | if (atomic_read(&inode->i_writecount) > 1) { | ||
2346 | spin_unlock(&inode->i_lock); | ||
2347 | return -ETXTBSY; | ||
2348 | } | ||
2349 | atomic_set(&inode->i_writecount, -1); | ||
2350 | spin_unlock(&inode->i_lock); | ||
2351 | |||
2352 | return 0; | ||
2353 | } | ||
2354 | |||
2355 | static int set_bitmap_file(mddev_t *mddev, int fd) | ||
2356 | { | ||
2357 | int err; | ||
2358 | |||
2359 | if (mddev->pers) | ||
2360 | return -EBUSY; | ||
2361 | |||
2362 | mddev->bitmap_file = fget(fd); | ||
2363 | |||
2364 | if (mddev->bitmap_file == NULL) { | ||
2365 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", | ||
2366 | mdname(mddev)); | ||
2367 | return -EBADF; | ||
2368 | } | ||
2369 | |||
2370 | err = deny_bitmap_write_access(mddev->bitmap_file); | ||
2371 | if (err) { | ||
2372 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | ||
2373 | mdname(mddev)); | ||
2374 | fput(mddev->bitmap_file); | ||
2375 | mddev->bitmap_file = NULL; | ||
2376 | } | ||
2377 | return err; | ||
2378 | } | ||
2379 | |||
2278 | /* | 2380 | /* |
2279 | * set_array_info is used two different ways | 2381 | * set_array_info is used two different ways |
2280 | * The original usage is when creating a new array. | 2382 | * The original usage is when creating a new array. |
@@ -2586,8 +2688,10 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
2586 | /* | 2688 | /* |
2587 | * Commands querying/configuring an existing array: | 2689 | * Commands querying/configuring an existing array: |
2588 | */ | 2690 | */ |
2589 | /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ | 2691 | /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, |
2590 | if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { | 2692 | * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ |
2693 | if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY | ||
2694 | && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { | ||
2591 | err = -ENODEV; | 2695 | err = -ENODEV; |
2592 | goto abort_unlock; | 2696 | goto abort_unlock; |
2593 | } | 2697 | } |
@@ -2601,6 +2705,10 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
2601 | err = get_array_info(mddev, argp); | 2705 | err = get_array_info(mddev, argp); |
2602 | goto done_unlock; | 2706 | goto done_unlock; |
2603 | 2707 | ||
2708 | case GET_BITMAP_FILE: | ||
2709 | err = get_bitmap_file(mddev, (void *)arg); | ||
2710 | goto done_unlock; | ||
2711 | |||
2604 | case GET_DISK_INFO: | 2712 | case GET_DISK_INFO: |
2605 | err = get_disk_info(mddev, argp); | 2713 | err = get_disk_info(mddev, argp); |
2606 | goto done_unlock; | 2714 | goto done_unlock; |
@@ -2681,6 +2789,10 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
2681 | err = do_md_run (mddev); | 2789 | err = do_md_run (mddev); |
2682 | goto done_unlock; | 2790 | goto done_unlock; |
2683 | 2791 | ||
2792 | case SET_BITMAP_FILE: | ||
2793 | err = set_bitmap_file(mddev, (int)arg); | ||
2794 | goto done_unlock; | ||
2795 | |||
2684 | default: | 2796 | default: |
2685 | if (_IOC_TYPE(cmd) == MD_MAJOR) | 2797 | if (_IOC_TYPE(cmd) == MD_MAJOR) |
2686 | printk(KERN_WARNING "md: %s(pid %d) used" | 2798 | printk(KERN_WARNING "md: %s(pid %d) used" |
@@ -2792,8 +2904,9 @@ static int md_thread(void * arg) | |||
2792 | while (thread->run) { | 2904 | while (thread->run) { |
2793 | void (*run)(mddev_t *); | 2905 | void (*run)(mddev_t *); |
2794 | 2906 | ||
2795 | wait_event_interruptible(thread->wqueue, | 2907 | wait_event_interruptible_timeout(thread->wqueue, |
2796 | test_bit(THREAD_WAKEUP, &thread->flags)); | 2908 | test_bit(THREAD_WAKEUP, &thread->flags), |
2909 | thread->timeout); | ||
2797 | if (current->flags & PF_FREEZE) | 2910 | if (current->flags & PF_FREEZE) |
2798 | refrigerator(PF_FREEZE); | 2911 | refrigerator(PF_FREEZE); |
2799 | 2912 | ||
@@ -2839,6 +2952,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
2839 | thread->run = run; | 2952 | thread->run = run; |
2840 | thread->mddev = mddev; | 2953 | thread->mddev = mddev; |
2841 | thread->name = name; | 2954 | thread->name = name; |
2955 | thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
2842 | ret = kernel_thread(md_thread, thread, 0); | 2956 | ret = kernel_thread(md_thread, thread, 0); |
2843 | if (ret < 0) { | 2957 | if (ret < 0) { |
2844 | kfree(thread); | 2958 | kfree(thread); |
@@ -2877,13 +2991,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
2877 | 2991 | ||
2878 | if (!rdev || rdev->faulty) | 2992 | if (!rdev || rdev->faulty) |
2879 | return; | 2993 | return; |
2880 | 2994 | /* | |
2881 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | 2995 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", |
2882 | mdname(mddev), | 2996 | mdname(mddev), |
2883 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | 2997 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), |
2884 | __builtin_return_address(0),__builtin_return_address(1), | 2998 | __builtin_return_address(0),__builtin_return_address(1), |
2885 | __builtin_return_address(2),__builtin_return_address(3)); | 2999 | __builtin_return_address(2),__builtin_return_address(3)); |
2886 | 3000 | */ | |
2887 | if (!mddev->pers->error_handler) | 3001 | if (!mddev->pers->error_handler) |
2888 | return; | 3002 | return; |
2889 | mddev->pers->error_handler(mddev,rdev); | 3003 | mddev->pers->error_handler(mddev,rdev); |
@@ -3037,6 +3151,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
3037 | struct list_head *tmp2; | 3151 | struct list_head *tmp2; |
3038 | mdk_rdev_t *rdev; | 3152 | mdk_rdev_t *rdev; |
3039 | int i; | 3153 | int i; |
3154 | struct bitmap *bitmap; | ||
3040 | 3155 | ||
3041 | if (v == (void*)1) { | 3156 | if (v == (void*)1) { |
3042 | seq_printf(seq, "Personalities : "); | 3157 | seq_printf(seq, "Personalities : "); |
@@ -3089,10 +3204,36 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
3089 | if (mddev->pers) { | 3204 | if (mddev->pers) { |
3090 | mddev->pers->status (seq, mddev); | 3205 | mddev->pers->status (seq, mddev); |
3091 | seq_printf(seq, "\n "); | 3206 | seq_printf(seq, "\n "); |
3092 | if (mddev->curr_resync > 2) | 3207 | if (mddev->curr_resync > 2) { |
3093 | status_resync (seq, mddev); | 3208 | status_resync (seq, mddev); |
3094 | else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) | 3209 | seq_printf(seq, "\n "); |
3095 | seq_printf(seq, " resync=DELAYED"); | 3210 | } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) |
3211 | seq_printf(seq, " resync=DELAYED\n "); | ||
3212 | } else | ||
3213 | seq_printf(seq, "\n "); | ||
3214 | |||
3215 | if ((bitmap = mddev->bitmap)) { | ||
3216 | char *buf, *path; | ||
3217 | unsigned long chunk_kb; | ||
3218 | unsigned long flags; | ||
3219 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
3220 | spin_lock_irqsave(&bitmap->lock, flags); | ||
3221 | chunk_kb = bitmap->chunksize >> 10; | ||
3222 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | ||
3223 | "%lu%s chunk", | ||
3224 | bitmap->pages - bitmap->missing_pages, | ||
3225 | bitmap->pages, | ||
3226 | (bitmap->pages - bitmap->missing_pages) | ||
3227 | << (PAGE_SHIFT - 10), | ||
3228 | chunk_kb ? chunk_kb : bitmap->chunksize, | ||
3229 | chunk_kb ? "KB" : "B"); | ||
3230 | if (bitmap->file && buf) { | ||
3231 | path = file_path(bitmap->file, buf, PAGE_SIZE); | ||
3232 | seq_printf(seq, ", file: %s", path ? path : ""); | ||
3233 | } | ||
3234 | seq_printf(seq, "\n"); | ||
3235 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
3236 | kfree(buf); | ||
3096 | } | 3237 | } |
3097 | 3238 | ||
3098 | seq_printf(seq, "\n"); | 3239 | seq_printf(seq, "\n"); |
@@ -3328,7 +3469,8 @@ static void md_do_sync(mddev_t *mddev) | |||
3328 | sysctl_speed_limit_max); | 3469 | sysctl_speed_limit_max); |
3329 | 3470 | ||
3330 | is_mddev_idle(mddev); /* this also initializes IO event counters */ | 3471 | is_mddev_idle(mddev); /* this also initializes IO event counters */ |
3331 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 3472 | /* we don't use the checkpoint if there's a bitmap */ |
3473 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap) | ||
3332 | j = mddev->recovery_cp; | 3474 | j = mddev->recovery_cp; |
3333 | else | 3475 | else |
3334 | j = 0; | 3476 | j = 0; |
@@ -3673,6 +3815,8 @@ static int __init md_init(void) | |||
3673 | " MD_SB_DISKS=%d\n", | 3815 | " MD_SB_DISKS=%d\n", |
3674 | MD_MAJOR_VERSION, MD_MINOR_VERSION, | 3816 | MD_MAJOR_VERSION, MD_MINOR_VERSION, |
3675 | MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); | 3817 | MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); |
3818 | printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR, | ||
3819 | BITMAP_MINOR); | ||
3676 | 3820 | ||
3677 | if (register_blkdev(MAJOR_NR, "md")) | 3821 | if (register_blkdev(MAJOR_NR, "md")) |
3678 | return -1; | 3822 | return -1; |