aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/md.c
diff options
context:
space:
mode:
authorNeilBrown <neilb@cse.unsw.edu.au>2005-06-21 20:17:14 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-21 22:07:43 -0400
commit32a7627cf3a35396a8e834faf34e38ae9f3b1309 (patch)
tree3fe7764f5d8e39d835a397e1099358d924b02981 /drivers/md/md.c
parent57afd89f98a990747445f01c458ecae64263b2f8 (diff)
[PATCH] md: optimised resync using Bitmap based intent logging
With this patch, the intent to write to some block in the array can be logged to a bitmap file. Each bit represents some number of sectors and is set before any update happens, and only cleared when all writes relating to all sectors are complete. After an unclean shutdown, information in this bitmap can be used to optimise resync - only sectors which could be out-of-sync need to be updated. Also if a drive is removed and then added back into an array, the recovery can make use of the bitmap to optimise reconstruction. This is not implemented in this patch. Currently the bitmap is stored in a file which must (obviously) be stored on a separate device. This patch only provides infrastructure. It does not update any personalities to use bitmap intent logging. Md arrays can still be used with no bitmap file. This patch has minimal impact on such arrays. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c172
1 files changed, 158 insertions, 14 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fa608a1a5c20..c402f6cc7047 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -19,6 +19,9 @@
19 19
20 Neil Brown <neilb@cse.unsw.edu.au>. 20 Neil Brown <neilb@cse.unsw.edu.au>.
21 21
22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
22 This program is free software; you can redistribute it and/or modify 25 This program is free software; you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by 26 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation; either version 2, or (at your option) 27 the Free Software Foundation; either version 2, or (at your option)
@@ -33,6 +36,7 @@
33#include <linux/config.h> 36#include <linux/config.h>
34#include <linux/linkage.h> 37#include <linux/linkage.h>
35#include <linux/raid/md.h> 38#include <linux/raid/md.h>
39#include <linux/raid/bitmap.h>
36#include <linux/sysctl.h> 40#include <linux/sysctl.h>
37#include <linux/devfs_fs_kernel.h> 41#include <linux/devfs_fs_kernel.h>
38#include <linux/buffer_head.h> /* for invalidate_bdev */ 42#include <linux/buffer_head.h> /* for invalidate_bdev */
@@ -40,6 +44,8 @@
40 44
41#include <linux/init.h> 45#include <linux/init.h>
42 46
47#include <linux/file.h>
48
43#ifdef CONFIG_KMOD 49#ifdef CONFIG_KMOD
44#include <linux/kmod.h> 50#include <linux/kmod.h>
45#endif 51#endif
@@ -1198,8 +1204,11 @@ void md_print_devices(void)
1198 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1204 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1199 printk("md: **********************************\n"); 1205 printk("md: **********************************\n");
1200 ITERATE_MDDEV(mddev,tmp) { 1206 ITERATE_MDDEV(mddev,tmp) {
1201 printk("%s: ", mdname(mddev));
1202 1207
1208 if (mddev->bitmap)
1209 bitmap_print_sb(mddev->bitmap);
1210 else
1211 printk("%s: ", mdname(mddev));
1203 ITERATE_RDEV(mddev,rdev,tmp2) 1212 ITERATE_RDEV(mddev,rdev,tmp2)
1204 printk("<%s>", bdevname(rdev->bdev,b)); 1213 printk("<%s>", bdevname(rdev->bdev,b));
1205 printk("\n"); 1214 printk("\n");
@@ -1287,7 +1296,7 @@ repeat:
1287 "md: updating %s RAID superblock on device (in sync %d)\n", 1296 "md: updating %s RAID superblock on device (in sync %d)\n",
1288 mdname(mddev),mddev->in_sync); 1297 mdname(mddev),mddev->in_sync);
1289 1298
1290 err = 0; 1299 err = bitmap_update_sb(mddev->bitmap);
1291 ITERATE_RDEV(mddev,rdev,tmp) { 1300 ITERATE_RDEV(mddev,rdev,tmp) {
1292 char b[BDEVNAME_SIZE]; 1301 char b[BDEVNAME_SIZE];
1293 dprintk(KERN_INFO "md: "); 1302 dprintk(KERN_INFO "md: ");
@@ -1624,12 +1633,19 @@ static int do_md_run(mddev_t * mddev)
1624 1633
1625 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1634 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
1626 1635
1627 err = mddev->pers->run(mddev); 1636 /* before we start the array running, initialise the bitmap */
1637 err = bitmap_create(mddev);
1638 if (err)
1639 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
1640 mdname(mddev), err);
1641 else
1642 err = mddev->pers->run(mddev);
1628 if (err) { 1643 if (err) {
1629 printk(KERN_ERR "md: pers->run() failed ...\n"); 1644 printk(KERN_ERR "md: pers->run() failed ...\n");
1630 module_put(mddev->pers->owner); 1645 module_put(mddev->pers->owner);
1631 mddev->pers = NULL; 1646 mddev->pers = NULL;
1632 return -EINVAL; 1647 bitmap_destroy(mddev);
1648 return err;
1633 } 1649 }
1634 atomic_set(&mddev->writes_pending,0); 1650 atomic_set(&mddev->writes_pending,0);
1635 mddev->safemode = 0; 1651 mddev->safemode = 0;
@@ -1742,6 +1758,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
1742 if (ro) 1758 if (ro)
1743 set_disk_ro(disk, 1); 1759 set_disk_ro(disk, 1);
1744 } 1760 }
1761
1762 bitmap_destroy(mddev);
1763 if (mddev->bitmap_file) {
1764 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
1765 fput(mddev->bitmap_file);
1766 mddev->bitmap_file = NULL;
1767 }
1768
1745 /* 1769 /*
1746 * Free resources if final stop 1770 * Free resources if final stop
1747 */ 1771 */
@@ -2000,6 +2024,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
2000 return 0; 2024 return 0;
2001} 2025}
2002 2026
2027static int get_bitmap_file(mddev_t * mddev, void * arg)
2028{
2029 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
2030 char *ptr, *buf = NULL;
2031 int err = -ENOMEM;
2032
2033 file = kmalloc(sizeof(*file), GFP_KERNEL);
2034 if (!file)
2035 goto out;
2036
2037 /* bitmap disabled, zero the first byte and copy out */
2038 if (!mddev->bitmap || !mddev->bitmap->file) {
2039 file->pathname[0] = '\0';
2040 goto copy_out;
2041 }
2042
2043 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
2044 if (!buf)
2045 goto out;
2046
2047 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
2048 if (!ptr)
2049 goto out;
2050
2051 strcpy(file->pathname, ptr);
2052
2053copy_out:
2054 err = 0;
2055 if (copy_to_user(arg, file, sizeof(*file)))
2056 err = -EFAULT;
2057out:
2058 kfree(buf);
2059 kfree(file);
2060 return err;
2061}
2062
2003static int get_disk_info(mddev_t * mddev, void __user * arg) 2063static int get_disk_info(mddev_t * mddev, void __user * arg)
2004{ 2064{
2005 mdu_disk_info_t info; 2065 mdu_disk_info_t info;
@@ -2275,6 +2335,48 @@ abort_export:
2275 return err; 2335 return err;
2276} 2336}
2277 2337
2338/* similar to deny_write_access, but accounts for our holding a reference
2339 * to the file ourselves */
2340static int deny_bitmap_write_access(struct file * file)
2341{
2342 struct inode *inode = file->f_mapping->host;
2343
2344 spin_lock(&inode->i_lock);
2345 if (atomic_read(&inode->i_writecount) > 1) {
2346 spin_unlock(&inode->i_lock);
2347 return -ETXTBSY;
2348 }
2349 atomic_set(&inode->i_writecount, -1);
2350 spin_unlock(&inode->i_lock);
2351
2352 return 0;
2353}
2354
2355static int set_bitmap_file(mddev_t *mddev, int fd)
2356{
2357 int err;
2358
2359 if (mddev->pers)
2360 return -EBUSY;
2361
2362 mddev->bitmap_file = fget(fd);
2363
2364 if (mddev->bitmap_file == NULL) {
2365 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
2366 mdname(mddev));
2367 return -EBADF;
2368 }
2369
2370 err = deny_bitmap_write_access(mddev->bitmap_file);
2371 if (err) {
2372 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
2373 mdname(mddev));
2374 fput(mddev->bitmap_file);
2375 mddev->bitmap_file = NULL;
2376 }
2377 return err;
2378}
2379
2278/* 2380/*
2279 * set_array_info is used two different ways 2381 * set_array_info is used two different ways
2280 * The original usage is when creating a new array. 2382 * The original usage is when creating a new array.
@@ -2586,8 +2688,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2586 /* 2688 /*
2587 * Commands querying/configuring an existing array: 2689 * Commands querying/configuring an existing array:
2588 */ 2690 */
2589 /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ 2691 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
2590 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { 2692 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
2693 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
2694 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
2591 err = -ENODEV; 2695 err = -ENODEV;
2592 goto abort_unlock; 2696 goto abort_unlock;
2593 } 2697 }
@@ -2601,6 +2705,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2601 err = get_array_info(mddev, argp); 2705 err = get_array_info(mddev, argp);
2602 goto done_unlock; 2706 goto done_unlock;
2603 2707
2708 case GET_BITMAP_FILE:
2709 err = get_bitmap_file(mddev, (void *)arg);
2710 goto done_unlock;
2711
2604 case GET_DISK_INFO: 2712 case GET_DISK_INFO:
2605 err = get_disk_info(mddev, argp); 2713 err = get_disk_info(mddev, argp);
2606 goto done_unlock; 2714 goto done_unlock;
@@ -2681,6 +2789,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2681 err = do_md_run (mddev); 2789 err = do_md_run (mddev);
2682 goto done_unlock; 2790 goto done_unlock;
2683 2791
2792 case SET_BITMAP_FILE:
2793 err = set_bitmap_file(mddev, (int)arg);
2794 goto done_unlock;
2795
2684 default: 2796 default:
2685 if (_IOC_TYPE(cmd) == MD_MAJOR) 2797 if (_IOC_TYPE(cmd) == MD_MAJOR)
2686 printk(KERN_WARNING "md: %s(pid %d) used" 2798 printk(KERN_WARNING "md: %s(pid %d) used"
@@ -2792,8 +2904,9 @@ static int md_thread(void * arg)
2792 while (thread->run) { 2904 while (thread->run) {
2793 void (*run)(mddev_t *); 2905 void (*run)(mddev_t *);
2794 2906
2795 wait_event_interruptible(thread->wqueue, 2907 wait_event_interruptible_timeout(thread->wqueue,
2796 test_bit(THREAD_WAKEUP, &thread->flags)); 2908 test_bit(THREAD_WAKEUP, &thread->flags),
2909 thread->timeout);
2797 if (current->flags & PF_FREEZE) 2910 if (current->flags & PF_FREEZE)
2798 refrigerator(PF_FREEZE); 2911 refrigerator(PF_FREEZE);
2799 2912
@@ -2839,6 +2952,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
2839 thread->run = run; 2952 thread->run = run;
2840 thread->mddev = mddev; 2953 thread->mddev = mddev;
2841 thread->name = name; 2954 thread->name = name;
2955 thread->timeout = MAX_SCHEDULE_TIMEOUT;
2842 ret = kernel_thread(md_thread, thread, 0); 2956 ret = kernel_thread(md_thread, thread, 0);
2843 if (ret < 0) { 2957 if (ret < 0) {
2844 kfree(thread); 2958 kfree(thread);
@@ -2877,13 +2991,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
2877 2991
2878 if (!rdev || rdev->faulty) 2992 if (!rdev || rdev->faulty)
2879 return; 2993 return;
2880 2994/*
2881 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 2995 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
2882 mdname(mddev), 2996 mdname(mddev),
2883 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 2997 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
2884 __builtin_return_address(0),__builtin_return_address(1), 2998 __builtin_return_address(0),__builtin_return_address(1),
2885 __builtin_return_address(2),__builtin_return_address(3)); 2999 __builtin_return_address(2),__builtin_return_address(3));
2886 3000*/
2887 if (!mddev->pers->error_handler) 3001 if (!mddev->pers->error_handler)
2888 return; 3002 return;
2889 mddev->pers->error_handler(mddev,rdev); 3003 mddev->pers->error_handler(mddev,rdev);
@@ -3037,6 +3151,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
3037 struct list_head *tmp2; 3151 struct list_head *tmp2;
3038 mdk_rdev_t *rdev; 3152 mdk_rdev_t *rdev;
3039 int i; 3153 int i;
3154 struct bitmap *bitmap;
3040 3155
3041 if (v == (void*)1) { 3156 if (v == (void*)1) {
3042 seq_printf(seq, "Personalities : "); 3157 seq_printf(seq, "Personalities : ");
@@ -3089,10 +3204,36 @@ static int md_seq_show(struct seq_file *seq, void *v)
3089 if (mddev->pers) { 3204 if (mddev->pers) {
3090 mddev->pers->status (seq, mddev); 3205 mddev->pers->status (seq, mddev);
3091 seq_printf(seq, "\n "); 3206 seq_printf(seq, "\n ");
3092 if (mddev->curr_resync > 2) 3207 if (mddev->curr_resync > 2) {
3093 status_resync (seq, mddev); 3208 status_resync (seq, mddev);
3094 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3209 seq_printf(seq, "\n ");
3095 seq_printf(seq, " resync=DELAYED"); 3210 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3211 seq_printf(seq, " resync=DELAYED\n ");
3212 } else
3213 seq_printf(seq, "\n ");
3214
3215 if ((bitmap = mddev->bitmap)) {
3216 char *buf, *path;
3217 unsigned long chunk_kb;
3218 unsigned long flags;
3219 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3220 spin_lock_irqsave(&bitmap->lock, flags);
3221 chunk_kb = bitmap->chunksize >> 10;
3222 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
3223 "%lu%s chunk",
3224 bitmap->pages - bitmap->missing_pages,
3225 bitmap->pages,
3226 (bitmap->pages - bitmap->missing_pages)
3227 << (PAGE_SHIFT - 10),
3228 chunk_kb ? chunk_kb : bitmap->chunksize,
3229 chunk_kb ? "KB" : "B");
3230 if (bitmap->file && buf) {
3231 path = file_path(bitmap->file, buf, PAGE_SIZE);
3232 seq_printf(seq, ", file: %s", path ? path : "");
3233 }
3234 seq_printf(seq, "\n");
3235 spin_unlock_irqrestore(&bitmap->lock, flags);
3236 kfree(buf);
3096 } 3237 }
3097 3238
3098 seq_printf(seq, "\n"); 3239 seq_printf(seq, "\n");
@@ -3328,7 +3469,8 @@ static void md_do_sync(mddev_t *mddev)
3328 sysctl_speed_limit_max); 3469 sysctl_speed_limit_max);
3329 3470
3330 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3471 is_mddev_idle(mddev); /* this also initializes IO event counters */
3331 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3472 /* we don't use the checkpoint if there's a bitmap */
3473 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
3332 j = mddev->recovery_cp; 3474 j = mddev->recovery_cp;
3333 else 3475 else
3334 j = 0; 3476 j = 0;
@@ -3673,6 +3815,8 @@ static int __init md_init(void)
3673 " MD_SB_DISKS=%d\n", 3815 " MD_SB_DISKS=%d\n",
3674 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3816 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3675 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3817 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3818 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
3819 BITMAP_MINOR);
3676 3820
3677 if (register_blkdev(MAJOR_NR, "md")) 3821 if (register_blkdev(MAJOR_NR, "md"))
3678 return -1; 3822 return -1;