From 36fa30636fb84b209210299684e1be66d9e58217 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: all hot-add and hot-remove of md intent logging bitmaps Both file-bitmaps and superblock bitmaps are supported. If you add a bitmap file on the array device, you lose. This introduces a 'default_bitmap_offset' field in mddev, as the ioctl used for adding a superblock bitmap doesn't have room for giving an offset. Later, this value will be setable via sysfs. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8c14ba565a45..817062bf7352 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -278,6 +278,10 @@ struct mddev_s * start of bitmap. May be * negative, but not '0' */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ struct list_head all_mddevs; }; @@ -314,6 +318,12 @@ struct mdk_personality_s int (*resize) (mddev_t *mddev, sector_t sectors); int (*reshape) (mddev_t *mddev, int raid_disks); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (mddev_t *mddev, int state); }; -- cgit v1.2.2 From 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: support write-mostly device in raid1 This allows a device in a raid1 to be marked as "write mostly". Read requests will only be sent if there is no other option. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 3 +++ include/linux/raid/md_p.h | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 817062bf7352..7ef78e15ce04 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -181,6 +181,9 @@ struct mdk_rdev_s int faulty; /* if faulty do not issue IO requests */ int in_sync; /* device is a full member of the array */ + unsigned long flags; /* Should include faulty and in_sync here. */ +#define WriteMostly 4 /* Avoid reading if at all possible */ + int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ int saved_raid_disk; /* role that device used to have in the diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index dc65cd435494..4f047f84fb1f 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -79,6 +79,11 @@ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. + * read requests will only be sent here in + * dire need + */ + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ @@ -193,7 +198,7 @@ struct mdp_superblock_1 { __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ - __u32 layout; /* only for raid5 currently */ + __u32 layout; /* only for raid5 and raid10 currently */ __u64 size; /* used size of component devices, in 512byte sectors */ __u32 chunksize; /* in 512byte sectors */ @@ -212,7 +217,9 @@ struct mdp_superblock_1 { __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ - __u8 pad2[64-56]; /* set to 0 when writing */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + __u8 pad2[64-57]; /* set to 0 when writing */ /* array state information - 64 bytes */ __u64 utime; /* 40 bits second, 24 btes microseconds */ -- cgit v1.2.2 From 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:47 -0700 Subject: [PATCH] md: add write-behind support for md/raid1 If a device is flagged 'WriteMostly' and the array has a bitmap, and the bitmap superblock indicates that write_behind is allowed, then write_behind is enabled for WriteMostly devices. Write requests will be acknowledges as complete to the caller (via b_end_io) when all non-WriteMostly devices have completed the write, but will not be cleared from the bitmap until all devices complete. This requires memory allocation to make a local copy of the data being written. If there is insufficient memory, then we fall-back on normal write semantics. Signed-Off-By: Paul Clements Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 15 ++++++++++----- include/linux/raid/md_k.h | 3 +++ include/linux/raid/raid1.h | 13 +++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 4bf1659f8aa8..9de99198caf1 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -7,7 +7,7 @@ #define BITMAP_H 1 #define BITMAP_MAJOR 3 -#define BITMAP_MINOR 38 +#define BITMAP_MINOR 39 /* * in-memory bitmap: @@ -147,8 +147,9 @@ typedef struct bitmap_super_s { __u32 state; /* 48 bitmap state information */ __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ - __u8 pad[256 - 60]; /* set to zero */ + __u8 pad[256 - 64]; /* set to zero */ } bitmap_super_t; /* notes: @@ -226,6 +227,9 @@ struct bitmap { unsigned long flags; + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + /* * the bitmap daemon - periodically wakes up and sweeps the bitmap * file, cleaning up bits and flushing out pages to disk as necessary @@ -260,9 +264,10 @@ int bitmap_setallbits(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap); /* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success); +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 7ef78e15ce04..2514e5fcda7f 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -275,6 +275,9 @@ struct mddev_s atomic_t writes_pending; request_queue_t *queue; /* for plugging ... */ + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + struct bitmap *bitmap; /* the bitmap for the device */ struct file *bitmap_file; /* the bitmap file */ long bitmap_offset; /* offset from superblock of diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index 9d93cf12e890..60e19b667548 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h @@ -80,6 +80,9 @@ struct r1bio_s { atomic_t remaining; /* 'have we finished' count, * used from IRQ handlers */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ sector_t sector; int sectors; unsigned long state; @@ -107,4 +110,14 @@ struct r1bio_s { #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 4 + #endif -- cgit v1.2.2 From 15945fee6f09bff1f86b1a735b5888dc59cf38e3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:47 -0700 Subject: [PATCH] md: support md/linear array with components greater than 2 terabytes. linear currently uses division by the size of the smallest componenet device to find which device a request goes to. If that smallest device is larger than 2 terabytes, then the division will not work on some systems. So we introduce a pre-shift, and take care not to make the hash table too large, much like the code in raid0. Also get rid of conf->nr_zones, which is not needed. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/linear.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index e04c4fe45b53..7eaf290e10e7 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h @@ -14,8 +14,8 @@ typedef struct dev_info dev_info_t; struct linear_private_data { dev_info_t **hash_table; - dev_info_t *smallest; - int nr_zones; + sector_t hash_spacing; + int preshift; /* shift before dividing by hash_spacing */ dev_info_t disks[0]; }; -- cgit v1.2.2 From 71c0805cb48462c99fbe0e5fcc6c12d7b9929c09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:51 -0700 Subject: [PATCH] md: allow md to load a superblock with feature-bit '1' set As this is used to flag an internal bitmap. Also, introduce symbolic names for feature bits. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_p.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 4f047f84fb1f..c100fa5d4bfa 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -238,5 +238,10 @@ struct mdp_superblock_1 { __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ }; +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 + +#define MD_FEATURE_ALL 1 + #endif -- cgit v1.2.2 From 773f7834425e83144c95fbbc553ced3c2b74b828 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: remove old cruft from md_k.h header file These inlines haven't been used for ages, they should go. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 64 ----------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 2514e5fcda7f..8042f55dd323 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -85,70 +85,6 @@ typedef struct mdk_rdev_s mdk_rdev_t; #define MAX_CHUNK_SIZE (4096*1024) -/* - * default readahead - */ - -static inline int disk_faulty(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_FAULTY); -} - -static inline int disk_active(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_ACTIVE); -} - -static inline int disk_sync(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_SYNC); -} - -static inline int disk_spare(mdp_disk_t * d) -{ - return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); -} - -static inline int disk_removed(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_faulty(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_FAULTY); -} - -static inline void mark_disk_active(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_sync(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_SYNC); -} - -static inline void mark_disk_spare(mdp_disk_t * d) -{ - d->state = 0; -} - -static inline void mark_disk_removed(mdp_disk_t * d) -{ - d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_inactive(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_nonsync(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_SYNC); -} - /* * MD's 'extended' device */ -- cgit v1.2.2 From 0002b2718dd04da67c21f8a7830de8d95a9b0345 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: limit size of sb read/written to appropriate amount version-1 superblocks are not (normally) 4K long, and can be of variable size. Writing the full 4K can cause corruption (but only in non-default configurations). With this patch the super-block-flavour can choose a size to read, and set a size to write based on what it finds. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8042f55dd323..ebce949b1443 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -102,6 +102,7 @@ struct mdk_rdev_s int sb_loaded; sector_t data_offset; /* start of data in array */ sector_t sb_offset; + int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ /* A device can be in one of three states based on two flags: -- cgit v1.2.2 From 72626685dc66d455742a7f215a0535c551628b9e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:54 -0700 Subject: [PATCH] md: add write-intent-bitmap support to raid5 Most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. If the raid thread notices that this has changed, it flushes bitmap changes, and assigned the value of seq_flush to seq_write. When a write request arrives, it is given the number from seq_write, and that write request may not complete until seq_flush is larger than the saved seq number. We have a new queue for storing stripes which are waiting for a bitmap flush and an extra flag for stripes to record if the write was 'degraded' and so should not clear the a bit in the bitmap. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index d63ddcb4afad..176fc653c284 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -134,6 +134,7 @@ struct stripe_head { unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ struct r5dev { struct bio req; struct bio_vec vec; @@ -165,12 +166,13 @@ struct stripe_head { /* * Stripe state */ -#define STRIPE_ERROR 1 #define STRIPE_HANDLE 2 #define STRIPE_SYNCING 3 #define STRIPE_INSYNC 4 #define STRIPE_PREREAD_ACTIVE 5 #define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 /* * Plugging: @@ -210,10 +212,20 @@ struct raid5_private_data { struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ char cache_name[20]; kmem_cache_t *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + /* * Free stripes pool */ -- cgit v1.2.2