Diffstat:
 Documentation/md.txt    |  72
 drivers/md/Kconfig      |   9
 drivers/md/bitmap.c     | 449
 drivers/md/bitmap.h     |  19
 drivers/md/faulty.c     |   1
 drivers/md/linear.c     |   3
 drivers/md/md.c         | 393
 drivers/md/md.h         |  51
 drivers/md/multipath.c  |   3
 drivers/md/raid0.c      |   3
 drivers/md/raid1.c      | 217
 drivers/md/raid1.h      |   5
 drivers/md/raid10.c     | 116
 drivers/md/raid5.c      |  63
 drivers/md/raid6algos.c |  20
 fs/compat_ioctl.c       |  18
 include/linux/raid/pq.h |  19
 17 files changed, 1146 insertions(+), 315 deletions(-)
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 4edd39ec7db9..188f4768f1d5 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -233,9 +233,9 @@ All md devices contain:
233 233
234 resync_start 234 resync_start
235 The point at which resync should start. If no resync is needed, 235 The point at which resync should start. If no resync is needed,
236 this will be a very large number. At array creation it will 236 this will be a very large number (or 'none' since 2.6.30-rc1). At
237 default to 0, though starting the array as 'clean' will 237 array creation it will default to 0, though starting the array as
238 set it much larger. 238 'clean' will set it much larger.
239 239
240 new_dev 240 new_dev
241 This file can be written but not read. The value written should 241 This file can be written but not read. The value written should
@@ -296,6 +296,51 @@ All md devices contain:
296 active-idle 296 active-idle
297 like active, but no writes have been seen for a while (safe_mode_delay). 297 like active, but no writes have been seen for a while (safe_mode_delay).
298 298
299 bitmap/location
300 This indicates where the write-intent bitmap for the array is
301 stored.
302 It can be one of "none", "file" or "[+-]N".
303 "file" may later be extended to "file:/file/name"
304 "[+-]N" means that many sectors from the start of the metadata.
305 This is replicated on all devices. For arrays with externally
306 managed metadata, the offset is from the beginning of the
307 device.
308 bitmap/chunksize
309 The size, in bytes, of the chunk which will be represented by a
310 single bit. For RAID456, it is a portion of an individual
311 device. For RAID10, it is a portion of the array. For RAID1, it
312 is both (they come to the same thing).
313 bitmap/time_base
314 The time, in seconds, between looking for bits in the bitmap to
315 be cleared. In the current implementation, a bit will be cleared
316 between 2 and 3 times "time_base" after all the covered blocks
317 are known to be in-sync.
318 bitmap/backlog
319 When write-mostly devices are active in a RAID1, write requests
320 to those devices proceed in the background - the filesystem (or
321 other user of the device) does not have to wait for them.
322 'backlog' sets a limit on the number of concurrent background
 323 writes. If there are more than this, new writes will be
324 synchronous.
325 bitmap/metadata
326 This can be either 'internal' or 'external'.
327 'internal' is the default and means the metadata for the bitmap
328 is stored in the first 256 bytes of the allocated space and is
329 managed by the md module.
330 'external' means that bitmap metadata is managed externally to
331 the kernel (i.e. by some userspace program)
332 bitmap/can_clear
333 This is either 'true' or 'false'. If 'true', then bits in the
334 bitmap will be cleared when the corresponding blocks are thought
335 to be in-sync. If 'false', bits will never be cleared.
336 This is automatically set to 'false' if a write happens on a
337 degraded array, or if the array becomes degraded during a write.
338 When metadata is managed externally, it should be set to true
339 once the array becomes non-degraded, and this fact has been
340 recorded in the metadata.
341
342
343
299 344
300As component devices are added to an md array, they appear in the 'md' 345As component devices are added to an md array, they appear in the 'md'
301directory as new directories named 346directory as new directories named
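The bitmap/* files above combine into a small configuration protocol: for an externally managed bitmap, chunksize and time_base must be written before location, since bitmap_create() rejects an external bitmap whose chunk size or daemon sleep is still zero. A minimal user-space sketch, assuming an array at /sys/block/md0/md (all values illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    /* Write one value to a bitmap/ attribute of the (assumed) array md0. */
    static void bitmap_set(const char *attr, const char *val)
    {
            char path[128];
            FILE *f;

            snprintf(path, sizeof(path), "/sys/block/md0/md/bitmap/%s", attr);
            f = fopen(path, "w");
            if (!f) {
                    perror(path);
                    exit(1);
            }
            fprintf(f, "%s\n", val);
            fclose(f);
    }

    int main(void)
    {
            bitmap_set("metadata", "external");  /* user space owns the bitmap sb */
            bitmap_set("chunksize", "65536");    /* bytes per bit, power of 2 */
            bitmap_set("time_base", "5");        /* seconds between clear scans */
            bitmap_set("backlog", "256");        /* write-behind limit */
            bitmap_set("location", "+8");        /* 8 sectors past the metadata */
            return 0;
    }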
@@ -334,8 +379,9 @@ Each directory contains:
334 Writing "writemostly" sets the writemostly flag. 379 Writing "writemostly" sets the writemostly flag.
335 Writing "-writemostly" clears the writemostly flag. 380 Writing "-writemostly" clears the writemostly flag.
336 Writing "blocked" sets the "blocked" flag. 381 Writing "blocked" sets the "blocked" flag.
337 Writing "-blocked" clear the "blocked" flag and allows writes 382 Writing "-blocked" clears the "blocked" flag and allows writes
338 to complete. 383 to complete.
384 Writing "in_sync" sets the in_sync flag.
339 385
340 This file responds to select/poll. Any change to 'faulty' 386 This file responds to select/poll. Any change to 'faulty'
341 or 'blocked' causes an event. 387 or 'blocked' causes an event.
@@ -372,6 +418,24 @@ Each directory contains:
372 array. If a value less than the current component_size is 418 array. If a value less than the current component_size is
373 written, it will be rejected. 419 written, it will be rejected.
374 420
421 recovery_start
422
423 When the device is not 'in_sync', this records the number of
424 sectors from the start of the device which are known to be
425 correct. This is normally zero, but during a recovery
 426 operation it will steadily increase, and if the recovery is
427 interrupted, restoring this value can cause recovery to
428 avoid repeating the earlier blocks. With v1.x metadata, this
429 value is saved and restored automatically.
430
431 This can be set whenever the device is not an active member of
432 the array, either before the array is activated, or before
433 the 'slot' is set.
434
435 Setting this to 'none' is equivalent to setting 'in_sync'.
436 Setting to any other value also clears the 'in_sync' flag.
437
438
375 439
 376An active md device will also contain an entry for each active device 440An active md device will also contain an entry for each active device
377in the array. These are named 441in the array. These are named
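recovery_start pairs naturally with 'slot': a hedged sketch of restoring an interrupted recovery for a device that has not yet been made an active member (the device directory name dev-sdb1 and the sector count are illustrative):

    #include <stdio.h>

    int main(void)
    {
            /* Value saved by user space when the previous recovery stopped. */
            unsigned long long sectors = 123456;
            FILE *f = fopen("/sys/block/md0/md/dev-sdb1/recovery_start", "w");

            if (!f)
                    return 1;
            /* Only valid while the device is not an active member, i.e.
             * before the array is activated or before 'slot' is written;
             * any value other than "none" also clears the in_sync flag. */
            fprintf(f, "%llu\n", sectors);
            fclose(f);
            return 0;
    }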
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2158377a1359..acb3a4e404ff 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -185,11 +185,10 @@ config MD_MULTIPATH
185 tristate "Multipath I/O support" 185 tristate "Multipath I/O support"
186 depends on BLK_DEV_MD 186 depends on BLK_DEV_MD
187 help 187 help
 188 Multipath-IO is the ability of certain devices to address the same 188 MD_MULTIPATH provides a simple multi-path personality for use with
189 physical disk over multiple 'IO paths'. The code ensures that such 189 the MD framework. It is not under active development. New
190 paths can be defined and handled at runtime, and ensures that a 190 projects should consider using DM_MULTIPATH which has more
191 transparent failover to the backup path(s) happens if a IO errors 191 features and more testing.
192 arrives on the primary path.
193 192
194 If unsure, say N. 193 If unsure, say N.
195 194
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 60e2b322db11..26ac8aad0b19 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,7 +212,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
212 */ 212 */
213 213
214/* IO operations when bitmap is stored near all superblocks */ 214/* IO operations when bitmap is stored near all superblocks */
215static struct page *read_sb_page(mddev_t *mddev, long offset, 215static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
216 struct page *page, 216 struct page *page,
217 unsigned long index, int size) 217 unsigned long index, int size)
218{ 218{
@@ -287,27 +287,36 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
287 287
288 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 288 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
289 int size = PAGE_SIZE; 289 int size = PAGE_SIZE;
290 loff_t offset = mddev->bitmap_info.offset;
290 if (page->index == bitmap->file_pages-1) 291 if (page->index == bitmap->file_pages-1)
291 size = roundup(bitmap->last_page_size, 292 size = roundup(bitmap->last_page_size,
292 bdev_logical_block_size(rdev->bdev)); 293 bdev_logical_block_size(rdev->bdev));
293 /* Just make sure we aren't corrupting data or 294 /* Just make sure we aren't corrupting data or
294 * metadata 295 * metadata
295 */ 296 */
296 if (bitmap->offset < 0) { 297 if (mddev->external) {
298 /* Bitmap could be anywhere. */
299 if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) >
300 rdev->data_offset &&
301 rdev->sb_start + offset <
302 rdev->data_offset + mddev->dev_sectors +
303 (PAGE_SIZE/512))
304 goto bad_alignment;
305 } else if (offset < 0) {
297 /* DATA BITMAP METADATA */ 306 /* DATA BITMAP METADATA */
298 if (bitmap->offset 307 if (offset
299 + (long)(page->index * (PAGE_SIZE/512)) 308 + (long)(page->index * (PAGE_SIZE/512))
300 + size/512 > 0) 309 + size/512 > 0)
301 /* bitmap runs in to metadata */ 310 /* bitmap runs in to metadata */
302 goto bad_alignment; 311 goto bad_alignment;
303 if (rdev->data_offset + mddev->dev_sectors 312 if (rdev->data_offset + mddev->dev_sectors
304 > rdev->sb_start + bitmap->offset) 313 > rdev->sb_start + offset)
305 /* data runs in to bitmap */ 314 /* data runs in to bitmap */
306 goto bad_alignment; 315 goto bad_alignment;
307 } else if (rdev->sb_start < rdev->data_offset) { 316 } else if (rdev->sb_start < rdev->data_offset) {
308 /* METADATA BITMAP DATA */ 317 /* METADATA BITMAP DATA */
309 if (rdev->sb_start 318 if (rdev->sb_start
310 + bitmap->offset 319 + offset
311 + page->index*(PAGE_SIZE/512) + size/512 320 + page->index*(PAGE_SIZE/512) + size/512
312 > rdev->data_offset) 321 > rdev->data_offset)
313 /* bitmap runs in to data */ 322 /* bitmap runs in to data */
@@ -316,7 +325,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
316 /* DATA METADATA BITMAP - no problems */ 325 /* DATA METADATA BITMAP - no problems */
317 } 326 }
318 md_super_write(mddev, rdev, 327 md_super_write(mddev, rdev,
319 rdev->sb_start + bitmap->offset 328 rdev->sb_start + offset
320 + page->index * (PAGE_SIZE/512), 329 + page->index * (PAGE_SIZE/512),
321 size, 330 size,
322 page); 331 page);
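The branches above cover the three possible on-disk orderings; only DATA METADATA BITMAP needs no check. A user-space restatement of the same sector arithmetic, with an invented example geometry (all units are 512-byte sectors):

    #include <stdio.h>

    int main(void)
    {
            long long sb_start = 8;          /* superblock position */
            long long data_offset = 2048;    /* start of array data */
            long long dev_sectors = 1 << 20; /* data size per device */
            long long offset = 16;           /* bitmap offset from superblock */
            long long bm_len = 64;           /* bitmap sectors being written */

            if (offset < 0) {
                    /* DATA BITMAP METADATA: bitmap sits just before the sb. */
                    if (offset + bm_len > 0)
                            puts("bad alignment: bitmap runs into metadata");
                    if (data_offset + dev_sectors > sb_start + offset)
                            puts("bad alignment: data runs into bitmap");
            } else if (sb_start < data_offset) {
                    /* METADATA BITMAP DATA: bitmap must end before the data. */
                    if (sb_start + offset + bm_len > data_offset)
                            puts("bad alignment: bitmap runs into data");
            }
            /* DATA METADATA BITMAP: nothing can collide. */
            return 0;
    }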
@@ -488,6 +497,8 @@ void bitmap_update_sb(struct bitmap *bitmap)
488 497
489 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ 498 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
490 return; 499 return;
500 if (bitmap->mddev->bitmap_info.external)
501 return;
491 spin_lock_irqsave(&bitmap->lock, flags); 502 spin_lock_irqsave(&bitmap->lock, flags);
492 if (!bitmap->sb_page) { /* no superblock */ 503 if (!bitmap->sb_page) { /* no superblock */
493 spin_unlock_irqrestore(&bitmap->lock, flags); 504 spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -501,6 +512,9 @@ void bitmap_update_sb(struct bitmap *bitmap)
501 bitmap->events_cleared = bitmap->mddev->events; 512 bitmap->events_cleared = bitmap->mddev->events;
502 sb->events_cleared = cpu_to_le64(bitmap->events_cleared); 513 sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
503 } 514 }
515 /* Just in case these have been changed via sysfs: */
516 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
517 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
504 kunmap_atomic(sb, KM_USER0); 518 kunmap_atomic(sb, KM_USER0);
505 write_page(bitmap, bitmap->sb_page, 1); 519 write_page(bitmap, bitmap->sb_page, 1);
506} 520}
@@ -550,7 +564,8 @@ static int bitmap_read_sb(struct bitmap *bitmap)
550 564
551 bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); 565 bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes);
552 } else { 566 } else {
553 bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 567 bitmap->sb_page = read_sb_page(bitmap->mddev,
568 bitmap->mddev->bitmap_info.offset,
554 NULL, 569 NULL,
555 0, sizeof(bitmap_super_t)); 570 0, sizeof(bitmap_super_t));
556 } 571 }
@@ -563,7 +578,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
563 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 578 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
564 579
565 chunksize = le32_to_cpu(sb->chunksize); 580 chunksize = le32_to_cpu(sb->chunksize);
566 daemon_sleep = le32_to_cpu(sb->daemon_sleep); 581 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
567 write_behind = le32_to_cpu(sb->write_behind); 582 write_behind = le32_to_cpu(sb->write_behind);
568 583
569 /* verify that the bitmap-specific fields are valid */ 584 /* verify that the bitmap-specific fields are valid */
@@ -576,7 +591,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
576 reason = "bitmap chunksize too small"; 591 reason = "bitmap chunksize too small";
577 else if ((1 << ffz(~chunksize)) != chunksize) 592 else if ((1 << ffz(~chunksize)) != chunksize)
578 reason = "bitmap chunksize not a power of 2"; 593 reason = "bitmap chunksize not a power of 2";
579 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) 594 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
580 reason = "daemon sleep period out of range"; 595 reason = "daemon sleep period out of range";
581 else if (write_behind > COUNTER_MAX) 596 else if (write_behind > COUNTER_MAX)
582 reason = "write-behind limit out of range (0 - 16383)"; 597 reason = "write-behind limit out of range (0 - 16383)";
@@ -610,10 +625,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
610 } 625 }
611success: 626success:
612 /* assign fields using values from superblock */ 627 /* assign fields using values from superblock */
613 bitmap->chunksize = chunksize; 628 bitmap->mddev->bitmap_info.chunksize = chunksize;
614 bitmap->daemon_sleep = daemon_sleep; 629 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
615 bitmap->daemon_lastrun = jiffies; 630 bitmap->mddev->bitmap_info.max_write_behind = write_behind;
616 bitmap->max_write_behind = write_behind;
617 bitmap->flags |= le32_to_cpu(sb->state); 631 bitmap->flags |= le32_to_cpu(sb->state);
618 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) 632 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
619 bitmap->flags |= BITMAP_HOSTENDIAN; 633 bitmap->flags |= BITMAP_HOSTENDIAN;
@@ -664,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
664 * general bitmap file operations 678 * general bitmap file operations
665 */ 679 */
666 680
681/*
682 * on-disk bitmap:
683 *
684 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
685 * file a page at a time. There's a superblock at the start of the file.
686 */
667/* calculate the index of the page that contains this bit */ 687/* calculate the index of the page that contains this bit */
668static inline unsigned long file_page_index(unsigned long chunk) 688static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
669{ 689{
670 return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; 690 if (!bitmap->mddev->bitmap_info.external)
691 chunk += sizeof(bitmap_super_t) << 3;
692 return chunk >> PAGE_BIT_SHIFT;
671} 693}
672 694
673/* calculate the (bit) offset of this bit within a page */ 695/* calculate the (bit) offset of this bit within a page */
674static inline unsigned long file_page_offset(unsigned long chunk) 696static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
675{ 697{
676 return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); 698 if (!bitmap->mddev->bitmap_info.external)
699 chunk += sizeof(bitmap_super_t) << 3;
700 return chunk & (PAGE_BITS - 1);
677} 701}
678 702
679/* 703/*
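With internal metadata the 256-byte bitmap superblock occupies the start of the mapped space, so every chunk's bit position is shifted by sizeof(bitmap_super_t) << 3 = 2048 bits; external bitmaps start at bit 0. The same arithmetic in a stand-alone sketch (PAGE_SIZE assumed to be 4096):

    #include <stdio.h>

    #define PAGE_BITS (4096 * 8)  /* bits per page, assuming 4 KiB pages */
    #define SB_BITS   (256 * 8)   /* sizeof(bitmap_super_t) << 3 */

    static unsigned long page_index(unsigned long chunk, int external)
    {
            if (!external)
                    chunk += SB_BITS; /* skip the embedded superblock */
            return chunk / PAGE_BITS;
    }

    static unsigned long page_offset(unsigned long chunk, int external)
    {
            if (!external)
                    chunk += SB_BITS;
            return chunk % PAGE_BITS;
    }

    int main(void)
    {
            /* Internal: chunk 0 lands 2048 bits into page 0, just past the sb. */
            printf("internal chunk 0 -> page %lu, bit %lu\n",
                   page_index(0, 0), page_offset(0, 0));
            /* External: no embedded superblock, chunk 0 is bit 0 of page 0. */
            printf("external chunk 0 -> page %lu, bit %lu\n",
                   page_index(0, 1), page_offset(0, 1));
            return 0;
    }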
@@ -686,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk)
686static inline struct page *filemap_get_page(struct bitmap *bitmap, 710static inline struct page *filemap_get_page(struct bitmap *bitmap,
687 unsigned long chunk) 711 unsigned long chunk)
688{ 712{
689 if (file_page_index(chunk) >= bitmap->file_pages) return NULL; 713 if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL;
690 return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; 714 return bitmap->filemap[file_page_index(bitmap, chunk)
715 - file_page_index(bitmap, 0)];
691} 716}
692 717
693 718
@@ -710,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
710 spin_unlock_irqrestore(&bitmap->lock, flags); 735 spin_unlock_irqrestore(&bitmap->lock, flags);
711 736
712 while (pages--) 737 while (pages--)
713 if (map[pages]->index != 0) /* 0 is sb_page, release it below */ 738 if (map[pages] != sb_page) /* 0 is sb_page, release it below */
714 free_buffers(map[pages]); 739 free_buffers(map[pages]);
715 kfree(map); 740 kfree(map);
716 kfree(attr); 741 kfree(attr);
@@ -821,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
821 846
822 page = filemap_get_page(bitmap, chunk); 847 page = filemap_get_page(bitmap, chunk);
823 if (!page) return; 848 if (!page) return;
824 bit = file_page_offset(chunk); 849 bit = file_page_offset(bitmap, chunk);
825 850
826 /* set the bit */ 851 /* set the bit */
827 kaddr = kmap_atomic(page, KM_USER0); 852 kaddr = kmap_atomic(page, KM_USER0);
@@ -907,7 +932,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
907 chunks = bitmap->chunks; 932 chunks = bitmap->chunks;
908 file = bitmap->file; 933 file = bitmap->file;
909 934
910 BUG_ON(!file && !bitmap->offset); 935 BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
911 936
912#ifdef INJECT_FAULTS_3 937#ifdef INJECT_FAULTS_3
913 outofdate = 1; 938 outofdate = 1;
@@ -919,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
919 "recovery\n", bmname(bitmap)); 944 "recovery\n", bmname(bitmap));
920 945
921 bytes = (chunks + 7) / 8; 946 bytes = (chunks + 7) / 8;
947 if (!bitmap->mddev->bitmap_info.external)
948 bytes += sizeof(bitmap_super_t);
922 949
923 num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; 950
951 num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
924 952
925 if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { 953 if (file && i_size_read(file->f_mapping->host) < bytes) {
926 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 954 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
927 bmname(bitmap), 955 bmname(bitmap),
928 (unsigned long) i_size_read(file->f_mapping->host), 956 (unsigned long) i_size_read(file->f_mapping->host),
929 bytes + sizeof(bitmap_super_t)); 957 bytes);
930 goto err; 958 goto err;
931 } 959 }
932 960
@@ -947,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
947 975
948 for (i = 0; i < chunks; i++) { 976 for (i = 0; i < chunks; i++) {
949 int b; 977 int b;
950 index = file_page_index(i); 978 index = file_page_index(bitmap, i);
951 bit = file_page_offset(i); 979 bit = file_page_offset(bitmap, i);
952 if (index != oldindex) { /* this is a new page, read it in */ 980 if (index != oldindex) { /* this is a new page, read it in */
953 int count; 981 int count;
954 /* unmap the old page, we're done with it */ 982 /* unmap the old page, we're done with it */
955 if (index == num_pages-1) 983 if (index == num_pages-1)
956 count = bytes + sizeof(bitmap_super_t) 984 count = bytes - index * PAGE_SIZE;
957 - index * PAGE_SIZE;
958 else 985 else
959 count = PAGE_SIZE; 986 count = PAGE_SIZE;
960 if (index == 0) { 987 if (index == 0 && bitmap->sb_page) {
961 /* 988 /*
962 * if we're here then the superblock page 989 * if we're here then the superblock page
963 * contains some bits (PAGE_SIZE != sizeof sb) 990 * contains some bits (PAGE_SIZE != sizeof sb)
@@ -967,14 +994,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
967 offset = sizeof(bitmap_super_t); 994 offset = sizeof(bitmap_super_t);
968 if (!file) 995 if (!file)
969 read_sb_page(bitmap->mddev, 996 read_sb_page(bitmap->mddev,
970 bitmap->offset, 997 bitmap->mddev->bitmap_info.offset,
971 page, 998 page,
972 index, count); 999 index, count);
973 } else if (file) { 1000 } else if (file) {
974 page = read_page(file, index, bitmap, count); 1001 page = read_page(file, index, bitmap, count);
975 offset = 0; 1002 offset = 0;
976 } else { 1003 } else {
977 page = read_sb_page(bitmap->mddev, bitmap->offset, 1004 page = read_sb_page(bitmap->mddev,
1005 bitmap->mddev->bitmap_info.offset,
978 NULL, 1006 NULL,
979 index, count); 1007 index, count);
980 offset = 0; 1008 offset = 0;
@@ -1078,23 +1106,32 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1078 * out to disk 1106 * out to disk
1079 */ 1107 */
1080 1108
1081void bitmap_daemon_work(struct bitmap *bitmap) 1109void bitmap_daemon_work(mddev_t *mddev)
1082{ 1110{
1111 struct bitmap *bitmap;
1083 unsigned long j; 1112 unsigned long j;
1084 unsigned long flags; 1113 unsigned long flags;
1085 struct page *page = NULL, *lastpage = NULL; 1114 struct page *page = NULL, *lastpage = NULL;
1086 int blocks; 1115 int blocks;
1087 void *paddr; 1116 void *paddr;
1088 1117
1089 if (bitmap == NULL) 1118 /* Use a mutex to guard daemon_work against
1119 * bitmap_destroy.
1120 */
1121 mutex_lock(&mddev->bitmap_info.mutex);
1122 bitmap = mddev->bitmap;
1123 if (bitmap == NULL) {
1124 mutex_unlock(&mddev->bitmap_info.mutex);
1090 return; 1125 return;
1091 if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) 1126 }
1127 if (time_before(jiffies, bitmap->daemon_lastrun
1128 + bitmap->mddev->bitmap_info.daemon_sleep))
1092 goto done; 1129 goto done;
1093 1130
1094 bitmap->daemon_lastrun = jiffies; 1131 bitmap->daemon_lastrun = jiffies;
1095 if (bitmap->allclean) { 1132 if (bitmap->allclean) {
1096 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1133 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1097 return; 1134 goto done;
1098 } 1135 }
1099 bitmap->allclean = 1; 1136 bitmap->allclean = 1;
1100 1137
@@ -1142,7 +1179,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1142 /* We are possibly going to clear some bits, so make 1179 /* We are possibly going to clear some bits, so make
1143 * sure that events_cleared is up-to-date. 1180 * sure that events_cleared is up-to-date.
1144 */ 1181 */
1145 if (bitmap->need_sync) { 1182 if (bitmap->need_sync &&
1183 bitmap->mddev->bitmap_info.external == 0) {
1146 bitmap_super_t *sb; 1184 bitmap_super_t *sb;
1147 bitmap->need_sync = 0; 1185 bitmap->need_sync = 0;
1148 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 1186 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1152,7 +1190,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1152 write_page(bitmap, bitmap->sb_page, 1); 1190 write_page(bitmap, bitmap->sb_page, 1);
1153 } 1191 }
1154 spin_lock_irqsave(&bitmap->lock, flags); 1192 spin_lock_irqsave(&bitmap->lock, flags);
1155 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1193 if (!bitmap->need_sync)
1194 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1156 } 1195 }
1157 bmc = bitmap_get_counter(bitmap, 1196 bmc = bitmap_get_counter(bitmap,
1158 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1197 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
@@ -1167,7 +1206,7 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1167 if (*bmc == 2) { 1206 if (*bmc == 2) {
1168 *bmc=1; /* maybe clear the bit next time */ 1207 *bmc=1; /* maybe clear the bit next time */
1169 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1208 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1170 } else if (*bmc == 1) { 1209 } else if (*bmc == 1 && !bitmap->need_sync) {
1171 /* we can clear the bit */ 1210 /* we can clear the bit */
1172 *bmc = 0; 1211 *bmc = 0;
1173 bitmap_count_page(bitmap, 1212 bitmap_count_page(bitmap,
@@ -1177,9 +1216,11 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1177 /* clear the bit */ 1216 /* clear the bit */
1178 paddr = kmap_atomic(page, KM_USER0); 1217 paddr = kmap_atomic(page, KM_USER0);
1179 if (bitmap->flags & BITMAP_HOSTENDIAN) 1218 if (bitmap->flags & BITMAP_HOSTENDIAN)
1180 clear_bit(file_page_offset(j), paddr); 1219 clear_bit(file_page_offset(bitmap, j),
1220 paddr);
1181 else 1221 else
1182 ext2_clear_bit(file_page_offset(j), paddr); 1222 ext2_clear_bit(file_page_offset(bitmap, j),
1223 paddr);
1183 kunmap_atomic(paddr, KM_USER0); 1224 kunmap_atomic(paddr, KM_USER0);
1184 } 1225 }
1185 } else 1226 } else
@@ -1202,7 +1243,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1202 1243
1203 done: 1244 done:
1204 if (bitmap->allclean == 0) 1245 if (bitmap->allclean == 0)
1205 bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; 1246 bitmap->mddev->thread->timeout =
1247 bitmap->mddev->bitmap_info.daemon_sleep;
1248 mutex_unlock(&mddev->bitmap_info.mutex);
1206} 1249}
1207 1250
1208static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1251static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1332,6 +1375,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1332 bitmap->events_cleared < bitmap->mddev->events) { 1375 bitmap->events_cleared < bitmap->mddev->events) {
1333 bitmap->events_cleared = bitmap->mddev->events; 1376 bitmap->events_cleared = bitmap->mddev->events;
1334 bitmap->need_sync = 1; 1377 bitmap->need_sync = 1;
1378 sysfs_notify_dirent(bitmap->sysfs_can_clear);
1335 } 1379 }
1336 1380
1337 if (!success && ! (*bmc & NEEDED_MASK)) 1381 if (!success && ! (*bmc & NEEDED_MASK))
@@ -1470,7 +1514,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1470 return; 1514 return;
1471 } 1515 }
1472 if (time_before(jiffies, (bitmap->last_end_sync 1516 if (time_before(jiffies, (bitmap->last_end_sync
1473 + bitmap->daemon_sleep * HZ))) 1517 + bitmap->mddev->bitmap_info.daemon_sleep)))
1474 return; 1518 return;
1475 wait_event(bitmap->mddev->recovery_wait, 1519 wait_event(bitmap->mddev->recovery_wait,
1476 atomic_read(&bitmap->mddev->recovery_active) == 0); 1520 atomic_read(&bitmap->mddev->recovery_active) == 0);
@@ -1522,6 +1566,12 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1522 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); 1566 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
1523 bitmap_set_memory_bits(bitmap, sec, 1); 1567 bitmap_set_memory_bits(bitmap, sec, 1);
1524 bitmap_file_set_bit(bitmap, sec); 1568 bitmap_file_set_bit(bitmap, sec);
1569 if (sec < bitmap->mddev->recovery_cp)
1570 /* We are asserting that the array is dirty,
1571 * so move the recovery_cp address back so
1572 * that it is obvious that it is dirty
1573 */
1574 bitmap->mddev->recovery_cp = sec;
1525 } 1575 }
1526} 1576}
1527 1577
@@ -1531,7 +1581,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1531void bitmap_flush(mddev_t *mddev) 1581void bitmap_flush(mddev_t *mddev)
1532{ 1582{
1533 struct bitmap *bitmap = mddev->bitmap; 1583 struct bitmap *bitmap = mddev->bitmap;
1534 int sleep; 1584 long sleep;
1535 1585
1536 if (!bitmap) /* there was no bitmap */ 1586 if (!bitmap) /* there was no bitmap */
1537 return; 1587 return;
@@ -1539,12 +1589,13 @@ void bitmap_flush(mddev_t *mddev)
 1539 /* run the daemon_work three times to ensure everything is flushed 1589 /* run the daemon_work three times to ensure everything is flushed
1540 * that can be 1590 * that can be
1541 */ 1591 */
1542 sleep = bitmap->daemon_sleep; 1592 sleep = mddev->bitmap_info.daemon_sleep * 2;
1543 bitmap->daemon_sleep = 0; 1593 bitmap->daemon_lastrun -= sleep;
1544 bitmap_daemon_work(bitmap); 1594 bitmap_daemon_work(mddev);
1545 bitmap_daemon_work(bitmap); 1595 bitmap->daemon_lastrun -= sleep;
1546 bitmap_daemon_work(bitmap); 1596 bitmap_daemon_work(mddev);
1547 bitmap->daemon_sleep = sleep; 1597 bitmap->daemon_lastrun -= sleep;
1598 bitmap_daemon_work(mddev);
1548 bitmap_update_sb(bitmap); 1599 bitmap_update_sb(bitmap);
1549} 1600}
1550 1601
@@ -1574,6 +1625,7 @@ static void bitmap_free(struct bitmap *bitmap)
1574 kfree(bp); 1625 kfree(bp);
1575 kfree(bitmap); 1626 kfree(bitmap);
1576} 1627}
1628
1577void bitmap_destroy(mddev_t *mddev) 1629void bitmap_destroy(mddev_t *mddev)
1578{ 1630{
1579 struct bitmap *bitmap = mddev->bitmap; 1631 struct bitmap *bitmap = mddev->bitmap;
@@ -1581,10 +1633,15 @@ void bitmap_destroy(mddev_t *mddev)
1581 if (!bitmap) /* there was no bitmap */ 1633 if (!bitmap) /* there was no bitmap */
1582 return; 1634 return;
1583 1635
1636 mutex_lock(&mddev->bitmap_info.mutex);
1584 mddev->bitmap = NULL; /* disconnect from the md device */ 1637 mddev->bitmap = NULL; /* disconnect from the md device */
1638 mutex_unlock(&mddev->bitmap_info.mutex);
1585 if (mddev->thread) 1639 if (mddev->thread)
1586 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1640 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1587 1641
1642 if (bitmap->sysfs_can_clear)
1643 sysfs_put(bitmap->sysfs_can_clear);
1644
1588 bitmap_free(bitmap); 1645 bitmap_free(bitmap);
1589} 1646}
1590 1647
@@ -1598,16 +1655,17 @@ int bitmap_create(mddev_t *mddev)
1598 sector_t blocks = mddev->resync_max_sectors; 1655 sector_t blocks = mddev->resync_max_sectors;
1599 unsigned long chunks; 1656 unsigned long chunks;
1600 unsigned long pages; 1657 unsigned long pages;
1601 struct file *file = mddev->bitmap_file; 1658 struct file *file = mddev->bitmap_info.file;
1602 int err; 1659 int err;
1603 sector_t start; 1660 sector_t start;
1661 struct sysfs_dirent *bm;
1604 1662
1605 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1663 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1606 1664
1607 if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ 1665 if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1608 return 0; 1666 return 0;
1609 1667
1610 BUG_ON(file && mddev->bitmap_offset); 1668 BUG_ON(file && mddev->bitmap_info.offset);
1611 1669
1612 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1670 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1613 if (!bitmap) 1671 if (!bitmap)
@@ -1620,8 +1678,14 @@ int bitmap_create(mddev_t *mddev)
1620 1678
1621 bitmap->mddev = mddev; 1679 bitmap->mddev = mddev;
1622 1680
1681 bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
1682 if (bm) {
1683 bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
1684 sysfs_put(bm);
1685 } else
1686 bitmap->sysfs_can_clear = NULL;
1687
1623 bitmap->file = file; 1688 bitmap->file = file;
1624 bitmap->offset = mddev->bitmap_offset;
1625 if (file) { 1689 if (file) {
1626 get_file(file); 1690 get_file(file);
1627 /* As future accesses to this file will use bmap, 1691 /* As future accesses to this file will use bmap,
@@ -1630,12 +1694,22 @@ int bitmap_create(mddev_t *mddev)
1630 */ 1694 */
1631 vfs_fsync(file, file->f_dentry, 1); 1695 vfs_fsync(file, file->f_dentry, 1);
1632 } 1696 }
1633 /* read superblock from bitmap file (this sets bitmap->chunksize) */ 1697 /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
1634 err = bitmap_read_sb(bitmap); 1698 if (!mddev->bitmap_info.external)
1699 err = bitmap_read_sb(bitmap);
1700 else {
1701 err = 0;
1702 if (mddev->bitmap_info.chunksize == 0 ||
1703 mddev->bitmap_info.daemon_sleep == 0)
1704 /* chunksize and time_base need to be
1705 * set first. */
1706 err = -EINVAL;
1707 }
1635 if (err) 1708 if (err)
1636 goto error; 1709 goto error;
1637 1710
1638 bitmap->chunkshift = ffz(~bitmap->chunksize); 1711 bitmap->daemon_lastrun = jiffies;
1712 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
1639 1713
1640 /* now that chunksize and chunkshift are set, we can use these macros */ 1714 /* now that chunksize and chunkshift are set, we can use these macros */
1641 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1715 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
@@ -1677,7 +1751,8 @@ int bitmap_create(mddev_t *mddev)
1677 1751
1678 mddev->bitmap = bitmap; 1752 mddev->bitmap = bitmap;
1679 1753
1680 mddev->thread->timeout = bitmap->daemon_sleep * HZ; 1754 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
1755 md_wakeup_thread(mddev->thread);
1681 1756
1682 bitmap_update_sb(bitmap); 1757 bitmap_update_sb(bitmap);
1683 1758
@@ -1688,6 +1763,264 @@ int bitmap_create(mddev_t *mddev)
1688 return err; 1763 return err;
1689} 1764}
1690 1765
1766static ssize_t
1767location_show(mddev_t *mddev, char *page)
1768{
1769 ssize_t len;
1770 if (mddev->bitmap_info.file) {
1771 len = sprintf(page, "file");
1772 } else if (mddev->bitmap_info.offset) {
1773 len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
1774 } else
1775 len = sprintf(page, "none");
1776 len += sprintf(page+len, "\n");
1777 return len;
1778}
1779
1780static ssize_t
1781location_store(mddev_t *mddev, const char *buf, size_t len)
1782{
1783
1784 if (mddev->pers) {
1785 if (!mddev->pers->quiesce)
1786 return -EBUSY;
1787 if (mddev->recovery || mddev->sync_thread)
1788 return -EBUSY;
1789 }
1790
1791 if (mddev->bitmap || mddev->bitmap_info.file ||
1792 mddev->bitmap_info.offset) {
1793 /* bitmap already configured. Only option is to clear it */
1794 if (strncmp(buf, "none", 4) != 0)
1795 return -EBUSY;
1796 if (mddev->pers) {
1797 mddev->pers->quiesce(mddev, 1);
1798 bitmap_destroy(mddev);
1799 mddev->pers->quiesce(mddev, 0);
1800 }
1801 mddev->bitmap_info.offset = 0;
1802 if (mddev->bitmap_info.file) {
1803 struct file *f = mddev->bitmap_info.file;
1804 mddev->bitmap_info.file = NULL;
1805 restore_bitmap_write_access(f);
1806 fput(f);
1807 }
1808 } else {
1809 /* No bitmap, OK to set a location */
1810 long long offset;
1811 if (strncmp(buf, "none", 4) == 0)
1812 /* nothing to be done */;
1813 else if (strncmp(buf, "file:", 5) == 0) {
1814 /* Not supported yet */
1815 return -EINVAL;
1816 } else {
1817 int rv;
1818 if (buf[0] == '+')
1819 rv = strict_strtoll(buf+1, 10, &offset);
1820 else
1821 rv = strict_strtoll(buf, 10, &offset);
1822 if (rv)
1823 return rv;
1824 if (offset == 0)
1825 return -EINVAL;
1826 if (mddev->bitmap_info.external == 0 &&
1827 mddev->major_version == 0 &&
1828 offset != mddev->bitmap_info.default_offset)
1829 return -EINVAL;
1830 mddev->bitmap_info.offset = offset;
1831 if (mddev->pers) {
1832 mddev->pers->quiesce(mddev, 1);
1833 rv = bitmap_create(mddev);
1834 if (rv) {
1835 bitmap_destroy(mddev);
1836 mddev->bitmap_info.offset = 0;
1837 }
1838 mddev->pers->quiesce(mddev, 0);
1839 if (rv)
1840 return rv;
1841 }
1842 }
1843 }
1844 if (!mddev->external) {
1845 /* Ensure new bitmap info is stored in
1846 * metadata promptly.
1847 */
1848 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1849 md_wakeup_thread(mddev->thread);
1850 }
1851 return len;
1852}
1853
1854static struct md_sysfs_entry bitmap_location =
1855__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
1856
1857static ssize_t
1858timeout_show(mddev_t *mddev, char *page)
1859{
1860 ssize_t len;
1861 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
1862 unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
1863
1864 len = sprintf(page, "%lu", secs);
1865 if (jifs)
1866 len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
1867 len += sprintf(page+len, "\n");
1868 return len;
1869}
1870
1871static ssize_t
1872timeout_store(mddev_t *mddev, const char *buf, size_t len)
1873{
1874 /* timeout can be set at any time */
1875 unsigned long timeout;
1876 int rv = strict_strtoul_scaled(buf, &timeout, 4);
1877 if (rv)
1878 return rv;
1879
1880 /* just to make sure we don't overflow... */
1881 if (timeout >= LONG_MAX / HZ)
1882 return -EINVAL;
1883
1884 timeout = timeout * HZ / 10000;
1885
1886 if (timeout >= MAX_SCHEDULE_TIMEOUT)
1887 timeout = MAX_SCHEDULE_TIMEOUT-1;
1888 if (timeout < 1)
1889 timeout = 1;
1890 mddev->bitmap_info.daemon_sleep = timeout;
1891 if (mddev->thread) {
1892 /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then
1893 * the bitmap is all clean and we don't need to
1894 * adjust the timeout right now
1895 */
1896 if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) {
1897 mddev->thread->timeout = timeout;
1898 md_wakeup_thread(mddev->thread);
1899 }
1900 }
1901 return len;
1902}
1903
1904static struct md_sysfs_entry bitmap_timeout =
1905__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
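The scale argument of 4 means the written string is parsed in units of 10^-4 seconds, so fractional timeouts survive the round trip to jiffies. A worked sketch of the conversion, assuming HZ=250:

    #include <stdio.h>

    #define HZ 250 /* assumed; depends on the kernel configuration */

    int main(void)
    {
            /* Writing "5.7" to time_base parses, at scale 4, to 57000. */
            unsigned long timeout = 57000;

            timeout = timeout * HZ / 10000; /* 57000 * 250 / 10000 = 1425 */
            printf("daemon_sleep = %lu jiffies (= 5.7 s at HZ=250)\n", timeout);
            return 0;
    }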
1906
1907static ssize_t
1908backlog_show(mddev_t *mddev, char *page)
1909{
1910 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
1911}
1912
1913static ssize_t
1914backlog_store(mddev_t *mddev, const char *buf, size_t len)
1915{
1916 unsigned long backlog;
1917 int rv = strict_strtoul(buf, 10, &backlog);
1918 if (rv)
1919 return rv;
1920 if (backlog > COUNTER_MAX)
1921 return -EINVAL;
1922 mddev->bitmap_info.max_write_behind = backlog;
1923 return len;
1924}
1925
1926static struct md_sysfs_entry bitmap_backlog =
1927__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
1928
1929static ssize_t
1930chunksize_show(mddev_t *mddev, char *page)
1931{
1932 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
1933}
1934
1935static ssize_t
1936chunksize_store(mddev_t *mddev, const char *buf, size_t len)
1937{
1938 /* Can only be changed when no bitmap is active */
1939 int rv;
1940 unsigned long csize;
1941 if (mddev->bitmap)
1942 return -EBUSY;
1943 rv = strict_strtoul(buf, 10, &csize);
1944 if (rv)
1945 return rv;
1946 if (csize < 512 ||
1947 !is_power_of_2(csize))
1948 return -EINVAL;
1949 mddev->bitmap_info.chunksize = csize;
1950 return len;
1951}
1952
1953static struct md_sysfs_entry bitmap_chunksize =
1954__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
1955
1956static ssize_t metadata_show(mddev_t *mddev, char *page)
1957{
1958 return sprintf(page, "%s\n", (mddev->bitmap_info.external
1959 ? "external" : "internal"));
1960}
1961
1962static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
1963{
1964 if (mddev->bitmap ||
1965 mddev->bitmap_info.file ||
1966 mddev->bitmap_info.offset)
1967 return -EBUSY;
1968 if (strncmp(buf, "external", 8) == 0)
1969 mddev->bitmap_info.external = 1;
1970 else if (strncmp(buf, "internal", 8) == 0)
1971 mddev->bitmap_info.external = 0;
1972 else
1973 return -EINVAL;
1974 return len;
1975}
1976
1977static struct md_sysfs_entry bitmap_metadata =
1978__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
1979
1980static ssize_t can_clear_show(mddev_t *mddev, char *page)
1981{
1982 int len;
1983 if (mddev->bitmap)
1984 len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
1985 "false" : "true"));
1986 else
1987 len = sprintf(page, "\n");
1988 return len;
1989}
1990
1991static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
1992{
1993 if (mddev->bitmap == NULL)
1994 return -ENOENT;
1995 if (strncmp(buf, "false", 5) == 0)
1996 mddev->bitmap->need_sync = 1;
1997 else if (strncmp(buf, "true", 4) == 0) {
1998 if (mddev->degraded)
1999 return -EBUSY;
2000 mddev->bitmap->need_sync = 0;
2001 } else
2002 return -EINVAL;
2003 return len;
2004}
2005
2006static struct md_sysfs_entry bitmap_can_clear =
2007__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2008
2009static struct attribute *md_bitmap_attrs[] = {
2010 &bitmap_location.attr,
2011 &bitmap_timeout.attr,
2012 &bitmap_backlog.attr,
2013 &bitmap_chunksize.attr,
2014 &bitmap_metadata.attr,
2015 &bitmap_can_clear.attr,
2016 NULL
2017};
2018struct attribute_group md_bitmap_group = {
2019 .name = "bitmap",
2020 .attrs = md_bitmap_attrs,
2021};
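can_clear is the kernel half of the handshake described in the md.txt hunk above: need_sync blocks bit-clearing until user space acknowledges. A hedged sketch of the acknowledging side (path illustrative):

    #include <stdio.h>

    int main(void)
    {
            /* After recording "array is non-degraded" in external metadata,
             * let the kernel resume clearing in-sync bits. The store fails
             * with EBUSY if the array is still degraded. */
            FILE *f = fopen("/sys/block/md0/md/bitmap/can_clear", "w");

            if (!f)
                    return 1;
            fputs("true\n", f);
            fclose(f);
            return 0;
    }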
2022
2023
1691/* the bitmap API -- for raid personalities */ 2024/* the bitmap API -- for raid personalities */
1692EXPORT_SYMBOL(bitmap_startwrite); 2025EXPORT_SYMBOL(bitmap_startwrite);
1693EXPORT_SYMBOL(bitmap_endwrite); 2026EXPORT_SYMBOL(bitmap_endwrite);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e98900671ca9..cb821d76d1b4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t;
106#define BITMAP_BLOCK_SHIFT 9 106#define BITMAP_BLOCK_SHIFT 9
107 107
108/* how many blocks per chunk? (this is variable) */ 108/* how many blocks per chunk? (this is variable) */
109#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) 109#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) 110#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
111#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) 111#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
112 112
@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t;
118 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) 118 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
119#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) 119#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
120 120
121/*
122 * on-disk bitmap:
123 *
124 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
125 * file a page at a time. There's a superblock at the start of the file.
126 */
127
128/* map chunks (bits) to file pages - offset by the size of the superblock */
129#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
130
131#endif 121#endif
132 122
133/* 123/*
@@ -209,7 +199,6 @@ struct bitmap {
209 int counter_bits; /* how many bits per block counter */ 199 int counter_bits; /* how many bits per block counter */
210 200
211 /* bitmap chunksize -- how much data does each bit represent? */ 201 /* bitmap chunksize -- how much data does each bit represent? */
212 unsigned long chunksize;
213 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ 202 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
214 unsigned long chunks; /* total number of data chunks for the array */ 203 unsigned long chunks; /* total number of data chunks for the array */
215 204
@@ -226,7 +215,6 @@ struct bitmap {
226 /* bitmap spinlock */ 215 /* bitmap spinlock */
227 spinlock_t lock; 216 spinlock_t lock;
228 217
229 long offset; /* offset from superblock if file is NULL */
230 struct file *file; /* backing disk file */ 218 struct file *file; /* backing disk file */
231 struct page *sb_page; /* cached copy of the bitmap file superblock */ 219 struct page *sb_page; /* cached copy of the bitmap file superblock */
232 struct page **filemap; /* list of cache pages for the file */ 220 struct page **filemap; /* list of cache pages for the file */
@@ -238,7 +226,6 @@ struct bitmap {
238 226
239 int allclean; 227 int allclean;
240 228
241 unsigned long max_write_behind; /* write-behind mode */
242 atomic_t behind_writes; 229 atomic_t behind_writes;
243 230
244 /* 231 /*
@@ -246,7 +233,6 @@ struct bitmap {
246 * file, cleaning up bits and flushing out pages to disk as necessary 233 * file, cleaning up bits and flushing out pages to disk as necessary
247 */ 234 */
248 unsigned long daemon_lastrun; /* jiffies of last run */ 235 unsigned long daemon_lastrun; /* jiffies of last run */
249 unsigned long daemon_sleep; /* how many seconds between updates? */
 250 unsigned long last_end_sync; /* when we last called end_sync to 236 unsigned long last_end_sync; /* when we last called end_sync to
251 * update bitmap with resync progress */ 237 * update bitmap with resync progress */
252 238
@@ -254,6 +240,7 @@ struct bitmap {
254 wait_queue_head_t write_wait; 240 wait_queue_head_t write_wait;
255 wait_queue_head_t overflow_wait; 241 wait_queue_head_t overflow_wait;
256 242
243 struct sysfs_dirent *sysfs_can_clear;
257}; 244};
258 245
259/* the bitmap API */ 246/* the bitmap API */
@@ -282,7 +269,7 @@ void bitmap_close_sync(struct bitmap *bitmap);
282void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); 269void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
283 270
284void bitmap_unplug(struct bitmap *bitmap); 271void bitmap_unplug(struct bitmap *bitmap);
285void bitmap_daemon_work(struct bitmap *bitmap); 272void bitmap_daemon_work(mddev_t *mddev);
286#endif 273#endif
287 274
288#endif 275#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 87d88dbb667f..713acd02ab39 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -360,6 +360,7 @@ static void raid_exit(void)
360module_init(raid_init); 360module_init(raid_init);
361module_exit(raid_exit); 361module_exit(raid_exit);
362MODULE_LICENSE("GPL"); 362MODULE_LICENSE("GPL");
363MODULE_DESCRIPTION("Fault injection personality for MD");
363MODULE_ALIAS("md-personality-10"); /* faulty */ 364MODULE_ALIAS("md-personality-10"); /* faulty */
364MODULE_ALIAS("md-faulty"); 365MODULE_ALIAS("md-faulty");
365MODULE_ALIAS("md-level--5"); 366MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1ceceb334d5e..00435bd20699 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -292,7 +292,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
292 int cpu; 292 int cpu;
293 293
294 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 294 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
295 bio_endio(bio, -EOPNOTSUPP); 295 md_barrier_request(mddev, bio);
296 return 0; 296 return 0;
297 } 297 }
298 298
@@ -383,6 +383,7 @@ static void linear_exit (void)
383module_init(linear_init); 383module_init(linear_init);
384module_exit(linear_exit); 384module_exit(linear_exit);
385MODULE_LICENSE("GPL"); 385MODULE_LICENSE("GPL");
386MODULE_DESCRIPTION("Linear device concatenation personality for MD");
386MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ 387MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
387MODULE_ALIAS("md-linear"); 388MODULE_ALIAS("md-linear");
388MODULE_ALIAS("md-level--1"); 389MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5f154ef1e4be..e1f3c1715cca 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -44,6 +44,7 @@
44#include <linux/random.h> 44#include <linux/random.h>
45#include <linux/reboot.h> 45#include <linux/reboot.h>
46#include <linux/file.h> 46#include <linux/file.h>
47#include <linux/compat.h>
47#include <linux/delay.h> 48#include <linux/delay.h>
48#include <linux/raid/md_p.h> 49#include <linux/raid/md_p.h>
49#include <linux/raid/md_u.h> 50#include <linux/raid/md_u.h>
@@ -68,6 +69,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
68#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 69#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69 70
70/* 71/*
72 * Default number of read corrections we'll attempt on an rdev
73 * before ejecting it from the array. We divide the read error
74 * count by 2 for every hour elapsed between read errors.
75 */
76#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
77/*
71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 78 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72 * is 1000 KB/sec, so the extra system load does not show up that much. 79 * is 1000 KB/sec, so the extra system load does not show up that much.
73 * Increase it if you want to have more _guaranteed_ speed. Note that 80 * Increase it if you want to have more _guaranteed_ speed. Note that
@@ -213,12 +220,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
213 return 0; 220 return 0;
214 } 221 }
215 rcu_read_lock(); 222 rcu_read_lock();
216 if (mddev->suspended) { 223 if (mddev->suspended || mddev->barrier) {
217 DEFINE_WAIT(__wait); 224 DEFINE_WAIT(__wait);
218 for (;;) { 225 for (;;) {
219 prepare_to_wait(&mddev->sb_wait, &__wait, 226 prepare_to_wait(&mddev->sb_wait, &__wait,
220 TASK_UNINTERRUPTIBLE); 227 TASK_UNINTERRUPTIBLE);
221 if (!mddev->suspended) 228 if (!mddev->suspended && !mddev->barrier)
222 break; 229 break;
223 rcu_read_unlock(); 230 rcu_read_unlock();
224 schedule(); 231 schedule();
@@ -260,10 +267,110 @@ static void mddev_resume(mddev_t *mddev)
260 267
261int mddev_congested(mddev_t *mddev, int bits) 268int mddev_congested(mddev_t *mddev, int bits)
262{ 269{
270 if (mddev->barrier)
271 return 1;
263 return mddev->suspended; 272 return mddev->suspended;
264} 273}
265EXPORT_SYMBOL(mddev_congested); 274EXPORT_SYMBOL(mddev_congested);
266 275
276/*
277 * Generic barrier handling for md
278 */
279
280#define POST_REQUEST_BARRIER ((void*)1)
281
282static void md_end_barrier(struct bio *bio, int err)
283{
284 mdk_rdev_t *rdev = bio->bi_private;
285 mddev_t *mddev = rdev->mddev;
286 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
287 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
288
289 rdev_dec_pending(rdev, mddev);
290
291 if (atomic_dec_and_test(&mddev->flush_pending)) {
292 if (mddev->barrier == POST_REQUEST_BARRIER) {
293 /* This was a post-request barrier */
294 mddev->barrier = NULL;
295 wake_up(&mddev->sb_wait);
296 } else
297 /* The pre-request barrier has finished */
298 schedule_work(&mddev->barrier_work);
299 }
300 bio_put(bio);
301}
302
303static void submit_barriers(mddev_t *mddev)
304{
305 mdk_rdev_t *rdev;
306
307 rcu_read_lock();
308 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
309 if (rdev->raid_disk >= 0 &&
310 !test_bit(Faulty, &rdev->flags)) {
311 /* Take two references, one is dropped
312 * when request finishes, one after
313 * we reclaim rcu_read_lock
314 */
315 struct bio *bi;
316 atomic_inc(&rdev->nr_pending);
317 atomic_inc(&rdev->nr_pending);
318 rcu_read_unlock();
319 bi = bio_alloc(GFP_KERNEL, 0);
320 bi->bi_end_io = md_end_barrier;
321 bi->bi_private = rdev;
322 bi->bi_bdev = rdev->bdev;
323 atomic_inc(&mddev->flush_pending);
324 submit_bio(WRITE_BARRIER, bi);
325 rcu_read_lock();
326 rdev_dec_pending(rdev, mddev);
327 }
328 rcu_read_unlock();
329}
330
331static void md_submit_barrier(struct work_struct *ws)
332{
333 mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
334 struct bio *bio = mddev->barrier;
335
336 atomic_set(&mddev->flush_pending, 1);
337
338 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
339 bio_endio(bio, -EOPNOTSUPP);
340 else if (bio->bi_size == 0)
341 /* an empty barrier - all done */
342 bio_endio(bio, 0);
343 else {
344 bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
345 if (mddev->pers->make_request(mddev->queue, bio))
346 generic_make_request(bio);
347 mddev->barrier = POST_REQUEST_BARRIER;
348 submit_barriers(mddev);
349 }
350 if (atomic_dec_and_test(&mddev->flush_pending)) {
351 mddev->barrier = NULL;
352 wake_up(&mddev->sb_wait);
353 }
354}
355
356void md_barrier_request(mddev_t *mddev, struct bio *bio)
357{
358 spin_lock_irq(&mddev->write_lock);
359 wait_event_lock_irq(mddev->sb_wait,
360 !mddev->barrier,
361 mddev->write_lock, /*nothing*/);
362 mddev->barrier = bio;
363 spin_unlock_irq(&mddev->write_lock);
364
365 atomic_set(&mddev->flush_pending, 1);
366 INIT_WORK(&mddev->barrier_work, md_submit_barrier);
367
368 submit_barriers(mddev);
369
370 if (atomic_dec_and_test(&mddev->flush_pending))
371 schedule_work(&mddev->barrier_work);
372}
373EXPORT_SYMBOL(md_barrier_request);
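Personalities that previously failed barrier bios with -EOPNOTSUPP can now delegate them to this helper, as the linear.c hunk further down does. A simplified, non-runnable sketch of that pattern (function name invented; only the barrier branch shown):

    static int example_make_request(struct request_queue *q, struct bio *bio)
    {
            mddev_t *mddev = q->queuedata;

            if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
                    /* md core runs the pre-barrier, the payload, and the
                     * post-request barrier (POST_REQUEST_BARRIER) for us. */
                    md_barrier_request(mddev, bio);
                    return 0;
            }
            /* ... normal request mapping would continue here ... */
            return 0;
    }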
267 374
268static inline mddev_t *mddev_get(mddev_t *mddev) 375static inline mddev_t *mddev_get(mddev_t *mddev)
269{ 376{
@@ -363,6 +470,7 @@ static mddev_t * mddev_find(dev_t unit)
363 470
364 mutex_init(&new->open_mutex); 471 mutex_init(&new->open_mutex);
365 mutex_init(&new->reconfig_mutex); 472 mutex_init(&new->reconfig_mutex);
473 mutex_init(&new->bitmap_info.mutex);
366 INIT_LIST_HEAD(&new->disks); 474 INIT_LIST_HEAD(&new->disks);
367 INIT_LIST_HEAD(&new->all_mddevs); 475 INIT_LIST_HEAD(&new->all_mddevs);
368 init_timer(&new->safemode_timer); 476 init_timer(&new->safemode_timer);
@@ -370,6 +478,7 @@ static mddev_t * mddev_find(dev_t unit)
370 atomic_set(&new->openers, 0); 478 atomic_set(&new->openers, 0);
371 atomic_set(&new->active_io, 0); 479 atomic_set(&new->active_io, 0);
372 spin_lock_init(&new->write_lock); 480 spin_lock_init(&new->write_lock);
481 atomic_set(&new->flush_pending, 0);
373 init_waitqueue_head(&new->sb_wait); 482 init_waitqueue_head(&new->sb_wait);
374 init_waitqueue_head(&new->recovery_wait); 483 init_waitqueue_head(&new->recovery_wait);
375 new->reshape_position = MaxSector; 484 new->reshape_position = MaxSector;
@@ -748,7 +857,7 @@ struct super_type {
748 */ 857 */
749int md_check_no_bitmap(mddev_t *mddev) 858int md_check_no_bitmap(mddev_t *mddev)
750{ 859{
751 if (!mddev->bitmap_file && !mddev->bitmap_offset) 860 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
752 return 0; 861 return 0;
753 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 862 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
754 mdname(mddev), mddev->pers->name); 863 mdname(mddev), mddev->pers->name);
@@ -876,8 +985,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
876 mddev->raid_disks = sb->raid_disks; 985 mddev->raid_disks = sb->raid_disks;
877 mddev->dev_sectors = sb->size * 2; 986 mddev->dev_sectors = sb->size * 2;
878 mddev->events = ev1; 987 mddev->events = ev1;
879 mddev->bitmap_offset = 0; 988 mddev->bitmap_info.offset = 0;
880 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 989 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
881 990
882 if (mddev->minor_version >= 91) { 991 if (mddev->minor_version >= 91) {
883 mddev->reshape_position = sb->reshape_position; 992 mddev->reshape_position = sb->reshape_position;
@@ -911,8 +1020,9 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
911 mddev->max_disks = MD_SB_DISKS; 1020 mddev->max_disks = MD_SB_DISKS;
912 1021
913 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1022 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
914 mddev->bitmap_file == NULL) 1023 mddev->bitmap_info.file == NULL)
915 mddev->bitmap_offset = mddev->default_bitmap_offset; 1024 mddev->bitmap_info.offset =
1025 mddev->bitmap_info.default_offset;
916 1026
917 } else if (mddev->pers == NULL) { 1027 } else if (mddev->pers == NULL) {
918 /* Insist on good event counter while assembling */ 1028 /* Insist on good event counter while assembling */
@@ -1029,7 +1139,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1029 sb->layout = mddev->layout; 1139 sb->layout = mddev->layout;
1030 sb->chunk_size = mddev->chunk_sectors << 9; 1140 sb->chunk_size = mddev->chunk_sectors << 9;
1031 1141
1032 if (mddev->bitmap && mddev->bitmap_file == NULL) 1142 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1033 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1143 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1034 1144
1035 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1145 sb->disks[0].state = (1<<MD_DISK_REMOVED);
@@ -1107,7 +1217,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1107{ 1217{
1108 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1218 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1109 return 0; /* component must fit device */ 1219 return 0; /* component must fit device */
1110 if (rdev->mddev->bitmap_offset) 1220 if (rdev->mddev->bitmap_info.offset)
1111 return 0; /* can't move bitmap */ 1221 return 0; /* can't move bitmap */
1112 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1222 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1113 if (!num_sectors || num_sectors > rdev->sb_start) 1223 if (!num_sectors || num_sectors > rdev->sb_start)
@@ -1286,8 +1396,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1286 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1396 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1287 mddev->dev_sectors = le64_to_cpu(sb->size); 1397 mddev->dev_sectors = le64_to_cpu(sb->size);
1288 mddev->events = ev1; 1398 mddev->events = ev1;
1289 mddev->bitmap_offset = 0; 1399 mddev->bitmap_info.offset = 0;
1290 mddev->default_bitmap_offset = 1024 >> 9; 1400 mddev->bitmap_info.default_offset = 1024 >> 9;
1291 1401
1292 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1402 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1293 memcpy(mddev->uuid, sb->set_uuid, 16); 1403 memcpy(mddev->uuid, sb->set_uuid, 16);
@@ -1295,8 +1405,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1295 mddev->max_disks = (4096-256)/2; 1405 mddev->max_disks = (4096-256)/2;
1296 1406
1297 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1407 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1298 mddev->bitmap_file == NULL ) 1408 mddev->bitmap_info.file == NULL )
1299 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1409 mddev->bitmap_info.offset =
1410 (__s32)le32_to_cpu(sb->bitmap_offset);
1300 1411
1301 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1412 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1302 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1413 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
@@ -1390,19 +1501,17 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1390 sb->level = cpu_to_le32(mddev->level); 1501 sb->level = cpu_to_le32(mddev->level);
1391 sb->layout = cpu_to_le32(mddev->layout); 1502 sb->layout = cpu_to_le32(mddev->layout);
1392 1503
1393 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1504 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1394 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1505 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1395 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1506 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1396 } 1507 }
1397 1508
1398 if (rdev->raid_disk >= 0 && 1509 if (rdev->raid_disk >= 0 &&
1399 !test_bit(In_sync, &rdev->flags)) { 1510 !test_bit(In_sync, &rdev->flags)) {
1400 if (rdev->recovery_offset > 0) { 1511 sb->feature_map |=
1401 sb->feature_map |= 1512 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1402 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1513 sb->recovery_offset =
1403 sb->recovery_offset = 1514 cpu_to_le64(rdev->recovery_offset);
1404 cpu_to_le64(rdev->recovery_offset);
1405 }
1406 } 1515 }
1407 1516
1408 if (mddev->reshape_position != MaxSector) { 1517 if (mddev->reshape_position != MaxSector) {
@@ -1436,7 +1545,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1436 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1545 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1437 else if (test_bit(In_sync, &rdev2->flags)) 1546 else if (test_bit(In_sync, &rdev2->flags))
1438 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1547 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1439 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1548 else if (rdev2->raid_disk >= 0)
1440 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1549 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1441 else 1550 else
1442 sb->dev_roles[i] = cpu_to_le16(0xffff); 1551 sb->dev_roles[i] = cpu_to_le16(0xffff);
@@ -1458,7 +1567,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1458 max_sectors -= rdev->data_offset; 1567 max_sectors -= rdev->data_offset;
1459 if (!num_sectors || num_sectors > max_sectors) 1568 if (!num_sectors || num_sectors > max_sectors)
1460 num_sectors = max_sectors; 1569 num_sectors = max_sectors;
1461 } else if (rdev->mddev->bitmap_offset) { 1570 } else if (rdev->mddev->bitmap_info.offset) {
1462 /* minor version 0 with bitmap we can't move */ 1571 /* minor version 0 with bitmap we can't move */
1463 return 0; 1572 return 0;
1464 } else { 1573 } else {
@@ -2442,12 +2551,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2442static struct rdev_sysfs_entry rdev_size = 2551static struct rdev_sysfs_entry rdev_size =
2443__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2552__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2444 2553
2554
2555static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
2556{
2557 unsigned long long recovery_start = rdev->recovery_offset;
2558
2559 if (test_bit(In_sync, &rdev->flags) ||
2560 recovery_start == MaxSector)
2561 return sprintf(page, "none\n");
2562
2563 return sprintf(page, "%llu\n", recovery_start);
2564}
2565
2566static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2567{
2568 unsigned long long recovery_start;
2569
2570 if (cmd_match(buf, "none"))
2571 recovery_start = MaxSector;
2572 else if (strict_strtoull(buf, 10, &recovery_start))
2573 return -EINVAL;
2574
2575 if (rdev->mddev->pers &&
2576 rdev->raid_disk >= 0)
2577 return -EBUSY;
2578
2579 rdev->recovery_offset = recovery_start;
2580 if (recovery_start == MaxSector)
2581 set_bit(In_sync, &rdev->flags);
2582 else
2583 clear_bit(In_sync, &rdev->flags);
2584 return len;
2585}
2586
2587static struct rdev_sysfs_entry rdev_recovery_start =
2588__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2589
2445static struct attribute *rdev_default_attrs[] = { 2590static struct attribute *rdev_default_attrs[] = {
2446 &rdev_state.attr, 2591 &rdev_state.attr,
2447 &rdev_errors.attr, 2592 &rdev_errors.attr,
2448 &rdev_slot.attr, 2593 &rdev_slot.attr,
2449 &rdev_offset.attr, 2594 &rdev_offset.attr,
2450 &rdev_size.attr, 2595 &rdev_size.attr,
2596 &rdev_recovery_start.attr,
2451 NULL, 2597 NULL,
2452}; 2598};
2453static ssize_t 2599static ssize_t
@@ -2549,6 +2695,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2549 rdev->flags = 0; 2695 rdev->flags = 0;
2550 rdev->data_offset = 0; 2696 rdev->data_offset = 0;
2551 rdev->sb_events = 0; 2697 rdev->sb_events = 0;
2698 rdev->last_read_error.tv_sec = 0;
2699 rdev->last_read_error.tv_nsec = 0;
2552 atomic_set(&rdev->nr_pending, 0); 2700 atomic_set(&rdev->nr_pending, 0);
2553 atomic_set(&rdev->read_errors, 0); 2701 atomic_set(&rdev->read_errors, 0);
2554 atomic_set(&rdev->corrected_errors, 0); 2702 atomic_set(&rdev->corrected_errors, 0);
@@ -2659,6 +2807,47 @@ static void analyze_sbs(mddev_t * mddev)
2659 } 2807 }
2660} 2808}
2661 2809
2810/* Read a fixed-point number.
2811 * Numbers in sysfs attributes should be in "standard" units where
2812 * possible, so time should be in seconds.
2813 * However, we internally use a much smaller unit such as
2814 * milliseconds or jiffies.
2815 * This function takes a decimal number with a possible fractional
2816 * component, and produces an integer which is the result of
2817 * multiplying that number by 10^'scale',
2818 * all without any floating-point arithmetic.
2819 */
2820int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
2821{
2822 unsigned long result = 0;
2823 long decimals = -1;
2824 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
2825 if (*cp == '.')
2826 decimals = 0;
2827 else if (decimals < scale) {
2828 unsigned int value;
2829 value = *cp - '0';
2830 result = result * 10 + value;
2831 if (decimals >= 0)
2832 decimals++;
2833 }
2834 cp++;
2835 }
2836 if (*cp == '\n')
2837 cp++;
2838 if (*cp)
2839 return -EINVAL;
2840 if (decimals < 0)
2841 decimals = 0;
2842 while (decimals < scale) {
2843 result *= 10;
2844 decimals++;
2845 }
2846 *res = result;
2847 return 0;
2848}
2849
2850
2662static void md_safemode_timeout(unsigned long data); 2851static void md_safemode_timeout(unsigned long data);
2663 2852
2664static ssize_t 2853static ssize_t
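Editor's note: a quick user-space check of the parser above (hypothetical parse_scaled(), mirroring strict_strtoul_scaled() line for line). safe_delay_store() in the next hunk calls it with scale=3, so writing "1.5" to safe_mode_delay yields 1500 milliseconds:

#include <ctype.h>
#include <stdio.h>

static int parse_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;	/* -1 until a '.' is seen */

	while (isdigit((unsigned char)*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			result = result * 10 + (*cp - '0');
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -1;	/* trailing junk */
	if (decimals < 0)
		decimals = 0;
	while (decimals++ < scale)	/* pad missing fractional digits */
		result *= 10;
	*res = result;
	return 0;
}

int main(void)
{
	unsigned long msec;

	if (parse_scaled("1.5\n", &msec, 3) == 0)
		printf("%lu\n", msec);	/* prints 1500 */
	return 0;
}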
@@ -2670,31 +2859,10 @@ safe_delay_show(mddev_t *mddev, char *page)
2670static ssize_t 2859static ssize_t
2671safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2860safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2672{ 2861{
2673 int scale=1;
2674 int dot=0;
2675 int i;
2676 unsigned long msec; 2862 unsigned long msec;
2677 char buf[30];
2678 2863
2679 /* remove a period, and count digits after it */ 2864 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
2680 if (len >= sizeof(buf))
2681 return -EINVAL;
2682 strlcpy(buf, cbuf, sizeof(buf));
2683 for (i=0; i<len; i++) {
2684 if (dot) {
2685 if (isdigit(buf[i])) {
2686 buf[i-1] = buf[i];
2687 scale *= 10;
2688 }
2689 buf[i] = 0;
2690 } else if (buf[i] == '.') {
2691 dot=1;
2692 buf[i] = 0;
2693 }
2694 }
2695 if (strict_strtoul(buf, 10, &msec) < 0)
2696 return -EINVAL; 2865 return -EINVAL;
2697 msec = (msec * 1000) / scale;
2698 if (msec == 0) 2866 if (msec == 0)
2699 mddev->safemode_delay = 0; 2867 mddev->safemode_delay = 0;
2700 else { 2868 else {
@@ -2970,7 +3138,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2970 3138
2971 if (mddev->pers) 3139 if (mddev->pers)
2972 return -EBUSY; 3140 return -EBUSY;
2973 if (!*buf || (*e && *e != '\n')) 3141 if (cmd_match(buf, "none"))
3142 n = MaxSector;
3143 else if (!*buf || (*e && *e != '\n'))
2974 return -EINVAL; 3144 return -EINVAL;
2975 3145
2976 mddev->recovery_cp = n; 3146 mddev->recovery_cp = n;
@@ -3166,6 +3336,29 @@ static struct md_sysfs_entry md_array_state =
3166__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3336__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3167 3337
3168static ssize_t 3338static ssize_t
3339max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3340 return sprintf(page, "%d\n",
3341 atomic_read(&mddev->max_corr_read_errors));
3342}
3343
3344static ssize_t
3345max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3346{
3347 char *e;
3348 unsigned long n = simple_strtoul(buf, &e, 10);
3349
3350 if (*buf && (*e == 0 || *e == '\n')) {
3351 atomic_set(&mddev->max_corr_read_errors, n);
3352 return len;
3353 }
3354 return -EINVAL;
3355}
3356
3357static struct md_sysfs_entry max_corr_read_errors =
3358__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3359 max_corrected_read_errors_store);
3360
3361static ssize_t
3169null_show(mddev_t *mddev, char *page) 3362null_show(mddev_t *mddev, char *page)
3170{ 3363{
3171 return -EINVAL; 3364 return -EINVAL;
@@ -3790,6 +3983,7 @@ static struct attribute *md_default_attrs[] = {
3790 &md_array_state.attr, 3983 &md_array_state.attr,
3791 &md_reshape_position.attr, 3984 &md_reshape_position.attr,
3792 &md_array_size.attr, 3985 &md_array_size.attr,
3986 &max_corr_read_errors.attr,
3793 NULL, 3987 NULL,
3794}; 3988};
3795 3989
@@ -3894,6 +4088,7 @@ static void mddev_delayed_delete(struct work_struct *ws)
3894 mddev->sysfs_action = NULL; 4088 mddev->sysfs_action = NULL;
3895 mddev->private = NULL; 4089 mddev->private = NULL;
3896 } 4090 }
4091 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
3897 kobject_del(&mddev->kobj); 4092 kobject_del(&mddev->kobj);
3898 kobject_put(&mddev->kobj); 4093 kobject_put(&mddev->kobj);
3899} 4094}
@@ -3985,6 +4180,8 @@ static int md_alloc(dev_t dev, char *name)
3985 disk->disk_name); 4180 disk->disk_name);
3986 error = 0; 4181 error = 0;
3987 } 4182 }
4183 if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4184 printk(KERN_DEBUG "pointless warning\n");
3988 abort: 4185 abort:
3989 mutex_unlock(&disks_mutex); 4186 mutex_unlock(&disks_mutex);
3990 if (!error) { 4187 if (!error) {
@@ -4206,6 +4403,8 @@ static int do_md_run(mddev_t * mddev)
4206 mddev->ro = 0; 4403 mddev->ro = 0;
4207 4404
4208 atomic_set(&mddev->writes_pending,0); 4405 atomic_set(&mddev->writes_pending,0);
4406 atomic_set(&mddev->max_corr_read_errors,
4407 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4209 mddev->safemode = 0; 4408 mddev->safemode = 0;
4210 mddev->safemode_timer.function = md_safemode_timeout; 4409 mddev->safemode_timer.function = md_safemode_timeout;
4211 mddev->safemode_timer.data = (unsigned long) mddev; 4410 mddev->safemode_timer.data = (unsigned long) mddev;
@@ -4310,7 +4509,7 @@ static int deny_bitmap_write_access(struct file * file)
4310 return 0; 4509 return 0;
4311} 4510}
4312 4511
4313static void restore_bitmap_write_access(struct file *file) 4512void restore_bitmap_write_access(struct file *file)
4314{ 4513{
4315 struct inode *inode = file->f_mapping->host; 4514 struct inode *inode = file->f_mapping->host;
4316 4515
@@ -4405,12 +4604,12 @@ out:
4405 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4604 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4406 4605
4407 bitmap_destroy(mddev); 4606 bitmap_destroy(mddev);
4408 if (mddev->bitmap_file) { 4607 if (mddev->bitmap_info.file) {
4409 restore_bitmap_write_access(mddev->bitmap_file); 4608 restore_bitmap_write_access(mddev->bitmap_info.file);
4410 fput(mddev->bitmap_file); 4609 fput(mddev->bitmap_info.file);
4411 mddev->bitmap_file = NULL; 4610 mddev->bitmap_info.file = NULL;
4412 } 4611 }
4413 mddev->bitmap_offset = 0; 4612 mddev->bitmap_info.offset = 0;
4414 4613
4415 /* make sure all md_delayed_delete calls have finished */ 4614 /* make sure all md_delayed_delete calls have finished */
4416 flush_scheduled_work(); 4615 flush_scheduled_work();
@@ -4451,6 +4650,11 @@ out:
4451 mddev->degraded = 0; 4650 mddev->degraded = 0;
4452 mddev->barriers_work = 0; 4651 mddev->barriers_work = 0;
4453 mddev->safemode = 0; 4652 mddev->safemode = 0;
4653 mddev->bitmap_info.offset = 0;
4654 mddev->bitmap_info.default_offset = 0;
4655 mddev->bitmap_info.chunksize = 0;
4656 mddev->bitmap_info.daemon_sleep = 0;
4657 mddev->bitmap_info.max_write_behind = 0;
4454 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4658 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4455 if (mddev->hold_active == UNTIL_STOP) 4659 if (mddev->hold_active == UNTIL_STOP)
4456 mddev->hold_active = 0; 4660 mddev->hold_active = 0;
@@ -4636,7 +4840,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4636 info.state = 0; 4840 info.state = 0;
4637 if (mddev->in_sync) 4841 if (mddev->in_sync)
4638 info.state = (1<<MD_SB_CLEAN); 4842 info.state = (1<<MD_SB_CLEAN);
4639 if (mddev->bitmap && mddev->bitmap_offset) 4843 if (mddev->bitmap && mddev->bitmap_info.offset)
4640 info.state = (1<<MD_SB_BITMAP_PRESENT); 4844 info.state = (1<<MD_SB_BITMAP_PRESENT);
4641 info.active_disks = insync; 4845 info.active_disks = insync;
4642 info.working_disks = working; 4846 info.working_disks = working;
@@ -4994,23 +5198,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
4994 if (fd >= 0) { 5198 if (fd >= 0) {
4995 if (mddev->bitmap) 5199 if (mddev->bitmap)
4996 return -EEXIST; /* cannot add when bitmap is present */ 5200 return -EEXIST; /* cannot add when bitmap is present */
4997 mddev->bitmap_file = fget(fd); 5201 mddev->bitmap_info.file = fget(fd);
4998 5202
4999 if (mddev->bitmap_file == NULL) { 5203 if (mddev->bitmap_info.file == NULL) {
5000 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 5204 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5001 mdname(mddev)); 5205 mdname(mddev));
5002 return -EBADF; 5206 return -EBADF;
5003 } 5207 }
5004 5208
5005 err = deny_bitmap_write_access(mddev->bitmap_file); 5209 err = deny_bitmap_write_access(mddev->bitmap_info.file);
5006 if (err) { 5210 if (err) {
5007 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 5211 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
5008 mdname(mddev)); 5212 mdname(mddev));
5009 fput(mddev->bitmap_file); 5213 fput(mddev->bitmap_info.file);
5010 mddev->bitmap_file = NULL; 5214 mddev->bitmap_info.file = NULL;
5011 return err; 5215 return err;
5012 } 5216 }
5013 mddev->bitmap_offset = 0; /* file overrides offset */ 5217 mddev->bitmap_info.offset = 0; /* file overrides offset */
5014 } else if (mddev->bitmap == NULL) 5218 } else if (mddev->bitmap == NULL)
5015 return -ENOENT; /* cannot remove what isn't there */ 5219 return -ENOENT; /* cannot remove what isn't there */
5016 err = 0; 5220 err = 0;
@@ -5025,11 +5229,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
5025 mddev->pers->quiesce(mddev, 0); 5229 mddev->pers->quiesce(mddev, 0);
5026 } 5230 }
5027 if (fd < 0) { 5231 if (fd < 0) {
5028 if (mddev->bitmap_file) { 5232 if (mddev->bitmap_info.file) {
5029 restore_bitmap_write_access(mddev->bitmap_file); 5233 restore_bitmap_write_access(mddev->bitmap_info.file);
5030 fput(mddev->bitmap_file); 5234 fput(mddev->bitmap_info.file);
5031 } 5235 }
5032 mddev->bitmap_file = NULL; 5236 mddev->bitmap_info.file = NULL;
5033 } 5237 }
5034 5238
5035 return err; 5239 return err;
@@ -5096,8 +5300,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5096 mddev->flags = 0; 5300 mddev->flags = 0;
5097 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5301 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5098 5302
5099 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 5303 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5100 mddev->bitmap_offset = 0; 5304 mddev->bitmap_info.offset = 0;
5101 5305
5102 mddev->reshape_position = MaxSector; 5306 mddev->reshape_position = MaxSector;
5103 5307
@@ -5197,7 +5401,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5197 int state = 0; 5401 int state = 0;
5198 5402
5199 /* calculate expected state,ignoring low bits */ 5403 /* calculate expected state,ignoring low bits */
5200 if (mddev->bitmap && mddev->bitmap_offset) 5404 if (mddev->bitmap && mddev->bitmap_info.offset)
5201 state |= (1 << MD_SB_BITMAP_PRESENT); 5405 state |= (1 << MD_SB_BITMAP_PRESENT);
5202 5406
5203 if (mddev->major_version != info->major_version || 5407 if (mddev->major_version != info->major_version ||
@@ -5256,9 +5460,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5256 /* add the bitmap */ 5460 /* add the bitmap */
5257 if (mddev->bitmap) 5461 if (mddev->bitmap)
5258 return -EEXIST; 5462 return -EEXIST;
5259 if (mddev->default_bitmap_offset == 0) 5463 if (mddev->bitmap_info.default_offset == 0)
5260 return -EINVAL; 5464 return -EINVAL;
5261 mddev->bitmap_offset = mddev->default_bitmap_offset; 5465 mddev->bitmap_info.offset =
5466 mddev->bitmap_info.default_offset;
5262 mddev->pers->quiesce(mddev, 1); 5467 mddev->pers->quiesce(mddev, 1);
5263 rv = bitmap_create(mddev); 5468 rv = bitmap_create(mddev);
5264 if (rv) 5469 if (rv)
@@ -5273,7 +5478,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5273 mddev->pers->quiesce(mddev, 1); 5478 mddev->pers->quiesce(mddev, 1);
5274 bitmap_destroy(mddev); 5479 bitmap_destroy(mddev);
5275 mddev->pers->quiesce(mddev, 0); 5480 mddev->pers->quiesce(mddev, 0);
5276 mddev->bitmap_offset = 0; 5481 mddev->bitmap_info.offset = 0;
5277 } 5482 }
5278 } 5483 }
5279 md_update_sb(mddev, 1); 5484 md_update_sb(mddev, 1);
@@ -5524,6 +5729,25 @@ done:
5524abort: 5729abort:
5525 return err; 5730 return err;
5526} 5731}
5732#ifdef CONFIG_COMPAT
5733static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
5734 unsigned int cmd, unsigned long arg)
5735{
5736 switch (cmd) {
5737 case HOT_REMOVE_DISK:
5738 case HOT_ADD_DISK:
5739 case SET_DISK_FAULTY:
5740 case SET_BITMAP_FILE:
5741 /* These take an integer arg, do not convert */
5742 break;
5743 default:
5744 arg = (unsigned long)compat_ptr(arg);
5745 break;
5746 }
5747
5748 return md_ioctl(bdev, mode, cmd, arg);
5749}
5750#endif /* CONFIG_COMPAT */
5527 5751
5528static int md_open(struct block_device *bdev, fmode_t mode) 5752static int md_open(struct block_device *bdev, fmode_t mode)
5529{ 5753{
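Editor's note (a sketch, not part of this patch): most md ioctls pass a pointer to a user-space struct, which a 32-bit caller supplies as a 32-bit value, while the four commands singled out above pass a plain integer (an fd or a device number) that must not be widened as a pointer. Hypothetical illustration of the two argument kinds:

#include <linux/compat.h>
#include <linux/raid/md_u.h>

static unsigned long fix_compat_arg(unsigned int cmd, unsigned long arg)
{
	if (cmd == SET_BITMAP_FILE)	/* integer argument: an fd */
		return arg;
	/* pointer argument: zero-extend the 32-bit user pointer */
	return (unsigned long)compat_ptr(arg);
}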
@@ -5589,6 +5813,9 @@ static const struct block_device_operations md_fops =
5589 .open = md_open, 5813 .open = md_open,
5590 .release = md_release, 5814 .release = md_release,
5591 .ioctl = md_ioctl, 5815 .ioctl = md_ioctl,
5816#ifdef CONFIG_COMPAT
5817 .compat_ioctl = md_compat_ioctl,
5818#endif
5592 .getgeo = md_getgeo, 5819 .getgeo = md_getgeo,
5593 .media_changed = md_media_changed, 5820 .media_changed = md_media_changed,
5594 .revalidate_disk= md_revalidate, 5821 .revalidate_disk= md_revalidate,
@@ -5982,14 +6209,14 @@ static int md_seq_show(struct seq_file *seq, void *v)
5982 unsigned long chunk_kb; 6209 unsigned long chunk_kb;
5983 unsigned long flags; 6210 unsigned long flags;
5984 spin_lock_irqsave(&bitmap->lock, flags); 6211 spin_lock_irqsave(&bitmap->lock, flags);
5985 chunk_kb = bitmap->chunksize >> 10; 6212 chunk_kb = mddev->bitmap_info.chunksize >> 10;
5986 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 6213 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5987 "%lu%s chunk", 6214 "%lu%s chunk",
5988 bitmap->pages - bitmap->missing_pages, 6215 bitmap->pages - bitmap->missing_pages,
5989 bitmap->pages, 6216 bitmap->pages,
5990 (bitmap->pages - bitmap->missing_pages) 6217 (bitmap->pages - bitmap->missing_pages)
5991 << (PAGE_SHIFT - 10), 6218 << (PAGE_SHIFT - 10),
5992 chunk_kb ? chunk_kb : bitmap->chunksize, 6219 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
5993 chunk_kb ? "KB" : "B"); 6220 chunk_kb ? "KB" : "B");
5994 if (bitmap->file) { 6221 if (bitmap->file) {
5995 seq_printf(seq, ", file: "); 6222 seq_printf(seq, ", file: ");
@@ -6338,12 +6565,14 @@ void md_do_sync(mddev_t *mddev)
6338 /* recovery follows the physical size of devices */ 6565 /* recovery follows the physical size of devices */
6339 max_sectors = mddev->dev_sectors; 6566 max_sectors = mddev->dev_sectors;
6340 j = MaxSector; 6567 j = MaxSector;
6341 list_for_each_entry(rdev, &mddev->disks, same_set) 6568 rcu_read_lock();
6569 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6342 if (rdev->raid_disk >= 0 && 6570 if (rdev->raid_disk >= 0 &&
6343 !test_bit(Faulty, &rdev->flags) && 6571 !test_bit(Faulty, &rdev->flags) &&
6344 !test_bit(In_sync, &rdev->flags) && 6572 !test_bit(In_sync, &rdev->flags) &&
6345 rdev->recovery_offset < j) 6573 rdev->recovery_offset < j)
6346 j = rdev->recovery_offset; 6574 j = rdev->recovery_offset;
6575 rcu_read_unlock();
6347 } 6576 }
6348 6577
6349 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6578 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -6380,6 +6609,7 @@ void md_do_sync(mddev_t *mddev)
6380 desc, mdname(mddev)); 6609 desc, mdname(mddev));
6381 mddev->curr_resync = j; 6610 mddev->curr_resync = j;
6382 } 6611 }
6612 mddev->curr_resync_completed = mddev->curr_resync;
6383 6613
6384 while (j < max_sectors) { 6614 while (j < max_sectors) {
6385 sector_t sectors; 6615 sector_t sectors;
@@ -6512,22 +6742,29 @@ void md_do_sync(mddev_t *mddev)
6512 } else { 6742 } else {
6513 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6743 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6514 mddev->curr_resync = MaxSector; 6744 mddev->curr_resync = MaxSector;
6515 list_for_each_entry(rdev, &mddev->disks, same_set) 6745 rcu_read_lock();
6746 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6516 if (rdev->raid_disk >= 0 && 6747 if (rdev->raid_disk >= 0 &&
6517 !test_bit(Faulty, &rdev->flags) && 6748 !test_bit(Faulty, &rdev->flags) &&
6518 !test_bit(In_sync, &rdev->flags) && 6749 !test_bit(In_sync, &rdev->flags) &&
6519 rdev->recovery_offset < mddev->curr_resync) 6750 rdev->recovery_offset < mddev->curr_resync)
6520 rdev->recovery_offset = mddev->curr_resync; 6751 rdev->recovery_offset = mddev->curr_resync;
6752 rcu_read_unlock();
6521 } 6753 }
6522 } 6754 }
6523 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6755 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6524 6756
6525 skip: 6757 skip:
6758 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6759 /* We completed, so any min/max setting used can be forgotten. */
6760 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6761 mddev->resync_min = 0;
6762 mddev->resync_max = MaxSector;
6763 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6764 mddev->resync_min = mddev->curr_resync_completed;
6526 mddev->curr_resync = 0; 6765 mddev->curr_resync = 0;
6527 mddev->curr_resync_completed = 0;
6528 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6766 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6529 /* We completed so max setting can be forgotten. */ 6767 mddev->curr_resync_completed = 0;
6530 mddev->resync_max = MaxSector;
6531 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6768 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6532 wake_up(&resync_wait); 6769 wake_up(&resync_wait);
6533 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6770 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
@@ -6590,6 +6827,7 @@ static int remove_and_add_spares(mddev_t *mddev)
6590 nm, mdname(mddev)); 6827 nm, mdname(mddev));
6591 spares++; 6828 spares++;
6592 md_new_event(mddev); 6829 md_new_event(mddev);
6830 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6593 } else 6831 } else
6594 break; 6832 break;
6595 } 6833 }
@@ -6625,7 +6863,7 @@ void md_check_recovery(mddev_t *mddev)
6625 6863
6626 6864
6627 if (mddev->bitmap) 6865 if (mddev->bitmap)
6628 bitmap_daemon_work(mddev->bitmap); 6866 bitmap_daemon_work(mddev);
6629 6867
6630 if (mddev->ro) 6868 if (mddev->ro)
6631 return; 6869 return;
@@ -6995,5 +7233,6 @@ EXPORT_SYMBOL(md_unregister_thread);
6995EXPORT_SYMBOL(md_wakeup_thread); 7233EXPORT_SYMBOL(md_wakeup_thread);
6996EXPORT_SYMBOL(md_check_recovery); 7234EXPORT_SYMBOL(md_check_recovery);
6997MODULE_LICENSE("GPL"); 7235MODULE_LICENSE("GPL");
7236MODULE_DESCRIPTION("MD RAID framework");
6998MODULE_ALIAS("md"); 7237MODULE_ALIAS("md");
6999MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 7238MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f184b69ef337..8e4c75c00d46 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
97 atomic_t read_errors; /* number of consecutive read errors that 97 atomic_t read_errors; /* number of consecutive read errors that
98 * we have tried to ignore. 98 * we have tried to ignore.
99 */ 99 */
100 struct timespec last_read_error; /* monotonic time of our
101 * last read error
102 */
100 atomic_t corrected_errors; /* number of corrected read errors, 103 atomic_t corrected_errors; /* number of corrected read errors,
101 * for reporting to userspace and storing 104 * for reporting to userspace and storing
102 * in superblock. 105 * in superblock.
@@ -280,17 +283,38 @@ struct mddev_s
280 unsigned int max_write_behind; /* 0 = sync */ 283 unsigned int max_write_behind; /* 0 = sync */
281 284
282 struct bitmap *bitmap; /* the bitmap for the device */ 285 struct bitmap *bitmap; /* the bitmap for the device */
283 struct file *bitmap_file; /* the bitmap file */ 286 struct {
284 long bitmap_offset; /* offset from superblock of 287 struct file *file; /* the bitmap file */
285 * start of bitmap. May be 288 loff_t offset; /* offset from superblock of
286 * negative, but not '0' 289 * start of bitmap. May be
287 */ 290 * negative, but not '0'
288 long default_bitmap_offset; /* this is the offset to use when 291 * For external metadata, offset
289 * hot-adding a bitmap. It should 292 * from start of device.
290 * eventually be settable by sysfs. 293 */
291 */ 294 loff_t default_offset; /* this is the offset to use when
292 295 * hot-adding a bitmap. It should
296 * eventually be settable by sysfs.
297 */
298 struct mutex mutex;
299 unsigned long chunksize;
300 unsigned long daemon_sleep; /* how many seconds between updates? */
301 unsigned long max_write_behind; /* write-behind mode */
302 int external;
303 } bitmap_info;
304
305 atomic_t max_corr_read_errors; /* max read retries */
293 struct list_head all_mddevs; 306 struct list_head all_mddevs;
307
308 /* Generic barrier handling.
309 * If there is a pending barrier request, all other
310 * writes are blocked while the devices are flushed.
311 * The last to finish a flush schedules a worker to
312 * submit the barrier request (without the barrier flag),
313 * then submit more flush requests.
314 */
315 struct bio *barrier;
316 atomic_t flush_pending;
317 struct work_struct barrier_work;
294}; 318};
295 319
296 320
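Editor's note: the three new barrier fields spell out a small counting protocol. A minimal sketch of the completion side the comment describes (hypothetical function name, not code from this patch):

/* Every member-device flush holds a reference on flush_pending; the
 * last completion schedules barrier_work, which re-submits ->barrier
 * without the barrier flag and then issues the post-barrier flushes.
 */
static void flush_one_completed(mddev_t *mddev)
{
	if (atomic_dec_and_test(&mddev->flush_pending))
		schedule_work(&mddev->barrier_work);
}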
@@ -353,7 +377,7 @@ struct md_sysfs_entry {
353 ssize_t (*show)(mddev_t *, char *); 377 ssize_t (*show)(mddev_t *, char *);
354 ssize_t (*store)(mddev_t *, const char *, size_t); 378 ssize_t (*store)(mddev_t *, const char *, size_t);
355}; 379};
356 380extern struct attribute_group md_bitmap_group;
357 381
358static inline char * mdname (mddev_t * mddev) 382static inline char * mdname (mddev_t * mddev)
359{ 383{
@@ -431,6 +455,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
431extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 455extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
432 456
433extern int mddev_congested(mddev_t *mddev, int bits); 457extern int mddev_congested(mddev_t *mddev, int bits);
458extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
434extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 459extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
435 sector_t sector, int size, struct page *page); 460 sector_t sector, int size, struct page *page);
436extern void md_super_wait(mddev_t *mddev); 461extern void md_super_wait(mddev_t *mddev);
@@ -443,6 +468,8 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
443extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); 468extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
444extern int md_check_no_bitmap(mddev_t *mddev); 469extern int md_check_no_bitmap(mddev_t *mddev);
445extern int md_integrity_register(mddev_t *mddev); 470extern int md_integrity_register(mddev_t *mddev);
446void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 471extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
472extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
473extern void restore_bitmap_write_access(struct file *file);
447 474
448#endif /* _MD_MD_H */ 475#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ee7646f974a0..32a662fc55c9 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -145,7 +145,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
145 int cpu; 145 int cpu;
146 146
147 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 147 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
148 bio_endio(bio, -EOPNOTSUPP); 148 md_barrier_request(mddev, bio);
149 return 0; 149 return 0;
150 } 150 }
151 151
@@ -581,6 +581,7 @@ static void __exit multipath_exit (void)
581module_init(multipath_init); 581module_init(multipath_init);
582module_exit(multipath_exit); 582module_exit(multipath_exit);
583MODULE_LICENSE("GPL"); 583MODULE_LICENSE("GPL");
584MODULE_DESCRIPTION("simple multi-path personality for MD");
584MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ 585MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
585MODULE_ALIAS("md-multipath"); 586MODULE_ALIAS("md-multipath");
586MODULE_ALIAS("md-level--4"); 587MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d3a4ce06015a..77605cdceaf1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -453,7 +453,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
453 int cpu; 453 int cpu;
454 454
455 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 455 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
456 bio_endio(bio, -EOPNOTSUPP); 456 md_barrier_request(mddev, bio);
457 return 0; 457 return 0;
458 } 458 }
459 459
@@ -567,6 +567,7 @@ static void raid0_exit (void)
567module_init(raid0_init); 567module_init(raid0_init);
568module_exit(raid0_exit); 568module_exit(raid0_exit);
569MODULE_LICENSE("GPL"); 569MODULE_LICENSE("GPL");
570MODULE_DESCRIPTION("RAID0 (striping) personality for MD");
570MODULE_ALIAS("md-personality-2"); /* RAID0 */ 571MODULE_ALIAS("md-personality-2"); /* RAID0 */
571MODULE_ALIAS("md-raid0"); 572MODULE_ALIAS("md-raid0");
572MODULE_ALIAS("md-level-0"); 573MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e07ce2e033a9..859bd3ffe435 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -677,6 +677,7 @@ static void raise_barrier(conf_t *conf)
677static void lower_barrier(conf_t *conf) 677static void lower_barrier(conf_t *conf)
678{ 678{
679 unsigned long flags; 679 unsigned long flags;
680 BUG_ON(conf->barrier <= 0);
680 spin_lock_irqsave(&conf->resync_lock, flags); 681 spin_lock_irqsave(&conf->resync_lock, flags);
681 conf->barrier--; 682 conf->barrier--;
682 spin_unlock_irqrestore(&conf->resync_lock, flags); 683 spin_unlock_irqrestore(&conf->resync_lock, flags);
@@ -801,6 +802,25 @@ static int make_request(struct request_queue *q, struct bio * bio)
801 802
802 md_write_start(mddev, bio); /* wait on superblock update early */ 803 md_write_start(mddev, bio); /* wait on superblock update early */
803 804
805 if (bio_data_dir(bio) == WRITE &&
806 bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
807 bio->bi_sector < mddev->suspend_hi) {
808 /* As the suspend_* range is controlled by
809 * userspace, we want an interruptible
810 * wait.
811 */
812 DEFINE_WAIT(w);
813 for (;;) {
814 flush_signals(current);
815 prepare_to_wait(&conf->wait_barrier,
816 &w, TASK_INTERRUPTIBLE);
817 if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
818 bio->bi_sector >= mddev->suspend_hi)
819 break;
820 schedule();
821 }
822 finish_wait(&conf->wait_barrier, &w);
823 }
804 if (unlikely(!mddev->barriers_work && 824 if (unlikely(!mddev->barriers_work &&
805 bio_rw_flagged(bio, BIO_RW_BARRIER))) { 825 bio_rw_flagged(bio, BIO_RW_BARRIER))) {
806 if (rw == WRITE) 826 if (rw == WRITE)
@@ -923,7 +943,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
923 943
924 /* do behind I/O ? */ 944 /* do behind I/O ? */
925 if (bitmap && 945 if (bitmap &&
926 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && 946 (atomic_read(&bitmap->behind_writes)
947 < mddev->bitmap_info.max_write_behind) &&
927 (behind_pages = alloc_behind_pages(bio)) != NULL) 948 (behind_pages = alloc_behind_pages(bio)) != NULL)
928 set_bit(R1BIO_BehindIO, &r1_bio->state); 949 set_bit(R1BIO_BehindIO, &r1_bio->state);
929 950
@@ -1941,74 +1962,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
1941 return mddev->dev_sectors; 1962 return mddev->dev_sectors;
1942} 1963}
1943 1964
1944static int run(mddev_t *mddev) 1965static conf_t *setup_conf(mddev_t *mddev)
1945{ 1966{
1946 conf_t *conf; 1967 conf_t *conf;
1947 int i, j, disk_idx; 1968 int i;
1948 mirror_info_t *disk; 1969 mirror_info_t *disk;
1949 mdk_rdev_t *rdev; 1970 mdk_rdev_t *rdev;
1971 int err = -ENOMEM;
1950 1972
1951 if (mddev->level != 1) {
1952 printk("raid1: %s: raid level not set to mirroring (%d)\n",
1953 mdname(mddev), mddev->level);
1954 goto out;
1955 }
1956 if (mddev->reshape_position != MaxSector) {
1957 printk("raid1: %s: reshape_position set but not supported\n",
1958 mdname(mddev));
1959 goto out;
1960 }
1961 /*
1962 * copy the already verified devices into our private RAID1
1963 * bookkeeping area. [whatever we allocate in run(),
1964 * should be freed in stop()]
1965 */
1966 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 1973 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1967 mddev->private = conf;
1968 if (!conf) 1974 if (!conf)
1969 goto out_no_mem; 1975 goto abort;
1970 1976
1971 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 1977 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1972 GFP_KERNEL); 1978 GFP_KERNEL);
1973 if (!conf->mirrors) 1979 if (!conf->mirrors)
1974 goto out_no_mem; 1980 goto abort;
1975 1981
1976 conf->tmppage = alloc_page(GFP_KERNEL); 1982 conf->tmppage = alloc_page(GFP_KERNEL);
1977 if (!conf->tmppage) 1983 if (!conf->tmppage)
1978 goto out_no_mem; 1984 goto abort;
1979 1985
1980 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1986 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1981 if (!conf->poolinfo) 1987 if (!conf->poolinfo)
1982 goto out_no_mem; 1988 goto abort;
1983 conf->poolinfo->mddev = NULL;
1984 conf->poolinfo->raid_disks = mddev->raid_disks; 1989 conf->poolinfo->raid_disks = mddev->raid_disks;
1985 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 1990 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1986 r1bio_pool_free, 1991 r1bio_pool_free,
1987 conf->poolinfo); 1992 conf->poolinfo);
1988 if (!conf->r1bio_pool) 1993 if (!conf->r1bio_pool)
1989 goto out_no_mem; 1994 goto abort;
1995
1990 conf->poolinfo->mddev = mddev; 1996 conf->poolinfo->mddev = mddev;
1991 1997
1992 spin_lock_init(&conf->device_lock); 1998 spin_lock_init(&conf->device_lock);
1993 mddev->queue->queue_lock = &conf->device_lock;
1994
1995 list_for_each_entry(rdev, &mddev->disks, same_set) { 1999 list_for_each_entry(rdev, &mddev->disks, same_set) {
1996 disk_idx = rdev->raid_disk; 2000 int disk_idx = rdev->raid_disk;
1997 if (disk_idx >= mddev->raid_disks 2001 if (disk_idx >= mddev->raid_disks
1998 || disk_idx < 0) 2002 || disk_idx < 0)
1999 continue; 2003 continue;
2000 disk = conf->mirrors + disk_idx; 2004 disk = conf->mirrors + disk_idx;
2001 2005
2002 disk->rdev = rdev; 2006 disk->rdev = rdev;
2003 disk_stack_limits(mddev->gendisk, rdev->bdev,
2004 rdev->data_offset << 9);
2005 /* as we don't honour merge_bvec_fn, we must never risk
2006 * violating it, so limit ->max_sector to one PAGE, as
2007 * a one page request is never in violation.
2008 */
2009 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2010 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2011 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2012 2007
2013 disk->head_position = 0; 2008 disk->head_position = 0;
2014 } 2009 }
@@ -2022,8 +2017,7 @@ static int run(mddev_t *mddev)
2022 bio_list_init(&conf->pending_bio_list); 2017 bio_list_init(&conf->pending_bio_list);
2023 bio_list_init(&conf->flushing_bio_list); 2018 bio_list_init(&conf->flushing_bio_list);
2024 2019
2025 2020 conf->last_used = -1;
2026 mddev->degraded = 0;
2027 for (i = 0; i < conf->raid_disks; i++) { 2021 for (i = 0; i < conf->raid_disks; i++) {
2028 2022
2029 disk = conf->mirrors + i; 2023 disk = conf->mirrors + i;
@@ -2031,38 +2025,97 @@ static int run(mddev_t *mddev)
2031 if (!disk->rdev || 2025 if (!disk->rdev ||
2032 !test_bit(In_sync, &disk->rdev->flags)) { 2026 !test_bit(In_sync, &disk->rdev->flags)) {
2033 disk->head_position = 0; 2027 disk->head_position = 0;
2034 mddev->degraded++;
2035 if (disk->rdev) 2028 if (disk->rdev)
2036 conf->fullsync = 1; 2029 conf->fullsync = 1;
2037 } 2030 } else if (conf->last_used < 0)
2031 /*
2032 * The first working device is used as a
2033 * starting point for read balancing.
2034 */
2035 conf->last_used = i;
2038 } 2036 }
2039 if (mddev->degraded == conf->raid_disks) { 2037
2038 err = -EIO;
2039 if (conf->last_used < 0) {
2040 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 2040 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
2041 mdname(mddev)); 2041 mdname(mddev));
2042 goto out_free_conf; 2042 goto abort;
2043 } 2043 }
2044 if (conf->raid_disks - mddev->degraded == 1) 2044 err = -ENOMEM;
2045 mddev->recovery_cp = MaxSector; 2045 conf->thread = md_register_thread(raid1d, mddev, NULL);
2046 if (!conf->thread) {
2047 printk(KERN_ERR
2048 "raid1: couldn't allocate thread for %s\n",
2049 mdname(mddev));
2050 goto abort;
2051 }
2052
2053 return conf;
2054
2055 abort:
2056 if (conf) {
2057 if (conf->r1bio_pool)
2058 mempool_destroy(conf->r1bio_pool);
2059 kfree(conf->mirrors);
2060 safe_put_page(conf->tmppage);
2061 kfree(conf->poolinfo);
2062 kfree(conf);
2063 }
2064 return ERR_PTR(err);
2065}
2046 2066
2067static int run(mddev_t *mddev)
2068{
2069 conf_t *conf;
2070 int i;
2071 mdk_rdev_t *rdev;
2072
2073 if (mddev->level != 1) {
2074 printk("raid1: %s: raid level not set to mirroring (%d)\n",
2075 mdname(mddev), mddev->level);
2076 return -EIO;
2077 }
2078 if (mddev->reshape_position != MaxSector) {
2079 printk("raid1: %s: reshape_position set but not supported\n",
2080 mdname(mddev));
2081 return -EIO;
2082 }
2047 /* 2083 /*
2048 * find the first working one and use it as a starting point 2084 * copy the already verified devices into our private RAID1
2049 * to read balancing. 2085 * bookkeeping area. [whatever we allocate in run(),
2086 * should be freed in stop()]
2050 */ 2087 */
2051 for (j = 0; j < conf->raid_disks && 2088 if (mddev->private == NULL)
2052 (!conf->mirrors[j].rdev || 2089 conf = setup_conf(mddev);
2053 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) 2090 else
2054 /* nothing */; 2091 conf = mddev->private;
2055 conf->last_used = j;
2056 2092
2093 if (IS_ERR(conf))
2094 return PTR_ERR(conf);
2057 2095
2058 mddev->thread = md_register_thread(raid1d, mddev, NULL); 2096 mddev->queue->queue_lock = &conf->device_lock;
2059 if (!mddev->thread) { 2097 list_for_each_entry(rdev, &mddev->disks, same_set) {
2060 printk(KERN_ERR 2098 disk_stack_limits(mddev->gendisk, rdev->bdev,
2061 "raid1: couldn't allocate thread for %s\n", 2099 rdev->data_offset << 9);
2062 mdname(mddev)); 2100 /* as we don't honour merge_bvec_fn, we must never risk
2063 goto out_free_conf; 2101 * violating it, so limit ->max_sector to one PAGE, as
2102 * a one page request is never in violation.
2103 */
2104 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2105 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2106 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2064 } 2107 }
2065 2108
2109 mddev->degraded = 0;
2110 for (i=0; i < conf->raid_disks; i++)
2111 if (conf->mirrors[i].rdev == NULL ||
2112 !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2113 test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2114 mddev->degraded++;
2115
2116 if (conf->raid_disks - mddev->degraded == 1)
2117 mddev->recovery_cp = MaxSector;
2118
2066 if (mddev->recovery_cp != MaxSector) 2119 if (mddev->recovery_cp != MaxSector)
2067 printk(KERN_NOTICE "raid1: %s is not clean" 2120 printk(KERN_NOTICE "raid1: %s is not clean"
2068 " -- starting background reconstruction\n", 2121 " -- starting background reconstruction\n",
@@ -2071,9 +2124,14 @@ static int run(mddev_t *mddev)
2071 "raid1: raid set %s active with %d out of %d mirrors\n", 2124 "raid1: raid set %s active with %d out of %d mirrors\n",
2072 mdname(mddev), mddev->raid_disks - mddev->degraded, 2125 mdname(mddev), mddev->raid_disks - mddev->degraded,
2073 mddev->raid_disks); 2126 mddev->raid_disks);
2127
2074 /* 2128 /*
2075 * Ok, everything is just fine now 2129 * Ok, everything is just fine now
2076 */ 2130 */
2131 mddev->thread = conf->thread;
2132 conf->thread = NULL;
2133 mddev->private = conf;
2134
2077 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 2135 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2078 2136
2079 mddev->queue->unplug_fn = raid1_unplug; 2137 mddev->queue->unplug_fn = raid1_unplug;
@@ -2081,23 +2139,6 @@ static int run(mddev_t *mddev)
2081 mddev->queue->backing_dev_info.congested_data = mddev; 2139 mddev->queue->backing_dev_info.congested_data = mddev;
2082 md_integrity_register(mddev); 2140 md_integrity_register(mddev);
2083 return 0; 2141 return 0;
2084
2085out_no_mem:
2086 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
2087 mdname(mddev));
2088
2089out_free_conf:
2090 if (conf) {
2091 if (conf->r1bio_pool)
2092 mempool_destroy(conf->r1bio_pool);
2093 kfree(conf->mirrors);
2094 safe_put_page(conf->tmppage);
2095 kfree(conf->poolinfo);
2096 kfree(conf);
2097 mddev->private = NULL;
2098 }
2099out:
2100 return -EIO;
2101} 2142}
2102 2143
2103static int stop(mddev_t *mddev) 2144static int stop(mddev_t *mddev)
@@ -2271,6 +2312,9 @@ static void raid1_quiesce(mddev_t *mddev, int state)
2271 conf_t *conf = mddev->private; 2312 conf_t *conf = mddev->private;
2272 2313
2273 switch(state) { 2314 switch(state) {
2315 case 2: /* wake for suspend */
2316 wake_up(&conf->wait_barrier);
2317 break;
2274 case 1: 2318 case 1:
2275 raise_barrier(conf); 2319 raise_barrier(conf);
2276 break; 2320 break;
@@ -2280,6 +2324,23 @@ static void raid1_quiesce(mddev_t *mddev, int state)
2280 } 2324 }
2281} 2325}
2282 2326
2327static void *raid1_takeover(mddev_t *mddev)
2328{
2329 /* raid1 can take over:
2330 * raid5 with 2 devices, any layout or chunk size
2331 */
2332 if (mddev->level == 5 && mddev->raid_disks == 2) {
2333 conf_t *conf;
2334 mddev->new_level = 1;
2335 mddev->new_layout = 0;
2336 mddev->new_chunk_sectors = 0;
2337 conf = setup_conf(mddev);
2338 if (!IS_ERR(conf))
2339 conf->barrier = 1;
2340 return conf;
2341 }
2342 return ERR_PTR(-EINVAL);
2343}
2283 2344
2284static struct mdk_personality raid1_personality = 2345static struct mdk_personality raid1_personality =
2285{ 2346{
@@ -2299,6 +2360,7 @@ static struct mdk_personality raid1_personality =
2299 .size = raid1_size, 2360 .size = raid1_size,
2300 .check_reshape = raid1_reshape, 2361 .check_reshape = raid1_reshape,
2301 .quiesce = raid1_quiesce, 2362 .quiesce = raid1_quiesce,
2363 .takeover = raid1_takeover,
2302}; 2364};
2303 2365
2304static int __init raid_init(void) 2366static int __init raid_init(void)
@@ -2314,6 +2376,7 @@ static void raid_exit(void)
2314module_init(raid_init); 2376module_init(raid_init);
2315module_exit(raid_exit); 2377module_exit(raid_exit);
2316MODULE_LICENSE("GPL"); 2378MODULE_LICENSE("GPL");
2379MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
2317MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2380MODULE_ALIAS("md-personality-3"); /* RAID1 */
2318MODULE_ALIAS("md-raid1"); 2381MODULE_ALIAS("md-raid1");
2319MODULE_ALIAS("md-level-1"); 2382MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e87b84deff68..5f2d443ae28a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -59,6 +59,11 @@ struct r1_private_data_s {
59 59
60 mempool_t *r1bio_pool; 60 mempool_t *r1bio_pool;
61 mempool_t *r1buf_pool; 61 mempool_t *r1buf_pool;
62
63 /* When taking over an array from a different personality, we store
64 * the new thread here until we fully activate the array.
65 */
66 struct mdk_thread_s *thread;
62}; 67};
63 68
64typedef struct r1_private_data_s conf_t; 69typedef struct r1_private_data_s conf_t;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b87b440..d119b7b75e71 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -804,7 +804,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
804 mdk_rdev_t *blocked_rdev; 804 mdk_rdev_t *blocked_rdev;
805 805
806 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 806 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
807 bio_endio(bio, -EOPNOTSUPP); 807 md_barrier_request(mddev, bio);
808 return 0; 808 return 0;
809 } 809 }
810 810
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432 1432
1433 1433
1434/* 1434/*
1435 * Used by fix_read_error() to decay the per-rdev read_errors.
1436 * We halve the read error count for every hour that has elapsed
1437 * since the last recorded read error.
1438 *
1439 */
1440static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1441{
1442 struct timespec cur_time_mon;
1443 unsigned long hours_since_last;
1444 unsigned int read_errors = atomic_read(&rdev->read_errors);
1445
1446 ktime_get_ts(&cur_time_mon);
1447
1448 if (rdev->last_read_error.tv_sec == 0 &&
1449 rdev->last_read_error.tv_nsec == 0) {
1450 /* first time we've seen a read error */
1451 rdev->last_read_error = cur_time_mon;
1452 return;
1453 }
1454
1455 hours_since_last = (cur_time_mon.tv_sec -
1456 rdev->last_read_error.tv_sec) / 3600;
1457
1458 rdev->last_read_error = cur_time_mon;
1459
1460 /*
1461 * if hours_since_last is > the number of bits in read_errors
1462 * just set read errors to 0. We do this to avoid
1463 * overflowing the shift of read_errors by hours_since_last.
1464 */
1465 if (hours_since_last >= 8 * sizeof(read_errors))
1466 atomic_set(&rdev->read_errors, 0);
1467 else
1468 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1469}
1470
1471/*
1435 * This is a kernel thread which: 1472 * This is a kernel thread which:
1436 * 1473 *
1437 * 1. Retries failed read operations on working mirrors. 1474 * 1. Retries failed read operations on working mirrors.
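Editor's note, a worked example of the decay implemented above: with read_errors at 20 and two hours since the previous error, the stored count becomes 20 >> 2 = 5. The guard against large shifts matters because shifting an unsigned int by its full width (or more) is undefined in C. The same arithmetic as a tiny pure function (hypothetical helper):

static unsigned int decayed_read_errors(unsigned int errors,
					unsigned long hours_since_last)
{
	if (hours_since_last >= 8 * sizeof(errors))
		return 0;	/* shift would be undefined; saturate to 0 */
	return errors >> hours_since_last;	/* halve once per hour */
}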
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1444 int sect = 0; /* Offset from r10_bio->sector */ 1481 int sect = 0; /* Offset from r10_bio->sector */
1445 int sectors = r10_bio->sectors; 1482 int sectors = r10_bio->sectors;
1446 mdk_rdev_t*rdev; 1483 mdk_rdev_t*rdev;
1484 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1485
1486 rcu_read_lock();
1487 {
1488 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1489 char b[BDEVNAME_SIZE];
1490 int cur_read_error_count = 0;
1491
1492 rdev = rcu_dereference(conf->mirrors[d].rdev);
1493 bdevname(rdev->bdev, b);
1494
1495 if (test_bit(Faulty, &rdev->flags)) {
1496 rcu_read_unlock();
1497 /* drive has already been failed, just ignore any
1498 more fix_read_error() attempts */
1499 return;
1500 }
1501
1502 check_decay_read_errors(mddev, rdev);
1503 atomic_inc(&rdev->read_errors);
1504 cur_read_error_count = atomic_read(&rdev->read_errors);
1505 if (cur_read_error_count > max_read_errors) {
1506 rcu_read_unlock();
1507 printk(KERN_NOTICE
1508 "raid10: %s: Raid device exceeded "
1509 "read_error threshold "
1510 "[cur %d:max %d]\n",
1511 b, cur_read_error_count, max_read_errors);
1512 printk(KERN_NOTICE
1513 "raid10: %s: Failing raid "
1514 "device\n", b);
1515 md_error(mddev, conf->mirrors[d].rdev);
1516 return;
1517 }
1518 }
1519 rcu_read_unlock();
1520
1447 while(sectors) { 1521 while(sectors) {
1448 int s = sectors; 1522 int s = sectors;
1449 int sl = r10_bio->read_slot; 1523 int sl = r10_bio->read_slot;
@@ -1488,6 +1562,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1488 /* write it back and re-read */ 1562 /* write it back and re-read */
1489 rcu_read_lock(); 1563 rcu_read_lock();
1490 while (sl != r10_bio->read_slot) { 1564 while (sl != r10_bio->read_slot) {
1565 char b[BDEVNAME_SIZE];
1491 int d; 1566 int d;
1492 if (sl==0) 1567 if (sl==0)
1493 sl = conf->copies; 1568 sl = conf->copies;
@@ -1503,9 +1578,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1503 r10_bio->devs[sl].addr + 1578 r10_bio->devs[sl].addr +
1504 sect + rdev->data_offset, 1579 sect + rdev->data_offset,
1505 s<<9, conf->tmppage, WRITE) 1580 s<<9, conf->tmppage, WRITE)
1506 == 0) 1581 == 0) {
1507 /* Well, this device is dead */ 1582 /* Well, this device is dead */
1583 printk(KERN_NOTICE
1584 "raid10:%s: read correction "
1585 "write failed"
1586 " (%d sectors at %llu on %s)\n",
1587 mdname(mddev), s,
1588 (unsigned long long)(sect+
1589 rdev->data_offset),
1590 bdevname(rdev->bdev, b));
1591 printk(KERN_NOTICE "raid10:%s: failing "
1592 "drive\n",
1593 bdevname(rdev->bdev, b));
1508 md_error(mddev, rdev); 1594 md_error(mddev, rdev);
1595 }
1509 rdev_dec_pending(rdev, mddev); 1596 rdev_dec_pending(rdev, mddev);
1510 rcu_read_lock(); 1597 rcu_read_lock();
1511 } 1598 }
@@ -1526,10 +1613,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1526 if (sync_page_io(rdev->bdev, 1613 if (sync_page_io(rdev->bdev,
1527 r10_bio->devs[sl].addr + 1614 r10_bio->devs[sl].addr +
1528 sect + rdev->data_offset, 1615 sect + rdev->data_offset,
1529 s<<9, conf->tmppage, READ) == 0) 1616 s<<9, conf->tmppage,
1617 READ) == 0) {
1530 /* Well, this device is dead */ 1618 /* Well, this device is dead */
1619 printk(KERN_NOTICE
1620 "raid10:%s: unable to read back "
1621 "corrected sectors"
1622 " (%d sectors at %llu on %s)\n",
1623 mdname(mddev), s,
1624 (unsigned long long)(sect+
1625 rdev->data_offset),
1626 bdevname(rdev->bdev, b));
1627 printk(KERN_NOTICE "raid10:%s: failing drive\n",
1628 bdevname(rdev->bdev, b));
1629
1531 md_error(mddev, rdev); 1630 md_error(mddev, rdev);
1532 else 1631 } else {
1533 printk(KERN_INFO 1632 printk(KERN_INFO
1534 "raid10:%s: read error corrected" 1633 "raid10:%s: read error corrected"
1535 " (%d sectors at %llu on %s)\n", 1634 " (%d sectors at %llu on %s)\n",
@@ -1537,6 +1636,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1537 (unsigned long long)(sect+ 1636 (unsigned long long)(sect+
1538 rdev->data_offset), 1637 rdev->data_offset),
1539 bdevname(rdev->bdev, b)); 1638 bdevname(rdev->bdev, b));
1639 }
1540 1640
1541 rdev_dec_pending(rdev, mddev); 1641 rdev_dec_pending(rdev, mddev);
1542 rcu_read_lock(); 1642 rcu_read_lock();
@@ -2275,13 +2375,6 @@ static void raid10_quiesce(mddev_t *mddev, int state)
2275 lower_barrier(conf); 2375 lower_barrier(conf);
2276 break; 2376 break;
2277 } 2377 }
2278 if (mddev->thread) {
2279 if (mddev->bitmap)
2280 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2281 else
2282 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2283 md_wakeup_thread(mddev->thread);
2284 }
2285} 2378}
2286 2379
2287static struct mdk_personality raid10_personality = 2380static struct mdk_personality raid10_personality =
@@ -2315,6 +2408,7 @@ static void raid_exit(void)
2315module_init(raid_init); 2408module_init(raid_init);
2316module_exit(raid_exit); 2409module_exit(raid_exit);
2317MODULE_LICENSE("GPL"); 2410MODULE_LICENSE("GPL");
2411MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
2318MODULE_ALIAS("md-personality-9"); /* RAID10 */ 2412MODULE_ALIAS("md-personality-9"); /* RAID10 */
2319MODULE_ALIAS("md-raid10"); 2413MODULE_ALIAS("md-raid10");
2320MODULE_ALIAS("md-level-10"); 2414MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d29215d966da..e84204eb12df 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2947,6 +2947,7 @@ static void handle_stripe5(struct stripe_head *sh)
2947 struct r5dev *dev; 2947 struct r5dev *dev;
2948 mdk_rdev_t *blocked_rdev = NULL; 2948 mdk_rdev_t *blocked_rdev = NULL;
2949 int prexor; 2949 int prexor;
2950 int dec_preread_active = 0;
2950 2951
2951 memset(&s, 0, sizeof(s)); 2952 memset(&s, 0, sizeof(s));
2952 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 2953 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -3096,12 +3097,8 @@ static void handle_stripe5(struct stripe_head *sh)
3096 set_bit(STRIPE_INSYNC, &sh->state); 3097 set_bit(STRIPE_INSYNC, &sh->state);
3097 } 3098 }
3098 } 3099 }
3099 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3100 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3100 atomic_dec(&conf->preread_active_stripes); 3101 dec_preread_active = 1;
3101 if (atomic_read(&conf->preread_active_stripes) <
3102 IO_THRESHOLD)
3103 md_wakeup_thread(conf->mddev->thread);
3104 }
3105 } 3102 }
3106 3103
3107 /* Now to consider new write requests and what else, if anything 3104 /* Now to consider new write requests and what else, if anything
@@ -3208,6 +3205,16 @@ static void handle_stripe5(struct stripe_head *sh)
3208 3205
3209 ops_run_io(sh, &s); 3206 ops_run_io(sh, &s);
3210 3207
3208 if (dec_preread_active) {
3209 /* We delay this until after ops_run_io so that if make_request
3210 * is waiting on a barrier, it won't continue until the writes
3211 * have actually been submitted.
3212 */
3213 atomic_dec(&conf->preread_active_stripes);
3214 if (atomic_read(&conf->preread_active_stripes) <
3215 IO_THRESHOLD)
3216 md_wakeup_thread(conf->mddev->thread);
3217 }
3211 return_io(return_bi); 3218 return_io(return_bi);
3212} 3219}
3213 3220
@@ -3221,6 +3228,7 @@ static void handle_stripe6(struct stripe_head *sh)
3221 struct r6_state r6s; 3228 struct r6_state r6s;
3222 struct r5dev *dev, *pdev, *qdev; 3229 struct r5dev *dev, *pdev, *qdev;
3223 mdk_rdev_t *blocked_rdev = NULL; 3230 mdk_rdev_t *blocked_rdev = NULL;
3231 int dec_preread_active = 0;
3224 3232
3225 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3233 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3226 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3234 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3358,7 +3366,6 @@ static void handle_stripe6(struct stripe_head *sh)
3358 * completed 3366 * completed
3359 */ 3367 */
3360 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3368 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3361 int qd_idx = sh->qd_idx;
3362 3369
3363 sh->reconstruct_state = reconstruct_state_idle; 3370 sh->reconstruct_state = reconstruct_state_idle;
3364 /* All the 'written' buffers and the parity blocks are ready to 3371 /* All the 'written' buffers and the parity blocks are ready to
@@ -3380,12 +3387,8 @@ static void handle_stripe6(struct stripe_head *sh)
3380 set_bit(STRIPE_INSYNC, &sh->state); 3387 set_bit(STRIPE_INSYNC, &sh->state);
3381 } 3388 }
3382 } 3389 }
3383 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3390 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3384 atomic_dec(&conf->preread_active_stripes); 3391 dec_preread_active = 1;
3385 if (atomic_read(&conf->preread_active_stripes) <
3386 IO_THRESHOLD)
3387 md_wakeup_thread(conf->mddev->thread);
3388 }
3389 } 3392 }
3390 3393
3391 /* Now to consider new write requests and what else, if anything 3394 /* Now to consider new write requests and what else, if anything
@@ -3494,6 +3497,18 @@ static void handle_stripe6(struct stripe_head *sh)
3494 3497
3495 ops_run_io(sh, &s); 3498 ops_run_io(sh, &s);
3496 3499
3500
3501 if (dec_preread_active) {
3502 /* We delay this until after ops_run_io so that if make_request
3503 * is waiting on a barrier, it won't continue until the writes
3504 * have actually been submitted.
3505 */
3506 atomic_dec(&conf->preread_active_stripes);
3507 if (atomic_read(&conf->preread_active_stripes) <
3508 IO_THRESHOLD)
3509 md_wakeup_thread(conf->mddev->thread);
3510 }
3511
3497 return_io(return_bi); 3512 return_io(return_bi);
3498} 3513}
3499 3514
@@ -3741,7 +3756,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3741{ 3756{
3742 mddev_t *mddev = q->queuedata; 3757 mddev_t *mddev = q->queuedata;
3743 raid5_conf_t *conf = mddev->private; 3758 raid5_conf_t *conf = mddev->private;
3744 unsigned int dd_idx; 3759 int dd_idx;
3745 struct bio* align_bi; 3760 struct bio* align_bi;
3746 mdk_rdev_t *rdev; 3761 mdk_rdev_t *rdev;
3747 3762
@@ -3866,7 +3881,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
3866 int cpu, remaining; 3881 int cpu, remaining;
3867 3882
3868 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3883 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3869 bio_endio(bi, -EOPNOTSUPP); 3884 /* Drain all pending writes. We only really need
3885 * to ensure they have been submitted, but this is
3886 * easier.
3887 */
3888 mddev->pers->quiesce(mddev, 1);
3889 mddev->pers->quiesce(mddev, 0);
3890 md_barrier_request(mddev, bi);
3870 return 0; 3891 return 0;
3871 } 3892 }
3872 3893
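Rather than rejecting barrier bios with -EOPNOTSUPP, make_request now drains the array and defers the bio to the md core. md_barrier_request() is introduced in the md.c portion of this patch; assuming it re-queues the barrier bio once outstanding I/O has settled, the new path annotates as:

    if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
        /* quiesce(mddev, 1) blocks until every active stripe has been
         * handled and its I/O submitted; quiesce(mddev, 0) re-enables
         * normal operation.  The round trip is heavyweight but is a
         * simple way to guarantee all earlier writes are submitted
         * before the barrier itself is acted on. */
        mddev->pers->quiesce(mddev, 1);
        mddev->pers->quiesce(mddev, 0);

        /* Hand the barrier bio to the md core for ordered handling. */
        md_barrier_request(mddev, bi);
        return 0;
    }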
@@ -3990,6 +4011,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
3990 finish_wait(&conf->wait_for_overlap, &w); 4011 finish_wait(&conf->wait_for_overlap, &w);
3991 set_bit(STRIPE_HANDLE, &sh->state); 4012 set_bit(STRIPE_HANDLE, &sh->state);
3992 clear_bit(STRIPE_DELAYED, &sh->state); 4013 clear_bit(STRIPE_DELAYED, &sh->state);
4014 if (mddev->barrier &&
4015 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4016 atomic_inc(&conf->preread_active_stripes);
3993 release_stripe(sh); 4017 release_stripe(sh);
3994 } else { 4018 } else {
3995 /* cannot get stripe for read-ahead, just give-up */ 4019 /* cannot get stripe for read-ahead, just give-up */
@@ -4009,6 +4033,14 @@ static int make_request(struct request_queue *q, struct bio * bi)
4009 4033
4010 bio_endio(bi, 0); 4034 bio_endio(bi, 0);
4011 } 4035 }
4036
4037 if (mddev->barrier) {
4038 /* We need to wait for the stripes to all be handled.
4039 * So: wait for preread_active_stripes to drop to 0.
4040 */
4041 wait_event(mddev->thread->wqueue,
4042 atomic_read(&conf->preread_active_stripes) == 0);
4043 }
4012 return 0; 4044 return 0;
4013} 4045}
4014 4046
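The two make_request hunks above form a count-up/wait-down handshake. Condensed from the patch's own lines, with comments added for illustration:

    /* Producer: while a barrier is pending, every stripe this request
     * touches is marked and counted before release_stripe(). */
    if (mddev->barrier &&
        !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
        atomic_inc(&conf->preread_active_stripes);

    /* Consumer: handle_stripe5/6 decrement the counter after
     * ops_run_io() and call md_wakeup_thread(), which signals this
     * same wqueue, so the wait ends only once every counted stripe
     * has had its writes submitted. */
    if (mddev->barrier)
        wait_event(mddev->thread->wqueue,
                   atomic_read(&conf->preread_active_stripes) == 0);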
@@ -5860,6 +5892,7 @@ static void raid5_exit(void)
5860module_init(raid5_init); 5892module_init(raid5_init);
5861module_exit(raid5_exit); 5893module_exit(raid5_exit);
5862MODULE_LICENSE("GPL"); 5894MODULE_LICENSE("GPL");
5895MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
5863MODULE_ALIAS("md-personality-4"); /* RAID5 */ 5896MODULE_ALIAS("md-personality-4"); /* RAID5 */
5864MODULE_ALIAS("md-raid5"); 5897MODULE_ALIAS("md-raid5");
5865MODULE_ALIAS("md-raid4"); 5898MODULE_ALIAS("md-raid4");
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 866215ac7f25..bffc61bff5ab 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -31,25 +31,6 @@ EXPORT_SYMBOL(raid6_empty_zero_page);
31struct raid6_calls raid6_call; 31struct raid6_calls raid6_call;
32EXPORT_SYMBOL_GPL(raid6_call); 32EXPORT_SYMBOL_GPL(raid6_call);
33 33
34/* Various routine sets */
35extern const struct raid6_calls raid6_intx1;
36extern const struct raid6_calls raid6_intx2;
37extern const struct raid6_calls raid6_intx4;
38extern const struct raid6_calls raid6_intx8;
39extern const struct raid6_calls raid6_intx16;
40extern const struct raid6_calls raid6_intx32;
41extern const struct raid6_calls raid6_mmxx1;
42extern const struct raid6_calls raid6_mmxx2;
43extern const struct raid6_calls raid6_sse1x1;
44extern const struct raid6_calls raid6_sse1x2;
45extern const struct raid6_calls raid6_sse2x1;
46extern const struct raid6_calls raid6_sse2x2;
47extern const struct raid6_calls raid6_sse2x4;
48extern const struct raid6_calls raid6_altivec1;
49extern const struct raid6_calls raid6_altivec2;
50extern const struct raid6_calls raid6_altivec4;
51extern const struct raid6_calls raid6_altivec8;
52
53const struct raid6_calls * const raid6_algos[] = { 34const struct raid6_calls * const raid6_algos[] = {
54 &raid6_intx1, 35 &raid6_intx1,
55 &raid6_intx2, 36 &raid6_intx2,
@@ -169,3 +150,4 @@ static void raid6_exit(void)
169subsys_initcall(raid6_select_algo); 150subsys_initcall(raid6_select_algo);
170module_exit(raid6_exit); 151module_exit(raid6_exit);
171MODULE_LICENSE("GPL"); 152MODULE_LICENSE("GPL");
153MODULE_DESCRIPTION("RAID6 Q-syndrome calculations");
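This hunk pairs with the pq.h hunk at the end of the patch: the extern block is moved, not deleted, so the dispatcher and any other consumer share one authoritative set of declarations. The shape of the refactor in miniature (hypothetical names, all "files" shown inline):

    /* shared header (cf. include/linux/raid/pq.h) */
    struct calls { const char *name; };
    extern const struct calls impl_int, impl_sse;  /* declared once */

    /* per-implementation files define the objects */
    const struct calls impl_int = { "int" };
    const struct calls impl_sse = { "sse" };

    /* the dispatcher (cf. raid6algos.c) now only builds the table and
     * carries no private extern block of its own. */
    const struct calls *const algos[] = { &impl_int, &impl_sse, 0 };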
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 278020d2449c..14cbc831422a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -979,24 +979,6 @@ COMPATIBLE_IOCTL(FIGETBSZ)
979/* 'X' - originally XFS but some now in the VFS */ 979/* 'X' - originally XFS but some now in the VFS */
980COMPATIBLE_IOCTL(FIFREEZE) 980COMPATIBLE_IOCTL(FIFREEZE)
981COMPATIBLE_IOCTL(FITHAW) 981COMPATIBLE_IOCTL(FITHAW)
982/* RAID */
983COMPATIBLE_IOCTL(RAID_VERSION)
984COMPATIBLE_IOCTL(GET_ARRAY_INFO)
985COMPATIBLE_IOCTL(GET_DISK_INFO)
986COMPATIBLE_IOCTL(PRINT_RAID_DEBUG)
987COMPATIBLE_IOCTL(RAID_AUTORUN)
988COMPATIBLE_IOCTL(CLEAR_ARRAY)
989COMPATIBLE_IOCTL(ADD_NEW_DISK)
990COMPATIBLE_IOCTL(SET_ARRAY_INFO)
991COMPATIBLE_IOCTL(SET_DISK_INFO)
992COMPATIBLE_IOCTL(WRITE_RAID_INFO)
993COMPATIBLE_IOCTL(UNPROTECT_ARRAY)
994COMPATIBLE_IOCTL(PROTECT_ARRAY)
995COMPATIBLE_IOCTL(RUN_ARRAY)
996COMPATIBLE_IOCTL(STOP_ARRAY)
997COMPATIBLE_IOCTL(STOP_ARRAY_RO)
998COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
999COMPATIBLE_IOCTL(GET_BITMAP_FILE)
1000COMPATIBLE_IOCTL(KDGETKEYCODE) 982COMPATIBLE_IOCTL(KDGETKEYCODE)
1001COMPATIBLE_IOCTL(KDSETKEYCODE) 983COMPATIBLE_IOCTL(KDSETKEYCODE)
1002COMPATIBLE_IOCTL(KDGKBTYPE) 984COMPATIBLE_IOCTL(KDGKBTYPE)
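The RAID entries can leave the generic compat table because, with the md.c changes elsewhere in this patch, the md driver is expected to field 32-bit ioctls itself. A hedged sketch of that driver-side pattern (the function name is hypothetical; md_ioctl is md's real native handler):

    #ifdef CONFIG_COMPAT
    static int md_compat_ioctl_sketch(struct block_device *bdev, fmode_t mode,
                                      unsigned int cmd, unsigned long arg)
    {
        switch (cmd) {
        /* commands whose argument layout differs in 32-bit mode would
         * be translated here before reaching md_ioctl()... */
        default:
            /* ...the listed RAID commands use fixed-size,
             * layout-compatible structures and forward unchanged. */
            return md_ioctl(bdev, mode, cmd,
                            (unsigned long)compat_ptr(arg));
        }
    }
    #endif /* CONFIG_COMPAT */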
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index d92480f8285c..1cbbd2c11aa9 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -78,6 +78,25 @@ struct raid6_calls {
78/* Selected algorithm */ 78/* Selected algorithm */
79extern struct raid6_calls raid6_call; 79extern struct raid6_calls raid6_call;
80 80
81/* Various routine sets */
82extern const struct raid6_calls raid6_intx1;
83extern const struct raid6_calls raid6_intx2;
84extern const struct raid6_calls raid6_intx4;
85extern const struct raid6_calls raid6_intx8;
86extern const struct raid6_calls raid6_intx16;
87extern const struct raid6_calls raid6_intx32;
88extern const struct raid6_calls raid6_mmxx1;
89extern const struct raid6_calls raid6_mmxx2;
90extern const struct raid6_calls raid6_sse1x1;
91extern const struct raid6_calls raid6_sse1x2;
92extern const struct raid6_calls raid6_sse2x1;
93extern const struct raid6_calls raid6_sse2x2;
94extern const struct raid6_calls raid6_sse2x4;
95extern const struct raid6_calls raid6_altivec1;
96extern const struct raid6_calls raid6_altivec2;
97extern const struct raid6_calls raid6_altivec4;
98extern const struct raid6_calls raid6_altivec8;
99
81/* Algorithm list */ 100/* Algorithm list */
82extern const struct raid6_calls * const raid6_algos[]; 101extern const struct raid6_calls * const raid6_algos[];
83int raid6_select_algo(void); 102int raid6_select_algo(void);
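With these declarations now public, raid6_algos[] and raid6_call tie together at boot: raid6_select_algo() walks the NULL-terminated table, skips entries whose valid() hook says the CPU lacks the needed instructions, times the rest, and installs the winner. Roughly, where benchmark_gen_syndrome() stands in for the real timing loop:

    const struct raid6_calls *const *algo;
    const struct raid6_calls *best = 0;
    unsigned long best_perf = 0;

    for (algo = raid6_algos; *algo; algo++) {
        if ((*algo)->valid && !(*algo)->valid())
            continue;               /* e.g. no SSE2/AltiVec here */
        unsigned long perf = benchmark_gen_syndrome(*algo);
        if (perf > best_perf) {
            best_perf = perf;
            best = *algo;
        }
    }
    if (best)
        raid6_call = *best;         /* selected routine set */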