aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@suse.de>2006-01-06 15:59:59 -0500
committerGreg Kroah-Hartman <gregkh@suse.de>2006-01-06 15:59:59 -0500
commitccf18968b1bbc2fb117190a1984ac2a826dac228 (patch)
tree7bc8fbf5722aecf1e84fa50c31c657864cba1daa /drivers/md
parente91c021c487110386a07facd0396e6c3b7cf9c1f (diff)
parentd99cf9d679a520d67f81d805b7cb91c68e1847f0 (diff)
Merge ../torvalds-2.6/
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bitmap.c114
-rw-r--r--drivers/md/dm-crypt.c5
-rw-r--r--drivers/md/dm-io.h3
-rw-r--r--drivers/md/dm-ioctl.c21
-rw-r--r--drivers/md/dm-log.c2
-rw-r--r--drivers/md/dm-raid1.c13
-rw-r--r--drivers/md/dm-snap.c25
-rw-r--r--drivers/md/dm.c95
-rw-r--r--drivers/md/dm.h5
-rw-r--r--drivers/md/faulty.c9
-rw-r--r--drivers/md/kcopyd.c3
-rw-r--r--drivers/md/linear.c14
-rw-r--r--drivers/md/md.c893
-rw-r--r--drivers/md/multipath.c22
-rw-r--r--drivers/md/raid0.c26
-rw-r--r--drivers/md/raid1.c726
-rw-r--r--drivers/md/raid10.c544
-rw-r--r--drivers/md/raid5.c174
-rw-r--r--drivers/md/raid6main.c348
19 files changed, 2150 insertions, 892 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 252d55df964..76a189ceb52 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -315,6 +315,8 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
315 if (bitmap->file == NULL) 315 if (bitmap->file == NULL)
316 return write_sb_page(bitmap->mddev, bitmap->offset, page, wait); 316 return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
317 317
318 flush_dcache_page(page); /* make sure visible to anyone reading the file */
319
318 if (wait) 320 if (wait)
319 lock_page(page); 321 lock_page(page);
320 else { 322 else {
@@ -341,7 +343,7 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
341 /* add to list to be waited for by daemon */ 343 /* add to list to be waited for by daemon */
342 struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO); 344 struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
343 item->page = page; 345 item->page = page;
344 page_cache_get(page); 346 get_page(page);
345 spin_lock(&bitmap->write_lock); 347 spin_lock(&bitmap->write_lock);
346 list_add(&item->list, &bitmap->complete_pages); 348 list_add(&item->list, &bitmap->complete_pages);
347 spin_unlock(&bitmap->write_lock); 349 spin_unlock(&bitmap->write_lock);
@@ -357,10 +359,10 @@ static struct page *read_page(struct file *file, unsigned long index,
357 struct inode *inode = file->f_mapping->host; 359 struct inode *inode = file->f_mapping->host;
358 struct page *page = NULL; 360 struct page *page = NULL;
359 loff_t isize = i_size_read(inode); 361 loff_t isize = i_size_read(inode);
360 unsigned long end_index = isize >> PAGE_CACHE_SHIFT; 362 unsigned long end_index = isize >> PAGE_SHIFT;
361 363
362 PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE, 364 PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE,
363 (unsigned long long)index << PAGE_CACHE_SHIFT); 365 (unsigned long long)index << PAGE_SHIFT);
364 366
365 page = read_cache_page(inode->i_mapping, index, 367 page = read_cache_page(inode->i_mapping, index,
366 (filler_t *)inode->i_mapping->a_ops->readpage, file); 368 (filler_t *)inode->i_mapping->a_ops->readpage, file);
@@ -368,7 +370,7 @@ static struct page *read_page(struct file *file, unsigned long index,
368 goto out; 370 goto out;
369 wait_on_page_locked(page); 371 wait_on_page_locked(page);
370 if (!PageUptodate(page) || PageError(page)) { 372 if (!PageUptodate(page) || PageError(page)) {
371 page_cache_release(page); 373 put_page(page);
372 page = ERR_PTR(-EIO); 374 page = ERR_PTR(-EIO);
373 goto out; 375 goto out;
374 } 376 }
@@ -376,14 +378,14 @@ static struct page *read_page(struct file *file, unsigned long index,
376 if (index > end_index) /* we have read beyond EOF */ 378 if (index > end_index) /* we have read beyond EOF */
377 *bytes_read = 0; 379 *bytes_read = 0;
378 else if (index == end_index) /* possible short read */ 380 else if (index == end_index) /* possible short read */
379 *bytes_read = isize & ~PAGE_CACHE_MASK; 381 *bytes_read = isize & ~PAGE_MASK;
380 else 382 else
381 *bytes_read = PAGE_CACHE_SIZE; /* got a full page */ 383 *bytes_read = PAGE_SIZE; /* got a full page */
382out: 384out:
383 if (IS_ERR(page)) 385 if (IS_ERR(page))
384 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", 386 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
385 (int)PAGE_CACHE_SIZE, 387 (int)PAGE_SIZE,
386 (unsigned long long)index << PAGE_CACHE_SHIFT, 388 (unsigned long long)index << PAGE_SHIFT,
387 PTR_ERR(page)); 389 PTR_ERR(page));
388 return page; 390 return page;
389} 391}
@@ -406,11 +408,11 @@ int bitmap_update_sb(struct bitmap *bitmap)
406 return 0; 408 return 0;
407 } 409 }
408 spin_unlock_irqrestore(&bitmap->lock, flags); 410 spin_unlock_irqrestore(&bitmap->lock, flags);
409 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 411 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
410 sb->events = cpu_to_le64(bitmap->mddev->events); 412 sb->events = cpu_to_le64(bitmap->mddev->events);
411 if (!bitmap->mddev->degraded) 413 if (!bitmap->mddev->degraded)
412 sb->events_cleared = cpu_to_le64(bitmap->mddev->events); 414 sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
413 kunmap(bitmap->sb_page); 415 kunmap_atomic(sb, KM_USER0);
414 return write_page(bitmap, bitmap->sb_page, 1); 416 return write_page(bitmap, bitmap->sb_page, 1);
415} 417}
416 418
@@ -421,7 +423,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
421 423
422 if (!bitmap || !bitmap->sb_page) 424 if (!bitmap || !bitmap->sb_page)
423 return; 425 return;
424 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 426 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
425 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 427 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
426 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 428 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
427 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 429 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -440,7 +442,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
440 printk(KERN_DEBUG " sync size: %llu KB\n", 442 printk(KERN_DEBUG " sync size: %llu KB\n",
441 (unsigned long long)le64_to_cpu(sb->sync_size)/2); 443 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
442 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); 444 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
443 kunmap(bitmap->sb_page); 445 kunmap_atomic(sb, KM_USER0);
444} 446}
445 447
446/* read the superblock from the bitmap file and initialize some bitmap fields */ 448/* read the superblock from the bitmap file and initialize some bitmap fields */
@@ -466,7 +468,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
466 return err; 468 return err;
467 } 469 }
468 470
469 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 471 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
470 472
471 if (bytes_read < sizeof(*sb)) { /* short read */ 473 if (bytes_read < sizeof(*sb)) { /* short read */
472 printk(KERN_INFO "%s: bitmap file superblock truncated\n", 474 printk(KERN_INFO "%s: bitmap file superblock truncated\n",
@@ -485,12 +487,12 @@ static int bitmap_read_sb(struct bitmap *bitmap)
485 else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || 487 else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
486 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) 488 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
487 reason = "unrecognized superblock version"; 489 reason = "unrecognized superblock version";
488 else if (chunksize < 512 || chunksize > (1024 * 1024 * 4)) 490 else if (chunksize < PAGE_SIZE)
489 reason = "bitmap chunksize out of range (512B - 4MB)"; 491 reason = "bitmap chunksize too small";
490 else if ((1 << ffz(~chunksize)) != chunksize) 492 else if ((1 << ffz(~chunksize)) != chunksize)
491 reason = "bitmap chunksize not a power of 2"; 493 reason = "bitmap chunksize not a power of 2";
492 else if (daemon_sleep < 1 || daemon_sleep > 15) 494 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ)
493 reason = "daemon sleep period out of range (1-15s)"; 495 reason = "daemon sleep period out of range";
494 else if (write_behind > COUNTER_MAX) 496 else if (write_behind > COUNTER_MAX)
495 reason = "write-behind limit out of range (0 - 16383)"; 497 reason = "write-behind limit out of range (0 - 16383)";
496 if (reason) { 498 if (reason) {
@@ -535,7 +537,7 @@ success:
535 bitmap->events_cleared = bitmap->mddev->events; 537 bitmap->events_cleared = bitmap->mddev->events;
536 err = 0; 538 err = 0;
537out: 539out:
538 kunmap(bitmap->sb_page); 540 kunmap_atomic(sb, KM_USER0);
539 if (err) 541 if (err)
540 bitmap_print_sb(bitmap); 542 bitmap_print_sb(bitmap);
541 return err; 543 return err;
@@ -558,9 +560,9 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
558 spin_unlock_irqrestore(&bitmap->lock, flags); 560 spin_unlock_irqrestore(&bitmap->lock, flags);
559 return; 561 return;
560 } 562 }
561 page_cache_get(bitmap->sb_page); 563 get_page(bitmap->sb_page);
562 spin_unlock_irqrestore(&bitmap->lock, flags); 564 spin_unlock_irqrestore(&bitmap->lock, flags);
563 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 565 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
564 switch (op) { 566 switch (op) {
565 case MASK_SET: sb->state |= bits; 567 case MASK_SET: sb->state |= bits;
566 break; 568 break;
@@ -568,8 +570,8 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
568 break; 570 break;
569 default: BUG(); 571 default: BUG();
570 } 572 }
571 kunmap(bitmap->sb_page); 573 kunmap_atomic(sb, KM_USER0);
572 page_cache_release(bitmap->sb_page); 574 put_page(bitmap->sb_page);
573} 575}
574 576
575/* 577/*
@@ -622,12 +624,11 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
622 624
623 while (pages--) 625 while (pages--)
624 if (map[pages]->index != 0) /* 0 is sb_page, release it below */ 626 if (map[pages]->index != 0) /* 0 is sb_page, release it below */
625 page_cache_release(map[pages]); 627 put_page(map[pages]);
626 kfree(map); 628 kfree(map);
627 kfree(attr); 629 kfree(attr);
628 630
629 if (sb_page) 631 safe_put_page(sb_page);
630 page_cache_release(sb_page);
631} 632}
632 633
633static void bitmap_stop_daemon(struct bitmap *bitmap); 634static void bitmap_stop_daemon(struct bitmap *bitmap);
@@ -654,7 +655,7 @@ static void drain_write_queues(struct bitmap *bitmap)
654 655
655 while ((item = dequeue_page(bitmap))) { 656 while ((item = dequeue_page(bitmap))) {
656 /* don't bother to wait */ 657 /* don't bother to wait */
657 page_cache_release(item->page); 658 put_page(item->page);
658 mempool_free(item, bitmap->write_pool); 659 mempool_free(item, bitmap->write_pool);
659 } 660 }
660 661
@@ -763,7 +764,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
763 764
764 /* make sure the page stays cached until it gets written out */ 765 /* make sure the page stays cached until it gets written out */
765 if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY)) 766 if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
766 page_cache_get(page); 767 get_page(page);
767 768
768 /* set the bit */ 769 /* set the bit */
769 kaddr = kmap_atomic(page, KM_USER0); 770 kaddr = kmap_atomic(page, KM_USER0);
@@ -854,6 +855,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
854 unsigned long bytes, offset, dummy; 855 unsigned long bytes, offset, dummy;
855 int outofdate; 856 int outofdate;
856 int ret = -ENOSPC; 857 int ret = -ENOSPC;
858 void *paddr;
857 859
858 chunks = bitmap->chunks; 860 chunks = bitmap->chunks;
859 file = bitmap->file; 861 file = bitmap->file;
@@ -887,12 +889,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
887 if (!bitmap->filemap) 889 if (!bitmap->filemap)
888 goto out; 890 goto out;
889 891
890 bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL); 892 bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL);
891 if (!bitmap->filemap_attr) 893 if (!bitmap->filemap_attr)
892 goto out; 894 goto out;
893 895
894 memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages);
895
896 oldindex = ~0L; 896 oldindex = ~0L;
897 897
898 for (i = 0; i < chunks; i++) { 898 for (i = 0; i < chunks; i++) {
@@ -901,8 +901,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
901 bit = file_page_offset(i); 901 bit = file_page_offset(i);
902 if (index != oldindex) { /* this is a new page, read it in */ 902 if (index != oldindex) { /* this is a new page, read it in */
903 /* unmap the old page, we're done with it */ 903 /* unmap the old page, we're done with it */
904 if (oldpage != NULL)
905 kunmap(oldpage);
906 if (index == 0) { 904 if (index == 0) {
907 /* 905 /*
908 * if we're here then the superblock page 906 * if we're here then the superblock page
@@ -925,30 +923,32 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
925 923
926 oldindex = index; 924 oldindex = index;
927 oldpage = page; 925 oldpage = page;
928 kmap(page);
929 926
930 if (outofdate) { 927 if (outofdate) {
931 /* 928 /*
932 * if bitmap is out of date, dirty the 929 * if bitmap is out of date, dirty the
933 * whole page and write it out 930 * whole page and write it out
934 */ 931 */
935 memset(page_address(page) + offset, 0xff, 932 paddr = kmap_atomic(page, KM_USER0);
933 memset(paddr + offset, 0xff,
936 PAGE_SIZE - offset); 934 PAGE_SIZE - offset);
935 kunmap_atomic(paddr, KM_USER0);
937 ret = write_page(bitmap, page, 1); 936 ret = write_page(bitmap, page, 1);
938 if (ret) { 937 if (ret) {
939 kunmap(page);
940 /* release, page not in filemap yet */ 938 /* release, page not in filemap yet */
941 page_cache_release(page); 939 put_page(page);
942 goto out; 940 goto out;
943 } 941 }
944 } 942 }
945 943
946 bitmap->filemap[bitmap->file_pages++] = page; 944 bitmap->filemap[bitmap->file_pages++] = page;
947 } 945 }
946 paddr = kmap_atomic(page, KM_USER0);
948 if (bitmap->flags & BITMAP_HOSTENDIAN) 947 if (bitmap->flags & BITMAP_HOSTENDIAN)
949 b = test_bit(bit, page_address(page)); 948 b = test_bit(bit, paddr);
950 else 949 else
951 b = ext2_test_bit(bit, page_address(page)); 950 b = ext2_test_bit(bit, paddr);
951 kunmap_atomic(paddr, KM_USER0);
952 if (b) { 952 if (b) {
953 /* if the disk bit is set, set the memory bit */ 953 /* if the disk bit is set, set the memory bit */
954 bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap), 954 bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
@@ -963,9 +963,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
963 ret = 0; 963 ret = 0;
964 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); 964 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
965 965
966 if (page) /* unmap the last page */
967 kunmap(page);
968
969 if (bit_cnt) { /* Kick recovery if any bits were set */ 966 if (bit_cnt) { /* Kick recovery if any bits were set */
970 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); 967 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
971 md_wakeup_thread(bitmap->mddev->thread); 968 md_wakeup_thread(bitmap->mddev->thread);
@@ -1021,6 +1018,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1021 int err = 0; 1018 int err = 0;
1022 int blocks; 1019 int blocks;
1023 int attr; 1020 int attr;
1021 void *paddr;
1024 1022
1025 if (bitmap == NULL) 1023 if (bitmap == NULL)
1026 return 0; 1024 return 0;
@@ -1043,7 +1041,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1043 /* skip this page unless it's marked as needing cleaning */ 1041 /* skip this page unless it's marked as needing cleaning */
1044 if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) { 1042 if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
1045 if (attr & BITMAP_PAGE_NEEDWRITE) { 1043 if (attr & BITMAP_PAGE_NEEDWRITE) {
1046 page_cache_get(page); 1044 get_page(page);
1047 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); 1045 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1048 } 1046 }
1049 spin_unlock_irqrestore(&bitmap->lock, flags); 1047 spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1057,13 +1055,13 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1057 default: 1055 default:
1058 bitmap_file_kick(bitmap); 1056 bitmap_file_kick(bitmap);
1059 } 1057 }
1060 page_cache_release(page); 1058 put_page(page);
1061 } 1059 }
1062 continue; 1060 continue;
1063 } 1061 }
1064 1062
1065 /* grab the new page, sync and release the old */ 1063 /* grab the new page, sync and release the old */
1066 page_cache_get(page); 1064 get_page(page);
1067 if (lastpage != NULL) { 1065 if (lastpage != NULL) {
1068 if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) { 1066 if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
1069 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1067 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
@@ -1077,14 +1075,12 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1077 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1075 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1078 spin_unlock_irqrestore(&bitmap->lock, flags); 1076 spin_unlock_irqrestore(&bitmap->lock, flags);
1079 } 1077 }
1080 kunmap(lastpage); 1078 put_page(lastpage);
1081 page_cache_release(lastpage);
1082 if (err) 1079 if (err)
1083 bitmap_file_kick(bitmap); 1080 bitmap_file_kick(bitmap);
1084 } else 1081 } else
1085 spin_unlock_irqrestore(&bitmap->lock, flags); 1082 spin_unlock_irqrestore(&bitmap->lock, flags);
1086 lastpage = page; 1083 lastpage = page;
1087 kmap(page);
1088/* 1084/*
1089 printk("bitmap clean at page %lu\n", j); 1085 printk("bitmap clean at page %lu\n", j);
1090*/ 1086*/
@@ -1107,10 +1103,12 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1107 -1); 1103 -1);
1108 1104
1109 /* clear the bit */ 1105 /* clear the bit */
1106 paddr = kmap_atomic(page, KM_USER0);
1110 if (bitmap->flags & BITMAP_HOSTENDIAN) 1107 if (bitmap->flags & BITMAP_HOSTENDIAN)
1111 clear_bit(file_page_offset(j), page_address(page)); 1108 clear_bit(file_page_offset(j), paddr);
1112 else 1109 else
1113 ext2_clear_bit(file_page_offset(j), page_address(page)); 1110 ext2_clear_bit(file_page_offset(j), paddr);
1111 kunmap_atomic(paddr, KM_USER0);
1114 } 1112 }
1115 } 1113 }
1116 spin_unlock_irqrestore(&bitmap->lock, flags); 1114 spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1118,7 +1116,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1118 1116
1119 /* now sync the final page */ 1117 /* now sync the final page */
1120 if (lastpage != NULL) { 1118 if (lastpage != NULL) {
1121 kunmap(lastpage);
1122 spin_lock_irqsave(&bitmap->lock, flags); 1119 spin_lock_irqsave(&bitmap->lock, flags);
1123 if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) { 1120 if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
1124 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1121 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
@@ -1133,7 +1130,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1133 spin_unlock_irqrestore(&bitmap->lock, flags); 1130 spin_unlock_irqrestore(&bitmap->lock, flags);
1134 } 1131 }
1135 1132
1136 page_cache_release(lastpage); 1133 put_page(lastpage);
1137 } 1134 }
1138 1135
1139 return err; 1136 return err;
@@ -1184,7 +1181,7 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
1184 PRINTK("finished page writeback: %p\n", page); 1181 PRINTK("finished page writeback: %p\n", page);
1185 1182
1186 err = PageError(page); 1183 err = PageError(page);
1187 page_cache_release(page); 1184 put_page(page);
1188 if (err) { 1185 if (err) {
1189 printk(KERN_WARNING "%s: bitmap file writeback " 1186 printk(KERN_WARNING "%s: bitmap file writeback "
1190 "failed (page %lu): %d\n", 1187 "failed (page %lu): %d\n",
@@ -1530,6 +1527,8 @@ void bitmap_destroy(mddev_t *mddev)
1530 return; 1527 return;
1531 1528
1532 mddev->bitmap = NULL; /* disconnect from the md device */ 1529 mddev->bitmap = NULL; /* disconnect from the md device */
1530 if (mddev->thread)
1531 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1533 1532
1534 bitmap_free(bitmap); 1533 bitmap_free(bitmap);
1535} 1534}
@@ -1555,12 +1554,10 @@ int bitmap_create(mddev_t *mddev)
1555 1554
1556 BUG_ON(file && mddev->bitmap_offset); 1555 BUG_ON(file && mddev->bitmap_offset);
1557 1556
1558 bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL); 1557 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1559 if (!bitmap) 1558 if (!bitmap)
1560 return -ENOMEM; 1559 return -ENOMEM;
1561 1560
1562 memset(bitmap, 0, sizeof(*bitmap));
1563
1564 spin_lock_init(&bitmap->lock); 1561 spin_lock_init(&bitmap->lock);
1565 bitmap->mddev = mddev; 1562 bitmap->mddev = mddev;
1566 1563
@@ -1601,12 +1598,11 @@ int bitmap_create(mddev_t *mddev)
1601#ifdef INJECT_FATAL_FAULT_1 1598#ifdef INJECT_FATAL_FAULT_1
1602 bitmap->bp = NULL; 1599 bitmap->bp = NULL;
1603#else 1600#else
1604 bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); 1601 bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1605#endif 1602#endif
1606 err = -ENOMEM; 1603 err = -ENOMEM;
1607 if (!bitmap->bp) 1604 if (!bitmap->bp)
1608 goto error; 1605 goto error;
1609 memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
1610 1606
1611 bitmap->flags |= BITMAP_ACTIVE; 1607 bitmap->flags |= BITMAP_ACTIVE;
1612 1608
@@ -1636,6 +1632,8 @@ int bitmap_create(mddev_t *mddev)
1636 1632
1637 if (IS_ERR(bitmap->writeback_daemon)) 1633 if (IS_ERR(bitmap->writeback_daemon))
1638 return PTR_ERR(bitmap->writeback_daemon); 1634 return PTR_ERR(bitmap->writeback_daemon);
1635 mddev->thread->timeout = bitmap->daemon_sleep * HZ;
1636
1639 return bitmap_update_sb(bitmap); 1637 return bitmap_update_sb(bitmap);
1640 1638
1641 error: 1639 error:
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index cf663105668..a601a427885 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -690,6 +690,8 @@ bad3:
690bad2: 690bad2:
691 crypto_free_tfm(tfm); 691 crypto_free_tfm(tfm);
692bad1: 692bad1:
693 /* Must zero key material before freeing */
694 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
693 kfree(cc); 695 kfree(cc);
694 return -EINVAL; 696 return -EINVAL;
695} 697}
@@ -706,6 +708,9 @@ static void crypt_dtr(struct dm_target *ti)
706 cc->iv_gen_ops->dtr(cc); 708 cc->iv_gen_ops->dtr(cc);
707 crypto_free_tfm(cc->tfm); 709 crypto_free_tfm(cc->tfm);
708 dm_put_device(ti, cc->dev); 710 dm_put_device(ti, cc->dev);
711
712 /* Must zero key material before freeing */
713 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
709 kfree(cc); 714 kfree(cc);
710} 715}
711 716
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
index 1a77f326570..f9035bfd1a9 100644
--- a/drivers/md/dm-io.h
+++ b/drivers/md/dm-io.h
@@ -9,9 +9,6 @@
9 9
10#include "dm.h" 10#include "dm.h"
11 11
12/* FIXME make this configurable */
13#define DM_MAX_IO_REGIONS 8
14
15struct io_region { 12struct io_region {
16 struct block_device *bdev; 13 struct block_device *bdev;
17 sector_t sector; 14 sector_t sector;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 07d44e19536..561bda5011e 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -270,6 +270,7 @@ static int dm_hash_rename(const char *old, const char *new)
270{ 270{
271 char *new_name, *old_name; 271 char *new_name, *old_name;
272 struct hash_cell *hc; 272 struct hash_cell *hc;
273 struct dm_table *table;
273 274
274 /* 275 /*
275 * duplicate new. 276 * duplicate new.
@@ -317,6 +318,15 @@ static int dm_hash_rename(const char *old, const char *new)
317 /* rename the device node in devfs */ 318 /* rename the device node in devfs */
318 register_with_devfs(hc); 319 register_with_devfs(hc);
319 320
321 /*
322 * Wake up any dm event waiters.
323 */
324 table = dm_get_table(hc->md);
325 if (table) {
326 dm_table_event(table);
327 dm_table_put(table);
328 }
329
320 up_write(&_hash_lock); 330 up_write(&_hash_lock);
321 kfree(old_name); 331 kfree(old_name);
322 return 0; 332 return 0;
@@ -683,14 +693,18 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
683static int do_suspend(struct dm_ioctl *param) 693static int do_suspend(struct dm_ioctl *param)
684{ 694{
685 int r = 0; 695 int r = 0;
696 int do_lockfs = 1;
686 struct mapped_device *md; 697 struct mapped_device *md;
687 698
688 md = find_device(param); 699 md = find_device(param);
689 if (!md) 700 if (!md)
690 return -ENXIO; 701 return -ENXIO;
691 702
703 if (param->flags & DM_SKIP_LOCKFS_FLAG)
704 do_lockfs = 0;
705
692 if (!dm_suspended(md)) 706 if (!dm_suspended(md))
693 r = dm_suspend(md); 707 r = dm_suspend(md, do_lockfs);
694 708
695 if (!r) 709 if (!r)
696 r = __dev_status(md, param); 710 r = __dev_status(md, param);
@@ -702,6 +716,7 @@ static int do_suspend(struct dm_ioctl *param)
702static int do_resume(struct dm_ioctl *param) 716static int do_resume(struct dm_ioctl *param)
703{ 717{
704 int r = 0; 718 int r = 0;
719 int do_lockfs = 1;
705 struct hash_cell *hc; 720 struct hash_cell *hc;
706 struct mapped_device *md; 721 struct mapped_device *md;
707 struct dm_table *new_map; 722 struct dm_table *new_map;
@@ -727,8 +742,10 @@ static int do_resume(struct dm_ioctl *param)
727 /* Do we need to load a new map ? */ 742 /* Do we need to load a new map ? */
728 if (new_map) { 743 if (new_map) {
729 /* Suspend if it isn't already suspended */ 744 /* Suspend if it isn't already suspended */
745 if (param->flags & DM_SKIP_LOCKFS_FLAG)
746 do_lockfs = 0;
730 if (!dm_suspended(md)) 747 if (!dm_suspended(md))
731 dm_suspend(md); 748 dm_suspend(md, do_lockfs);
732 749
733 r = dm_swap_table(md, new_map); 750 r = dm_swap_table(md, new_map);
734 if (r) { 751 if (r) {
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a76349cb10a..efe4adf7853 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -573,7 +573,7 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region)
573 lc->sync_search); 573 lc->sync_search);
574 lc->sync_search = *region + 1; 574 lc->sync_search = *region + 1;
575 575
576 if (*region == lc->region_count) 576 if (*region >= lc->region_count)
577 return 0; 577 return 0;
578 578
579 } while (log_test_bit(lc->recovering_bits, *region)); 579 } while (log_test_bit(lc->recovering_bits, *region));
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 6b0fc167092..6cfa8d435d5 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -562,6 +562,8 @@ struct mirror_set {
562 region_t nr_regions; 562 region_t nr_regions;
563 int in_sync; 563 int in_sync;
564 564
565 struct mirror *default_mirror; /* Default mirror */
566
565 unsigned int nr_mirrors; 567 unsigned int nr_mirrors;
566 struct mirror mirror[0]; 568 struct mirror mirror[0];
567}; 569};
@@ -611,7 +613,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
611 unsigned long flags = 0; 613 unsigned long flags = 0;
612 614
613 /* fill in the source */ 615 /* fill in the source */
614 m = ms->mirror + DEFAULT_MIRROR; 616 m = ms->default_mirror;
615 from.bdev = m->dev->bdev; 617 from.bdev = m->dev->bdev;
616 from.sector = m->offset + region_to_sector(reg->rh, reg->key); 618 from.sector = m->offset + region_to_sector(reg->rh, reg->key);
617 if (reg->key == (ms->nr_regions - 1)) { 619 if (reg->key == (ms->nr_regions - 1)) {
@@ -627,7 +629,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
627 629
628 /* fill in the destinations */ 630 /* fill in the destinations */
629 for (i = 0, dest = to; i < ms->nr_mirrors; i++) { 631 for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
630 if (i == DEFAULT_MIRROR) 632 if (&ms->mirror[i] == ms->default_mirror)
631 continue; 633 continue;
632 634
633 m = ms->mirror + i; 635 m = ms->mirror + i;
@@ -682,7 +684,7 @@ static void do_recovery(struct mirror_set *ms)
682static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) 684static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
683{ 685{
684 /* FIXME: add read balancing */ 686 /* FIXME: add read balancing */
685 return ms->mirror + DEFAULT_MIRROR; 687 return ms->default_mirror;
686} 688}
687 689
688/* 690/*
@@ -709,7 +711,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
709 if (rh_in_sync(&ms->rh, region, 0)) 711 if (rh_in_sync(&ms->rh, region, 0))
710 m = choose_mirror(ms, bio->bi_sector); 712 m = choose_mirror(ms, bio->bi_sector);
711 else 713 else
712 m = ms->mirror + DEFAULT_MIRROR; 714 m = ms->default_mirror;
713 715
714 map_bio(ms, m, bio); 716 map_bio(ms, m, bio);
715 generic_make_request(bio); 717 generic_make_request(bio);
@@ -833,7 +835,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
833 rh_delay(&ms->rh, bio); 835 rh_delay(&ms->rh, bio);
834 836
835 while ((bio = bio_list_pop(&nosync))) { 837 while ((bio = bio_list_pop(&nosync))) {
836 map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); 838 map_bio(ms, ms->default_mirror, bio);
837 generic_make_request(bio); 839 generic_make_request(bio);
838 } 840 }
839} 841}
@@ -900,6 +902,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
900 ms->nr_mirrors = nr_mirrors; 902 ms->nr_mirrors = nr_mirrors;
901 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 903 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
902 ms->in_sync = 0; 904 ms->in_sync = 0;
905 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
903 906
904 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 907 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
905 ti->error = "dm-mirror: Error creating dirty region hash"; 908 ti->error = "dm-mirror: Error creating dirty region hash";
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ab54f99b7c3..4b9dd8fb1e5 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -371,6 +371,20 @@ static inline ulong round_up(ulong n, ulong size)
371 return (n + size) & ~size; 371 return (n + size) & ~size;
372} 372}
373 373
374static void read_snapshot_metadata(struct dm_snapshot *s)
375{
376 if (s->have_metadata)
377 return;
378
379 if (s->store.read_metadata(&s->store)) {
380 down_write(&s->lock);
381 s->valid = 0;
382 up_write(&s->lock);
383 }
384
385 s->have_metadata = 1;
386}
387
374/* 388/*
375 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> 389 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
376 */ 390 */
@@ -848,16 +862,7 @@ static void snapshot_resume(struct dm_target *ti)
848{ 862{
849 struct dm_snapshot *s = (struct dm_snapshot *) ti->private; 863 struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
850 864
851 if (s->have_metadata) 865 read_snapshot_metadata(s);
852 return;
853
854 if (s->store.read_metadata(&s->store)) {
855 down_write(&s->lock);
856 s->valid = 0;
857 up_write(&s->lock);
858 }
859
860 s->have_metadata = 1;
861} 866}
862 867
863static int snapshot_status(struct dm_target *ti, status_type_t type, 868static int snapshot_status(struct dm_target *ti, status_type_t type,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 930b9fc2795..0e481512f91 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -55,6 +55,7 @@ union map_info *dm_get_mapinfo(struct bio *bio)
55 */ 55 */
56#define DMF_BLOCK_IO 0 56#define DMF_BLOCK_IO 0
57#define DMF_SUSPENDED 1 57#define DMF_SUSPENDED 1
58#define DMF_FROZEN 2
58 59
59struct mapped_device { 60struct mapped_device {
60 struct rw_semaphore io_lock; 61 struct rw_semaphore io_lock;
@@ -97,7 +98,7 @@ struct mapped_device {
97 * freeze/thaw support require holding onto a super block 98 * freeze/thaw support require holding onto a super block
98 */ 99 */
99 struct super_block *frozen_sb; 100 struct super_block *frozen_sb;
100 struct block_device *frozen_bdev; 101 struct block_device *suspended_bdev;
101}; 102};
102 103
103#define MIN_IOS 256 104#define MIN_IOS 256
@@ -836,9 +837,9 @@ static void __set_size(struct mapped_device *md, sector_t size)
836{ 837{
837 set_capacity(md->disk, size); 838 set_capacity(md->disk, size);
838 839
839 down(&md->frozen_bdev->bd_inode->i_sem); 840 down(&md->suspended_bdev->bd_inode->i_sem);
840 i_size_write(md->frozen_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 841 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
841 up(&md->frozen_bdev->bd_inode->i_sem); 842 up(&md->suspended_bdev->bd_inode->i_sem);
842} 843}
843 844
844static int __bind(struct mapped_device *md, struct dm_table *t) 845static int __bind(struct mapped_device *md, struct dm_table *t)
@@ -902,10 +903,9 @@ int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
902 return create_aux(minor, 1, result); 903 return create_aux(minor, 1, result);
903} 904}
904 905
905void *dm_get_mdptr(dev_t dev) 906static struct mapped_device *dm_find_md(dev_t dev)
906{ 907{
907 struct mapped_device *md; 908 struct mapped_device *md;
908 void *mdptr = NULL;
909 unsigned minor = MINOR(dev); 909 unsigned minor = MINOR(dev);
910 910
911 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 911 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
@@ -914,12 +914,32 @@ void *dm_get_mdptr(dev_t dev)
914 down(&_minor_lock); 914 down(&_minor_lock);
915 915
916 md = idr_find(&_minor_idr, minor); 916 md = idr_find(&_minor_idr, minor);
917 917 if (!md || (dm_disk(md)->first_minor != minor))
918 if (md && (dm_disk(md)->first_minor == minor)) 918 md = NULL;
919 mdptr = md->interface_ptr;
920 919
921 up(&_minor_lock); 920 up(&_minor_lock);
922 921
922 return md;
923}
924
925struct mapped_device *dm_get_md(dev_t dev)
926{
927 struct mapped_device *md = dm_find_md(dev);
928
929 if (md)
930 dm_get(md);
931
932 return md;
933}
934
935void *dm_get_mdptr(dev_t dev)
936{
937 struct mapped_device *md;
938 void *mdptr = NULL;
939
940 md = dm_find_md(dev);
941 if (md)
942 mdptr = md->interface_ptr;
923 return mdptr; 943 return mdptr;
924} 944}
925 945
@@ -991,43 +1011,33 @@ out:
991 */ 1011 */
992static int lock_fs(struct mapped_device *md) 1012static int lock_fs(struct mapped_device *md)
993{ 1013{
994 int r = -ENOMEM; 1014 int r;
995
996 md->frozen_bdev = bdget_disk(md->disk, 0);
997 if (!md->frozen_bdev) {
998 DMWARN("bdget failed in lock_fs");
999 goto out;
1000 }
1001 1015
1002 WARN_ON(md->frozen_sb); 1016 WARN_ON(md->frozen_sb);
1003 1017
1004 md->frozen_sb = freeze_bdev(md->frozen_bdev); 1018 md->frozen_sb = freeze_bdev(md->suspended_bdev);
1005 if (IS_ERR(md->frozen_sb)) { 1019 if (IS_ERR(md->frozen_sb)) {
1006 r = PTR_ERR(md->frozen_sb); 1020 r = PTR_ERR(md->frozen_sb);
1007 goto out_bdput; 1021 md->frozen_sb = NULL;
1022 return r;
1008 } 1023 }
1009 1024
1025 set_bit(DMF_FROZEN, &md->flags);
1026
1010 /* don't bdput right now, we don't want the bdev 1027 /* don't bdput right now, we don't want the bdev
1011 * to go away while it is locked. We'll bdput 1028 * to go away while it is locked.
1012 * in unlock_fs
1013 */ 1029 */
1014 return 0; 1030 return 0;
1015
1016out_bdput:
1017 bdput(md->frozen_bdev);
1018 md->frozen_sb = NULL;
1019 md->frozen_bdev = NULL;
1020out:
1021 return r;
1022} 1031}
1023 1032
1024static void unlock_fs(struct mapped_device *md) 1033static void unlock_fs(struct mapped_device *md)
1025{ 1034{
1026 thaw_bdev(md->frozen_bdev, md->frozen_sb); 1035 if (!test_bit(DMF_FROZEN, &md->flags))
1027 bdput(md->frozen_bdev); 1036 return;
1028 1037
1038 thaw_bdev(md->suspended_bdev, md->frozen_sb);
1029 md->frozen_sb = NULL; 1039 md->frozen_sb = NULL;
1030 md->frozen_bdev = NULL; 1040 clear_bit(DMF_FROZEN, &md->flags);
1031} 1041}
1032 1042
1033/* 1043/*
@@ -1037,7 +1047,7 @@ static void unlock_fs(struct mapped_device *md)
1037 * dm_bind_table, dm_suspend must be called to flush any in 1047 * dm_bind_table, dm_suspend must be called to flush any in
1038 * flight bios and ensure that any further io gets deferred. 1048 * flight bios and ensure that any further io gets deferred.
1039 */ 1049 */
1040int dm_suspend(struct mapped_device *md) 1050int dm_suspend(struct mapped_device *md, int do_lockfs)
1041{ 1051{
1042 struct dm_table *map = NULL; 1052 struct dm_table *map = NULL;
1043 DECLARE_WAITQUEUE(wait, current); 1053 DECLARE_WAITQUEUE(wait, current);
@@ -1053,10 +1063,19 @@ int dm_suspend(struct mapped_device *md)
1053 /* This does not get reverted if there's an error later. */ 1063 /* This does not get reverted if there's an error later. */
1054 dm_table_presuspend_targets(map); 1064 dm_table_presuspend_targets(map);
1055 1065
1056 /* Flush I/O to the device. */ 1066 md->suspended_bdev = bdget_disk(md->disk, 0);
1057 r = lock_fs(md); 1067 if (!md->suspended_bdev) {
1058 if (r) 1068 DMWARN("bdget failed in dm_suspend");
1069 r = -ENOMEM;
1059 goto out; 1070 goto out;
1071 }
1072
1073 /* Flush I/O to the device. */
1074 if (do_lockfs) {
1075 r = lock_fs(md);
1076 if (r)
1077 goto out;
1078 }
1060 1079
1061 /* 1080 /*
1062 * First we set the BLOCK_IO flag so no more ios will be mapped. 1081 * First we set the BLOCK_IO flag so no more ios will be mapped.
@@ -1105,6 +1124,11 @@ int dm_suspend(struct mapped_device *md)
1105 r = 0; 1124 r = 0;
1106 1125
1107out: 1126out:
1127 if (r && md->suspended_bdev) {
1128 bdput(md->suspended_bdev);
1129 md->suspended_bdev = NULL;
1130 }
1131
1108 dm_table_put(map); 1132 dm_table_put(map);
1109 up(&md->suspend_lock); 1133 up(&md->suspend_lock);
1110 return r; 1134 return r;
@@ -1135,6 +1159,9 @@ int dm_resume(struct mapped_device *md)
1135 1159
1136 unlock_fs(md); 1160 unlock_fs(md);
1137 1161
1162 bdput(md->suspended_bdev);
1163 md->suspended_bdev = NULL;
1164
1138 clear_bit(DMF_SUSPENDED, &md->flags); 1165 clear_bit(DMF_SUSPENDED, &md->flags);
1139 1166
1140 dm_table_unplug_all(map); 1167 dm_table_unplug_all(map);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e38c3fc1a1d..4eaf075da21 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -28,7 +28,7 @@
28 * in types.h. 28 * in types.h.
29 */ 29 */
30#ifdef CONFIG_LBD 30#ifdef CONFIG_LBD
31#define SECTOR_FORMAT "%Lu" 31#define SECTOR_FORMAT "%llu"
32#else 32#else
33#define SECTOR_FORMAT "%lu" 33#define SECTOR_FORMAT "%lu"
34#endif 34#endif
@@ -58,6 +58,7 @@ int dm_create(struct mapped_device **md);
58int dm_create_with_minor(unsigned int minor, struct mapped_device **md); 58int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
59void dm_set_mdptr(struct mapped_device *md, void *ptr); 59void dm_set_mdptr(struct mapped_device *md, void *ptr);
60void *dm_get_mdptr(dev_t dev); 60void *dm_get_mdptr(dev_t dev);
61struct mapped_device *dm_get_md(dev_t dev);
61 62
62/* 63/*
63 * Reference counting for md. 64 * Reference counting for md.
@@ -68,7 +69,7 @@ void dm_put(struct mapped_device *md);
68/* 69/*
69 * A device can still be used while suspended, but I/O is deferred. 70 * A device can still be used while suspended, but I/O is deferred.
70 */ 71 */
71int dm_suspend(struct mapped_device *md); 72int dm_suspend(struct mapped_device *md, int with_lockfs);
72int dm_resume(struct mapped_device *md); 73int dm_resume(struct mapped_device *md);
73 74
74/* 75/*
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 0248f8e7eac..a7a5ab55433 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -316,9 +316,10 @@ static int stop(mddev_t *mddev)
316 return 0; 316 return 0;
317} 317}
318 318
319static mdk_personality_t faulty_personality = 319static struct mdk_personality faulty_personality =
320{ 320{
321 .name = "faulty", 321 .name = "faulty",
322 .level = LEVEL_FAULTY,
322 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
323 .make_request = make_request, 324 .make_request = make_request,
324 .run = run, 325 .run = run,
@@ -329,15 +330,17 @@ static mdk_personality_t faulty_personality =
329 330
330static int __init raid_init(void) 331static int __init raid_init(void)
331{ 332{
332 return register_md_personality(FAULTY, &faulty_personality); 333 return register_md_personality(&faulty_personality);
333} 334}
334 335
335static void raid_exit(void) 336static void raid_exit(void)
336{ 337{
337 unregister_md_personality(FAULTY); 338 unregister_md_personality(&faulty_personality);
338} 339}
339 340
340module_init(raid_init); 341module_init(raid_init);
341module_exit(raid_exit); 342module_exit(raid_exit);
342MODULE_LICENSE("GPL"); 343MODULE_LICENSE("GPL");
343MODULE_ALIAS("md-personality-10"); /* faulty */ 344MODULE_ALIAS("md-personality-10"); /* faulty */
345MODULE_ALIAS("md-faulty");
346MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index eb703648597..ca99979c868 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -561,11 +561,13 @@ int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
561 * Cancels a kcopyd job, eg. someone might be deactivating a 561 * Cancels a kcopyd job, eg. someone might be deactivating a
562 * mirror. 562 * mirror.
563 */ 563 */
564#if 0
564int kcopyd_cancel(struct kcopyd_job *job, int block) 565int kcopyd_cancel(struct kcopyd_job *job, int block)
565{ 566{
566 /* FIXME: finish */ 567 /* FIXME: finish */
567 return -1; 568 return -1;
568} 569}
570#endif /* 0 */
569 571
570/*----------------------------------------------------------------- 572/*-----------------------------------------------------------------
571 * Unit setup 573 * Unit setup
@@ -684,4 +686,3 @@ void kcopyd_client_destroy(struct kcopyd_client *kc)
684EXPORT_SYMBOL(kcopyd_client_create); 686EXPORT_SYMBOL(kcopyd_client_create);
685EXPORT_SYMBOL(kcopyd_client_destroy); 687EXPORT_SYMBOL(kcopyd_client_destroy);
686EXPORT_SYMBOL(kcopyd_copy); 688EXPORT_SYMBOL(kcopyd_copy);
687EXPORT_SYMBOL(kcopyd_cancel);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 946efef3a8f..777585458c8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -121,11 +121,10 @@ static int linear_run (mddev_t *mddev)
121 sector_t curr_offset; 121 sector_t curr_offset;
122 struct list_head *tmp; 122 struct list_head *tmp;
123 123
124 conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), 124 conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t),
125 GFP_KERNEL); 125 GFP_KERNEL);
126 if (!conf) 126 if (!conf)
127 goto out; 127 goto out;
128 memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
129 mddev->private = conf; 128 mddev->private = conf;
130 129
131 cnt = 0; 130 cnt = 0;
@@ -352,9 +351,10 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
352} 351}
353 352
354 353
355static mdk_personality_t linear_personality= 354static struct mdk_personality linear_personality =
356{ 355{
357 .name = "linear", 356 .name = "linear",
357 .level = LEVEL_LINEAR,
358 .owner = THIS_MODULE, 358 .owner = THIS_MODULE,
359 .make_request = linear_make_request, 359 .make_request = linear_make_request,
360 .run = linear_run, 360 .run = linear_run,
@@ -364,16 +364,18 @@ static mdk_personality_t linear_personality=
364 364
365static int __init linear_init (void) 365static int __init linear_init (void)
366{ 366{
367 return register_md_personality (LINEAR, &linear_personality); 367 return register_md_personality (&linear_personality);
368} 368}
369 369
370static void linear_exit (void) 370static void linear_exit (void)
371{ 371{
372 unregister_md_personality (LINEAR); 372 unregister_md_personality (&linear_personality);
373} 373}
374 374
375 375
376module_init(linear_init); 376module_init(linear_init);
377module_exit(linear_exit); 377module_exit(linear_exit);
378MODULE_LICENSE("GPL"); 378MODULE_LICENSE("GPL");
379MODULE_ALIAS("md-personality-1"); /* LINEAR */ 379MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
380MODULE_ALIAS("md-linear");
381MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8175a2a222d..1b76fb29fb7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -42,6 +42,7 @@
42#include <linux/devfs_fs_kernel.h> 42#include <linux/devfs_fs_kernel.h>
43#include <linux/buffer_head.h> /* for invalidate_bdev */ 43#include <linux/buffer_head.h> /* for invalidate_bdev */
44#include <linux/suspend.h> 44#include <linux/suspend.h>
45#include <linux/poll.h>
45 46
46#include <linux/init.h> 47#include <linux/init.h>
47 48
@@ -67,7 +68,7 @@
67static void autostart_arrays (int part); 68static void autostart_arrays (int part);
68#endif 69#endif
69 70
70static mdk_personality_t *pers[MAX_PERSONALITY]; 71static LIST_HEAD(pers_list);
71static DEFINE_SPINLOCK(pers_lock); 72static DEFINE_SPINLOCK(pers_lock);
72 73
73/* 74/*
@@ -80,10 +81,22 @@ static DEFINE_SPINLOCK(pers_lock);
80 * idle IO detection. 81 * idle IO detection.
81 * 82 *
82 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 83 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
84 * or /sys/block/mdX/md/sync_speed_{min,max}
83 */ 85 */
84 86
85static int sysctl_speed_limit_min = 1000; 87static int sysctl_speed_limit_min = 1000;
86static int sysctl_speed_limit_max = 200000; 88static int sysctl_speed_limit_max = 200000;
89static inline int speed_min(mddev_t *mddev)
90{
91 return mddev->sync_speed_min ?
92 mddev->sync_speed_min : sysctl_speed_limit_min;
93}
94
95static inline int speed_max(mddev_t *mddev)
96{
97 return mddev->sync_speed_max ?
98 mddev->sync_speed_max : sysctl_speed_limit_max;
99}
87 100
88static struct ctl_table_header *raid_table_header; 101static struct ctl_table_header *raid_table_header;
89 102
@@ -134,6 +147,24 @@ static struct block_device_operations md_fops;
134static int start_readonly; 147static int start_readonly;
135 148
136/* 149/*
150 * We have a system wide 'event count' that is incremented
151 * on any 'interesting' event, and readers of /proc/mdstat
152 * can use 'poll' or 'select' to find out when the event
153 * count increases.
154 *
155 * Events are:
156 * start array, stop array, error, add device, remove device,
157 * start build, activate spare
158 */
159static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
160static atomic_t md_event_count;
161static void md_new_event(mddev_t *mddev)
162{
163 atomic_inc(&md_event_count);
164 wake_up(&md_event_waiters);
165}
166
167/*
137 * Enables to iterate over all existing md arrays 168 * Enables to iterate over all existing md arrays
138 * all_mddevs_lock protects this list. 169 * all_mddevs_lock protects this list.
139 */ 170 */
@@ -209,12 +240,10 @@ static mddev_t * mddev_find(dev_t unit)
209 } 240 }
210 spin_unlock(&all_mddevs_lock); 241 spin_unlock(&all_mddevs_lock);
211 242
212 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); 243 new = kzalloc(sizeof(*new), GFP_KERNEL);
213 if (!new) 244 if (!new)
214 return NULL; 245 return NULL;
215 246
216 memset(new, 0, sizeof(*new));
217
218 new->unit = unit; 247 new->unit = unit;
219 if (MAJOR(unit) == MD_MAJOR) 248 if (MAJOR(unit) == MD_MAJOR)
220 new->md_minor = MINOR(unit); 249 new->md_minor = MINOR(unit);
@@ -262,7 +291,7 @@ static inline void mddev_unlock(mddev_t * mddev)
262 md_wakeup_thread(mddev->thread); 291 md_wakeup_thread(mddev->thread);
263} 292}
264 293
265mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 294static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
266{ 295{
267 mdk_rdev_t * rdev; 296 mdk_rdev_t * rdev;
268 struct list_head *tmp; 297 struct list_head *tmp;
@@ -286,6 +315,18 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
286 return NULL; 315 return NULL;
287} 316}
288 317
318static struct mdk_personality *find_pers(int level, char *clevel)
319{
320 struct mdk_personality *pers;
321 list_for_each_entry(pers, &pers_list, list) {
322 if (level != LEVEL_NONE && pers->level == level)
323 return pers;
324 if (strcmp(pers->name, clevel)==0)
325 return pers;
326 }
327 return NULL;
328}
329
289static inline sector_t calc_dev_sboffset(struct block_device *bdev) 330static inline sector_t calc_dev_sboffset(struct block_device *bdev)
290{ 331{
291 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 332 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -320,7 +361,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
320static void free_disk_sb(mdk_rdev_t * rdev) 361static void free_disk_sb(mdk_rdev_t * rdev)
321{ 362{
322 if (rdev->sb_page) { 363 if (rdev->sb_page) {
323 page_cache_release(rdev->sb_page); 364 put_page(rdev->sb_page);
324 rdev->sb_loaded = 0; 365 rdev->sb_loaded = 0;
325 rdev->sb_page = NULL; 366 rdev->sb_page = NULL;
326 rdev->sb_offset = 0; 367 rdev->sb_offset = 0;
@@ -461,6 +502,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
461 bio_put(bio); 502 bio_put(bio);
462 return ret; 503 return ret;
463} 504}
505EXPORT_SYMBOL_GPL(sync_page_io);
464 506
465static int read_disk_sb(mdk_rdev_t * rdev, int size) 507static int read_disk_sb(mdk_rdev_t * rdev, int size)
466{ 508{
@@ -665,6 +707,10 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
665 } 707 }
666 rdev->size = calc_dev_size(rdev, sb->chunk_size); 708 rdev->size = calc_dev_size(rdev, sb->chunk_size);
667 709
710 if (rdev->size < sb->size && sb->level > 1)
711 /* "this cannot possibly happen" ... */
712 ret = -EINVAL;
713
668 abort: 714 abort:
669 return ret; 715 return ret;
670} 716}
@@ -688,6 +734,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
688 mddev->ctime = sb->ctime; 734 mddev->ctime = sb->ctime;
689 mddev->utime = sb->utime; 735 mddev->utime = sb->utime;
690 mddev->level = sb->level; 736 mddev->level = sb->level;
737 mddev->clevel[0] = 0;
691 mddev->layout = sb->layout; 738 mddev->layout = sb->layout;
692 mddev->raid_disks = sb->raid_disks; 739 mddev->raid_disks = sb->raid_disks;
693 mddev->size = sb->size; 740 mddev->size = sb->size;
@@ -714,9 +761,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
714 761
715 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 762 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
716 mddev->bitmap_file == NULL) { 763 mddev->bitmap_file == NULL) {
717 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { 764 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
765 && mddev->level != 10) {
718 /* FIXME use a better test */ 766 /* FIXME use a better test */
719 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 767 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
720 return -EINVAL; 768 return -EINVAL;
721 } 769 }
722 mddev->bitmap_offset = mddev->default_bitmap_offset; 770 mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -968,6 +1016,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
968 } 1016 }
969 rdev->preferred_minor = 0xffff; 1017 rdev->preferred_minor = 0xffff;
970 rdev->data_offset = le64_to_cpu(sb->data_offset); 1018 rdev->data_offset = le64_to_cpu(sb->data_offset);
1019 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
971 1020
972 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1021 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
973 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1022 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
@@ -1006,6 +1055,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1006 rdev->size = le64_to_cpu(sb->data_size)/2; 1055 rdev->size = le64_to_cpu(sb->data_size)/2;
1007 if (le32_to_cpu(sb->chunksize)) 1056 if (le32_to_cpu(sb->chunksize))
1008 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1057 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1058
1059 if (le32_to_cpu(sb->size) > rdev->size*2)
1060 return -EINVAL;
1009 return 0; 1061 return 0;
1010} 1062}
1011 1063
@@ -1023,6 +1075,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1023 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1075 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1024 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1076 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1025 mddev->level = le32_to_cpu(sb->level); 1077 mddev->level = le32_to_cpu(sb->level);
1078 mddev->clevel[0] = 0;
1026 mddev->layout = le32_to_cpu(sb->layout); 1079 mddev->layout = le32_to_cpu(sb->layout);
1027 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1080 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1028 mddev->size = le64_to_cpu(sb->size)/2; 1081 mddev->size = le64_to_cpu(sb->size)/2;
@@ -1037,8 +1090,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1037 1090
1038 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1091 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1039 mddev->bitmap_file == NULL ) { 1092 mddev->bitmap_file == NULL ) {
1040 if (mddev->level != 1) { 1093 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1041 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 1094 && mddev->level != 10) {
1095 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
1042 return -EINVAL; 1096 return -EINVAL;
1043 } 1097 }
1044 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1098 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
@@ -1105,6 +1159,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1105 else 1159 else
1106 sb->resync_offset = cpu_to_le64(0); 1160 sb->resync_offset = cpu_to_le64(0);
1107 1161
1162 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
1163
1108 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1164 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1109 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1165 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1110 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1166 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1187,6 +1243,14 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1187 MD_BUG(); 1243 MD_BUG();
1188 return -EINVAL; 1244 return -EINVAL;
1189 } 1245 }
1246 /* make sure rdev->size exceeds mddev->size */
1247 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1248 if (mddev->pers)
1249 /* Cannot change size, so fail */
1250 return -ENOSPC;
1251 else
1252 mddev->size = rdev->size;
1253 }
1190 same_pdev = match_dev_unit(mddev, rdev); 1254 same_pdev = match_dev_unit(mddev, rdev);
1191 if (same_pdev) 1255 if (same_pdev)
1192 printk(KERN_WARNING 1256 printk(KERN_WARNING
@@ -1496,6 +1560,26 @@ repeat:
1496 1560
1497} 1561}
1498 1562
1563/* words written to sysfs files may, or my not, be \n terminated.
1564 * We want to accept with case. For this we use cmd_match.
1565 */
1566static int cmd_match(const char *cmd, const char *str)
1567{
1568 /* See if cmd, written into a sysfs file, matches
1569 * str. They must either be the same, or cmd can
1570 * have a trailing newline
1571 */
1572 while (*cmd && *str && *cmd == *str) {
1573 cmd++;
1574 str++;
1575 }
1576 if (*cmd == '\n')
1577 cmd++;
1578 if (*str || *cmd)
1579 return 0;
1580 return 1;
1581}
1582
1499struct rdev_sysfs_entry { 1583struct rdev_sysfs_entry {
1500 struct attribute attr; 1584 struct attribute attr;
1501 ssize_t (*show)(mdk_rdev_t *, char *); 1585 ssize_t (*show)(mdk_rdev_t *, char *);
@@ -1538,9 +1622,113 @@ super_show(mdk_rdev_t *rdev, char *page)
1538} 1622}
1539static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1623static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1540 1624
1625static ssize_t
1626errors_show(mdk_rdev_t *rdev, char *page)
1627{
1628 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1629}
1630
1631static ssize_t
1632errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1633{
1634 char *e;
1635 unsigned long n = simple_strtoul(buf, &e, 10);
1636 if (*buf && (*e == 0 || *e == '\n')) {
1637 atomic_set(&rdev->corrected_errors, n);
1638 return len;
1639 }
1640 return -EINVAL;
1641}
1642static struct rdev_sysfs_entry rdev_errors =
1643__ATTR(errors, 0644, errors_show, errors_store);
1644
1645static ssize_t
1646slot_show(mdk_rdev_t *rdev, char *page)
1647{
1648 if (rdev->raid_disk < 0)
1649 return sprintf(page, "none\n");
1650 else
1651 return sprintf(page, "%d\n", rdev->raid_disk);
1652}
1653
1654static ssize_t
1655slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1656{
1657 char *e;
1658 int slot = simple_strtoul(buf, &e, 10);
1659 if (strncmp(buf, "none", 4)==0)
1660 slot = -1;
1661 else if (e==buf || (*e && *e!= '\n'))
1662 return -EINVAL;
1663 if (rdev->mddev->pers)
1664 /* Cannot set slot in active array (yet) */
1665 return -EBUSY;
1666 if (slot >= rdev->mddev->raid_disks)
1667 return -ENOSPC;
1668 rdev->raid_disk = slot;
1669 /* assume it is working */
1670 rdev->flags = 0;
1671 set_bit(In_sync, &rdev->flags);
1672 return len;
1673}
1674
1675
1676static struct rdev_sysfs_entry rdev_slot =
1677__ATTR(slot, 0644, slot_show, slot_store);
1678
1679static ssize_t
1680offset_show(mdk_rdev_t *rdev, char *page)
1681{
1682 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1683}
1684
1685static ssize_t
1686offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1687{
1688 char *e;
1689 unsigned long long offset = simple_strtoull(buf, &e, 10);
1690 if (e==buf || (*e && *e != '\n'))
1691 return -EINVAL;
1692 if (rdev->mddev->pers)
1693 return -EBUSY;
1694 rdev->data_offset = offset;
1695 return len;
1696}
1697
1698static struct rdev_sysfs_entry rdev_offset =
1699__ATTR(offset, 0644, offset_show, offset_store);
1700
1701static ssize_t
1702rdev_size_show(mdk_rdev_t *rdev, char *page)
1703{
1704 return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1705}
1706
1707static ssize_t
1708rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1709{
1710 char *e;
1711 unsigned long long size = simple_strtoull(buf, &e, 10);
1712 if (e==buf || (*e && *e != '\n'))
1713 return -EINVAL;
1714 if (rdev->mddev->pers)
1715 return -EBUSY;
1716 rdev->size = size;
1717 if (size < rdev->mddev->size || rdev->mddev->size == 0)
1718 rdev->mddev->size = size;
1719 return len;
1720}
1721
1722static struct rdev_sysfs_entry rdev_size =
1723__ATTR(size, 0644, rdev_size_show, rdev_size_store);
1724
1541static struct attribute *rdev_default_attrs[] = { 1725static struct attribute *rdev_default_attrs[] = {
1542 &rdev_state.attr, 1726 &rdev_state.attr,
1543 &rdev_super.attr, 1727 &rdev_super.attr,
1728 &rdev_errors.attr,
1729 &rdev_slot.attr,
1730 &rdev_offset.attr,
1731 &rdev_size.attr,
1544 NULL, 1732 NULL,
1545}; 1733};
1546static ssize_t 1734static ssize_t
@@ -1598,12 +1786,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
1598 mdk_rdev_t *rdev; 1786 mdk_rdev_t *rdev;
1599 sector_t size; 1787 sector_t size;
1600 1788
1601 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1789 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
1602 if (!rdev) { 1790 if (!rdev) {
1603 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1791 printk(KERN_ERR "md: could not alloc mem for new device!\n");
1604 return ERR_PTR(-ENOMEM); 1792 return ERR_PTR(-ENOMEM);
1605 } 1793 }
1606 memset(rdev, 0, sizeof(*rdev));
1607 1794
1608 if ((err = alloc_disk_sb(rdev))) 1795 if ((err = alloc_disk_sb(rdev)))
1609 goto abort_free; 1796 goto abort_free;
@@ -1621,6 +1808,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
1621 rdev->data_offset = 0; 1808 rdev->data_offset = 0;
1622 atomic_set(&rdev->nr_pending, 0); 1809 atomic_set(&rdev->nr_pending, 0);
1623 atomic_set(&rdev->read_errors, 0); 1810 atomic_set(&rdev->read_errors, 0);
1811 atomic_set(&rdev->corrected_errors, 0);
1624 1812
1625 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1813 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
1626 if (!size) { 1814 if (!size) {
@@ -1725,16 +1913,37 @@ static void analyze_sbs(mddev_t * mddev)
1725static ssize_t 1913static ssize_t
1726level_show(mddev_t *mddev, char *page) 1914level_show(mddev_t *mddev, char *page)
1727{ 1915{
1728 mdk_personality_t *p = mddev->pers; 1916 struct mdk_personality *p = mddev->pers;
1729 if (p == NULL && mddev->raid_disks == 0) 1917 if (p)
1730 return 0;
1731 if (mddev->level >= 0)
1732 return sprintf(page, "raid%d\n", mddev->level);
1733 else
1734 return sprintf(page, "%s\n", p->name); 1918 return sprintf(page, "%s\n", p->name);
1919 else if (mddev->clevel[0])
1920 return sprintf(page, "%s\n", mddev->clevel);
1921 else if (mddev->level != LEVEL_NONE)
1922 return sprintf(page, "%d\n", mddev->level);
1923 else
1924 return 0;
1925}
1926
1927static ssize_t
1928level_store(mddev_t *mddev, const char *buf, size_t len)
1929{
1930 int rv = len;
1931 if (mddev->pers)
1932 return -EBUSY;
1933 if (len == 0)
1934 return 0;
1935 if (len >= sizeof(mddev->clevel))
1936 return -ENOSPC;
1937 strncpy(mddev->clevel, buf, len);
1938 if (mddev->clevel[len-1] == '\n')
1939 len--;
1940 mddev->clevel[len] = 0;
1941 mddev->level = LEVEL_NONE;
1942 return rv;
1735} 1943}
1736 1944
1737static struct md_sysfs_entry md_level = __ATTR_RO(level); 1945static struct md_sysfs_entry md_level =
1946__ATTR(level, 0644, level_show, level_store);
1738 1947
1739static ssize_t 1948static ssize_t
1740raid_disks_show(mddev_t *mddev, char *page) 1949raid_disks_show(mddev_t *mddev, char *page)
@@ -1744,7 +1953,197 @@ raid_disks_show(mddev_t *mddev, char *page)
1744 return sprintf(page, "%d\n", mddev->raid_disks); 1953 return sprintf(page, "%d\n", mddev->raid_disks);
1745} 1954}
1746 1955
1747static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks); 1956static int update_raid_disks(mddev_t *mddev, int raid_disks);
1957
1958static ssize_t
1959raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
1960{
1961 /* can only set raid_disks if array is not yet active */
1962 char *e;
1963 int rv = 0;
1964 unsigned long n = simple_strtoul(buf, &e, 10);
1965
1966 if (!*buf || (*e && *e != '\n'))
1967 return -EINVAL;
1968
1969 if (mddev->pers)
1970 rv = update_raid_disks(mddev, n);
1971 else
1972 mddev->raid_disks = n;
1973 return rv ? rv : len;
1974}
1975static struct md_sysfs_entry md_raid_disks =
1976__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
1977
1978static ssize_t
1979chunk_size_show(mddev_t *mddev, char *page)
1980{
1981 return sprintf(page, "%d\n", mddev->chunk_size);
1982}
1983
1984static ssize_t
1985chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
1986{
1987 /* can only set chunk_size if array is not yet active */
1988 char *e;
1989 unsigned long n = simple_strtoul(buf, &e, 10);
1990
1991 if (mddev->pers)
1992 return -EBUSY;
1993 if (!*buf || (*e && *e != '\n'))
1994 return -EINVAL;
1995
1996 mddev->chunk_size = n;
1997 return len;
1998}
1999static struct md_sysfs_entry md_chunk_size =
2000__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
2001
2002static ssize_t
2003null_show(mddev_t *mddev, char *page)
2004{
2005 return -EINVAL;
2006}
2007
2008static ssize_t
2009new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2010{
2011 /* buf must be %d:%d\n? giving major and minor numbers */
2012 /* The new device is added to the array.
2013 * If the array has a persistent superblock, we read the
2014 * superblock to initialise info and check validity.
2015 * Otherwise, only checking done is that in bind_rdev_to_array,
2016 * which mainly checks size.
2017 */
2018 char *e;
2019 int major = simple_strtoul(buf, &e, 10);
2020 int minor;
2021 dev_t dev;
2022 mdk_rdev_t *rdev;
2023 int err;
2024
2025 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2026 return -EINVAL;
2027 minor = simple_strtoul(e+1, &e, 10);
2028 if (*e && *e != '\n')
2029 return -EINVAL;
2030 dev = MKDEV(major, minor);
2031 if (major != MAJOR(dev) ||
2032 minor != MINOR(dev))
2033 return -EOVERFLOW;
2034
2035
2036 if (mddev->persistent) {
2037 rdev = md_import_device(dev, mddev->major_version,
2038 mddev->minor_version);
2039 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2040 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2041 mdk_rdev_t, same_set);
2042 err = super_types[mddev->major_version]
2043 .load_super(rdev, rdev0, mddev->minor_version);
2044 if (err < 0)
2045 goto out;
2046 }
2047 } else
2048 rdev = md_import_device(dev, -1, -1);
2049
2050 if (IS_ERR(rdev))
2051 return PTR_ERR(rdev);
2052 err = bind_rdev_to_array(rdev, mddev);
2053 out:
2054 if (err)
2055 export_rdev(rdev);
2056 return err ? err : len;
2057}
2058
2059static struct md_sysfs_entry md_new_device =
2060__ATTR(new_dev, 0200, null_show, new_dev_store);
2061
2062static ssize_t
2063size_show(mddev_t *mddev, char *page)
2064{
2065 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2066}
2067
2068static int update_size(mddev_t *mddev, unsigned long size);
2069
2070static ssize_t
2071size_store(mddev_t *mddev, const char *buf, size_t len)
2072{
2073 /* If array is inactive, we can reduce the component size, but
2074 * not increase it (except from 0).
2075 * If array is active, we can try an on-line resize
2076 */
2077 char *e;
2078 int err = 0;
2079 unsigned long long size = simple_strtoull(buf, &e, 10);
2080 if (!*buf || *buf == '\n' ||
2081 (*e && *e != '\n'))
2082 return -EINVAL;
2083
2084 if (mddev->pers) {
2085 err = update_size(mddev, size);
2086 md_update_sb(mddev);
2087 } else {
2088 if (mddev->size == 0 ||
2089 mddev->size > size)
2090 mddev->size = size;
2091 else
2092 err = -ENOSPC;
2093 }
2094 return err ? err : len;
2095}
2096
2097static struct md_sysfs_entry md_size =
2098__ATTR(component_size, 0644, size_show, size_store);
2099
2100
2101/* Metdata version.
2102 * This is either 'none' for arrays with externally managed metadata,
2103 * or N.M for internally known formats
2104 */
2105static ssize_t
2106metadata_show(mddev_t *mddev, char *page)
2107{
2108 if (mddev->persistent)
2109 return sprintf(page, "%d.%d\n",
2110 mddev->major_version, mddev->minor_version);
2111 else
2112 return sprintf(page, "none\n");
2113}
2114
2115static ssize_t
2116metadata_store(mddev_t *mddev, const char *buf, size_t len)
2117{
2118 int major, minor;
2119 char *e;
2120 if (!list_empty(&mddev->disks))
2121 return -EBUSY;
2122
2123 if (cmd_match(buf, "none")) {
2124 mddev->persistent = 0;
2125 mddev->major_version = 0;
2126 mddev->minor_version = 90;
2127 return len;
2128 }
2129 major = simple_strtoul(buf, &e, 10);
2130 if (e==buf || *e != '.')
2131 return -EINVAL;
2132 buf = e+1;
2133 minor = simple_strtoul(buf, &e, 10);
2134 if (e==buf || *e != '\n')
2135 return -EINVAL;
2136 if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
2137 super_types[major].name == NULL)
2138 return -ENOENT;
2139 mddev->major_version = major;
2140 mddev->minor_version = minor;
2141 mddev->persistent = 1;
2142 return len;
2143}
2144
2145static struct md_sysfs_entry md_metadata =
2146__ATTR(metadata_version, 0644, metadata_show, metadata_store);
1748 2147
1749static ssize_t 2148static ssize_t
1750action_show(mddev_t *mddev, char *page) 2149action_show(mddev_t *mddev, char *page)
@@ -1771,31 +2170,27 @@ action_store(mddev_t *mddev, const char *page, size_t len)
1771 if (!mddev->pers || !mddev->pers->sync_request) 2170 if (!mddev->pers || !mddev->pers->sync_request)
1772 return -EINVAL; 2171 return -EINVAL;
1773 2172
1774 if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) { 2173 if (cmd_match(page, "idle")) {
1775 if (mddev->sync_thread) { 2174 if (mddev->sync_thread) {
1776 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2175 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1777 md_unregister_thread(mddev->sync_thread); 2176 md_unregister_thread(mddev->sync_thread);
1778 mddev->sync_thread = NULL; 2177 mddev->sync_thread = NULL;
1779 mddev->recovery = 0; 2178 mddev->recovery = 0;
1780 } 2179 }
1781 return len; 2180 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
1782 } 2181 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
1783
1784 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
1785 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
1786 return -EBUSY; 2182 return -EBUSY;
1787 if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 || 2183 else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
1788 strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0)
1789 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2184 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1790 else { 2185 else {
1791 if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) 2186 if (cmd_match(page, "check"))
1792 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2187 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
1793 else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) 2188 else if (cmd_match(page, "repair"))
1794 return -EINVAL; 2189 return -EINVAL;
1795 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2190 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
1796 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2191 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
1797 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1798 } 2192 }
2193 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1799 md_wakeup_thread(mddev->thread); 2194 md_wakeup_thread(mddev->thread);
1800 return len; 2195 return len;
1801} 2196}
@@ -1814,15 +2209,107 @@ md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
1814static struct md_sysfs_entry 2209static struct md_sysfs_entry
1815md_mismatches = __ATTR_RO(mismatch_cnt); 2210md_mismatches = __ATTR_RO(mismatch_cnt);
1816 2211
2212static ssize_t
2213sync_min_show(mddev_t *mddev, char *page)
2214{
2215 return sprintf(page, "%d (%s)\n", speed_min(mddev),
2216 mddev->sync_speed_min ? "local": "system");
2217}
2218
2219static ssize_t
2220sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2221{
2222 int min;
2223 char *e;
2224 if (strncmp(buf, "system", 6)==0) {
2225 mddev->sync_speed_min = 0;
2226 return len;
2227 }
2228 min = simple_strtoul(buf, &e, 10);
2229 if (buf == e || (*e && *e != '\n') || min <= 0)
2230 return -EINVAL;
2231 mddev->sync_speed_min = min;
2232 return len;
2233}
2234
2235static struct md_sysfs_entry md_sync_min =
2236__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2237
2238static ssize_t
2239sync_max_show(mddev_t *mddev, char *page)
2240{
2241 return sprintf(page, "%d (%s)\n", speed_max(mddev),
2242 mddev->sync_speed_max ? "local": "system");
2243}
2244
2245static ssize_t
2246sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2247{
2248 int max;
2249 char *e;
2250 if (strncmp(buf, "system", 6)==0) {
2251 mddev->sync_speed_max = 0;
2252 return len;
2253 }
2254 max = simple_strtoul(buf, &e, 10);
2255 if (buf == e || (*e && *e != '\n') || max <= 0)
2256 return -EINVAL;
2257 mddev->sync_speed_max = max;
2258 return len;
2259}
2260
2261static struct md_sysfs_entry md_sync_max =
2262__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2263
2264
2265static ssize_t
2266sync_speed_show(mddev_t *mddev, char *page)
2267{
2268 unsigned long resync, dt, db;
2269 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2270 dt = ((jiffies - mddev->resync_mark) / HZ);
2271 if (!dt) dt++;
2272 db = resync - (mddev->resync_mark_cnt);
2273 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2274}
2275
2276static struct md_sysfs_entry
2277md_sync_speed = __ATTR_RO(sync_speed);
2278
2279static ssize_t
2280sync_completed_show(mddev_t *mddev, char *page)
2281{
2282 unsigned long max_blocks, resync;
2283
2284 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2285 max_blocks = mddev->resync_max_sectors;
2286 else
2287 max_blocks = mddev->size << 1;
2288
2289 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2290 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2291}
2292
2293static struct md_sysfs_entry
2294md_sync_completed = __ATTR_RO(sync_completed);
2295
1817static struct attribute *md_default_attrs[] = { 2296static struct attribute *md_default_attrs[] = {
1818 &md_level.attr, 2297 &md_level.attr,
1819 &md_raid_disks.attr, 2298 &md_raid_disks.attr,
2299 &md_chunk_size.attr,
2300 &md_size.attr,
2301 &md_metadata.attr,
2302 &md_new_device.attr,
1820 NULL, 2303 NULL,
1821}; 2304};
1822 2305
1823static struct attribute *md_redundancy_attrs[] = { 2306static struct attribute *md_redundancy_attrs[] = {
1824 &md_scan_mode.attr, 2307 &md_scan_mode.attr,
1825 &md_mismatches.attr, 2308 &md_mismatches.attr,
2309 &md_sync_min.attr,
2310 &md_sync_max.attr,
2311 &md_sync_speed.attr,
2312 &md_sync_completed.attr,
1826 NULL, 2313 NULL,
1827}; 2314};
1828static struct attribute_group md_redundancy_group = { 2315static struct attribute_group md_redundancy_group = {
@@ -1937,14 +2424,16 @@ static void md_safemode_timeout(unsigned long data)
1937 md_wakeup_thread(mddev->thread); 2424 md_wakeup_thread(mddev->thread);
1938} 2425}
1939 2426
2427static int start_dirty_degraded;
1940 2428
1941static int do_md_run(mddev_t * mddev) 2429static int do_md_run(mddev_t * mddev)
1942{ 2430{
1943 int pnum, err; 2431 int err;
1944 int chunk_size; 2432 int chunk_size;
1945 struct list_head *tmp; 2433 struct list_head *tmp;
1946 mdk_rdev_t *rdev; 2434 mdk_rdev_t *rdev;
1947 struct gendisk *disk; 2435 struct gendisk *disk;
2436 struct mdk_personality *pers;
1948 char b[BDEVNAME_SIZE]; 2437 char b[BDEVNAME_SIZE];
1949 2438
1950 if (list_empty(&mddev->disks)) 2439 if (list_empty(&mddev->disks))
@@ -1961,20 +2450,8 @@ static int do_md_run(mddev_t * mddev)
1961 analyze_sbs(mddev); 2450 analyze_sbs(mddev);
1962 2451
1963 chunk_size = mddev->chunk_size; 2452 chunk_size = mddev->chunk_size;
1964 pnum = level_to_pers(mddev->level);
1965 2453
1966 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 2454 if (chunk_size) {
1967 if (!chunk_size) {
1968 /*
1969 * 'default chunksize' in the old md code used to
1970 * be PAGE_SIZE, baaad.
1971 * we abort here to be on the safe side. We don't
1972 * want to continue the bad practice.
1973 */
1974 printk(KERN_ERR
1975 "no chunksize specified, see 'man raidtab'\n");
1976 return -EINVAL;
1977 }
1978 if (chunk_size > MAX_CHUNK_SIZE) { 2455 if (chunk_size > MAX_CHUNK_SIZE) {
1979 printk(KERN_ERR "too big chunk_size: %d > %d\n", 2456 printk(KERN_ERR "too big chunk_size: %d > %d\n",
1980 chunk_size, MAX_CHUNK_SIZE); 2457 chunk_size, MAX_CHUNK_SIZE);
@@ -2010,10 +2487,10 @@ static int do_md_run(mddev_t * mddev)
2010 } 2487 }
2011 2488
2012#ifdef CONFIG_KMOD 2489#ifdef CONFIG_KMOD
2013 if (!pers[pnum]) 2490 if (mddev->level != LEVEL_NONE)
2014 { 2491 request_module("md-level-%d", mddev->level);
2015 request_module("md-personality-%d", pnum); 2492 else if (mddev->clevel[0])
2016 } 2493 request_module("md-%s", mddev->clevel);
2017#endif 2494#endif
2018 2495
2019 /* 2496 /*
@@ -2035,30 +2512,39 @@ static int do_md_run(mddev_t * mddev)
2035 return -ENOMEM; 2512 return -ENOMEM;
2036 2513
2037 spin_lock(&pers_lock); 2514 spin_lock(&pers_lock);
2038 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 2515 pers = find_pers(mddev->level, mddev->clevel);
2516 if (!pers || !try_module_get(pers->owner)) {
2039 spin_unlock(&pers_lock); 2517 spin_unlock(&pers_lock);
2040 printk(KERN_WARNING "md: personality %d is not loaded!\n", 2518 if (mddev->level != LEVEL_NONE)
2041 pnum); 2519 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
2520 mddev->level);
2521 else
2522 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
2523 mddev->clevel);
2042 return -EINVAL; 2524 return -EINVAL;
2043 } 2525 }
2044 2526 mddev->pers = pers;
2045 mddev->pers = pers[pnum];
2046 spin_unlock(&pers_lock); 2527 spin_unlock(&pers_lock);
2528 mddev->level = pers->level;
2529 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2047 2530
2048 mddev->recovery = 0; 2531 mddev->recovery = 0;
2049 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 2532 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
2050 mddev->barriers_work = 1; 2533 mddev->barriers_work = 1;
2534 mddev->ok_start_degraded = start_dirty_degraded;
2051 2535
2052 if (start_readonly) 2536 if (start_readonly)
2053 mddev->ro = 2; /* read-only, but switch on first write */ 2537 mddev->ro = 2; /* read-only, but switch on first write */
2054 2538
2055 /* before we start the array running, initialise the bitmap */ 2539 err = mddev->pers->run(mddev);
2056 err = bitmap_create(mddev); 2540 if (!err && mddev->pers->sync_request) {
2057 if (err) 2541 err = bitmap_create(mddev);
2058 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 2542 if (err) {
2059 mdname(mddev), err); 2543 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
2060 else 2544 mdname(mddev), err);
2061 err = mddev->pers->run(mddev); 2545 mddev->pers->stop(mddev);
2546 }
2547 }
2062 if (err) { 2548 if (err) {
2063 printk(KERN_ERR "md: pers->run() failed ...\n"); 2549 printk(KERN_ERR "md: pers->run() failed ...\n");
2064 module_put(mddev->pers->owner); 2550 module_put(mddev->pers->owner);
@@ -2104,6 +2590,7 @@ static int do_md_run(mddev_t * mddev)
2104 mddev->queue->make_request_fn = mddev->pers->make_request; 2590 mddev->queue->make_request_fn = mddev->pers->make_request;
2105 2591
2106 mddev->changed = 1; 2592 mddev->changed = 1;
2593 md_new_event(mddev);
2107 return 0; 2594 return 0;
2108} 2595}
2109 2596
@@ -2231,6 +2718,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2231 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2718 printk(KERN_INFO "md: %s switched to read-only mode.\n",
2232 mdname(mddev)); 2719 mdname(mddev));
2233 err = 0; 2720 err = 0;
2721 md_new_event(mddev);
2234out: 2722out:
2235 return err; 2723 return err;
2236} 2724}
@@ -2668,12 +3156,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2668 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3156 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
2669 set_bit(WriteMostly, &rdev->flags); 3157 set_bit(WriteMostly, &rdev->flags);
2670 3158
2671 err = bind_rdev_to_array(rdev, mddev);
2672 if (err) {
2673 export_rdev(rdev);
2674 return err;
2675 }
2676
2677 if (!mddev->persistent) { 3159 if (!mddev->persistent) {
2678 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3160 printk(KERN_INFO "md: nonpersistent superblock ...\n");
2679 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3161 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -2681,8 +3163,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2681 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3163 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2682 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3164 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
2683 3165
2684 if (!mddev->size || (mddev->size > rdev->size)) 3166 err = bind_rdev_to_array(rdev, mddev);
2685 mddev->size = rdev->size; 3167 if (err) {
3168 export_rdev(rdev);
3169 return err;
3170 }
2686 } 3171 }
2687 3172
2688 return 0; 3173 return 0;
@@ -2705,6 +3190,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
2705 3190
2706 kick_rdev_from_array(rdev); 3191 kick_rdev_from_array(rdev);
2707 md_update_sb(mddev); 3192 md_update_sb(mddev);
3193 md_new_event(mddev);
2708 3194
2709 return 0; 3195 return 0;
2710busy: 3196busy:
@@ -2753,15 +3239,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
2753 size = calc_dev_size(rdev, mddev->chunk_size); 3239 size = calc_dev_size(rdev, mddev->chunk_size);
2754 rdev->size = size; 3240 rdev->size = size;
2755 3241
2756 if (size < mddev->size) {
2757 printk(KERN_WARNING
2758 "%s: disk size %llu blocks < array size %llu\n",
2759 mdname(mddev), (unsigned long long)size,
2760 (unsigned long long)mddev->size);
2761 err = -ENOSPC;
2762 goto abort_export;
2763 }
2764
2765 if (test_bit(Faulty, &rdev->flags)) { 3242 if (test_bit(Faulty, &rdev->flags)) {
2766 printk(KERN_WARNING 3243 printk(KERN_WARNING
2767 "md: can not hot-add faulty %s disk to %s!\n", 3244 "md: can not hot-add faulty %s disk to %s!\n",
@@ -2771,7 +3248,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
2771 } 3248 }
2772 clear_bit(In_sync, &rdev->flags); 3249 clear_bit(In_sync, &rdev->flags);
2773 rdev->desc_nr = -1; 3250 rdev->desc_nr = -1;
2774 bind_rdev_to_array(rdev, mddev); 3251 err = bind_rdev_to_array(rdev, mddev);
3252 if (err)
3253 goto abort_export;
2775 3254
2776 /* 3255 /*
2777 * The rest should better be atomic, we can have disk failures 3256 * The rest should better be atomic, we can have disk failures
@@ -2795,7 +3274,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
2795 */ 3274 */
2796 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3275 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2797 md_wakeup_thread(mddev->thread); 3276 md_wakeup_thread(mddev->thread);
2798 3277 md_new_event(mddev);
2799 return 0; 3278 return 0;
2800 3279
2801abort_unbind_export: 3280abort_unbind_export:
@@ -2942,6 +3421,81 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2942 return 0; 3421 return 0;
2943} 3422}
2944 3423
3424static int update_size(mddev_t *mddev, unsigned long size)
3425{
3426 mdk_rdev_t * rdev;
3427 int rv;
3428 struct list_head *tmp;
3429
3430 if (mddev->pers->resize == NULL)
3431 return -EINVAL;
3432 /* The "size" is the amount of each device that is used.
3433 * This can only make sense for arrays with redundancy.
3434 * linear and raid0 always use whatever space is available
3435 * We can only consider changing the size if no resync
3436 * or reconstruction is happening, and if the new size
3437 * is acceptable. It must fit before the sb_offset or,
3438 * if that is <data_offset, it must fit before the
3439 * size of each device.
3440 * If size is zero, we find the largest size that fits.
3441 */
3442 if (mddev->sync_thread)
3443 return -EBUSY;
3444 ITERATE_RDEV(mddev,rdev,tmp) {
3445 sector_t avail;
3446 int fit = (size == 0);
3447 if (rdev->sb_offset > rdev->data_offset)
3448 avail = (rdev->sb_offset*2) - rdev->data_offset;
3449 else
3450 avail = get_capacity(rdev->bdev->bd_disk)
3451 - rdev->data_offset;
3452 if (fit && (size == 0 || size > avail/2))
3453 size = avail/2;
3454 if (avail < ((sector_t)size << 1))
3455 return -ENOSPC;
3456 }
3457 rv = mddev->pers->resize(mddev, (sector_t)size *2);
3458 if (!rv) {
3459 struct block_device *bdev;
3460
3461 bdev = bdget_disk(mddev->gendisk, 0);
3462 if (bdev) {
3463 down(&bdev->bd_inode->i_sem);
3464 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3465 up(&bdev->bd_inode->i_sem);
3466 bdput(bdev);
3467 }
3468 }
3469 return rv;
3470}
3471
3472static int update_raid_disks(mddev_t *mddev, int raid_disks)
3473{
3474 int rv;
3475 /* change the number of raid disks */
3476 if (mddev->pers->reshape == NULL)
3477 return -EINVAL;
3478 if (raid_disks <= 0 ||
3479 raid_disks >= mddev->max_disks)
3480 return -EINVAL;
3481 if (mddev->sync_thread)
3482 return -EBUSY;
3483 rv = mddev->pers->reshape(mddev, raid_disks);
3484 if (!rv) {
3485 struct block_device *bdev;
3486
3487 bdev = bdget_disk(mddev->gendisk, 0);
3488 if (bdev) {
3489 down(&bdev->bd_inode->i_sem);
3490 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3491 up(&bdev->bd_inode->i_sem);
3492 bdput(bdev);
3493 }
3494 }
3495 return rv;
3496}
3497
3498
2945/* 3499/*
2946 * update_array_info is used to change the configuration of an 3500 * update_array_info is used to change the configuration of an
2947 * on-line array. 3501 * on-line array.
@@ -2990,71 +3544,12 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2990 else 3544 else
2991 return mddev->pers->reconfig(mddev, info->layout, -1); 3545 return mddev->pers->reconfig(mddev, info->layout, -1);
2992 } 3546 }
2993 if (mddev->size != info->size) { 3547 if (mddev->size != info->size)
2994 mdk_rdev_t * rdev; 3548 rv = update_size(mddev, info->size);
2995 struct list_head *tmp; 3549
2996 if (mddev->pers->resize == NULL) 3550 if (mddev->raid_disks != info->raid_disks)
2997 return -EINVAL; 3551 rv = update_raid_disks(mddev, info->raid_disks);
2998 /* The "size" is the amount of each device that is used. 3552
2999 * This can only make sense for arrays with redundancy.
3000 * linear and raid0 always use whatever space is available
3001 * We can only consider changing the size if no resync
3002 * or reconstruction is happening, and if the new size
3003 * is acceptable. It must fit before the sb_offset or,
3004 * if that is <data_offset, it must fit before the
3005 * size of each device.
3006 * If size is zero, we find the largest size that fits.
3007 */
3008 if (mddev->sync_thread)
3009 return -EBUSY;
3010 ITERATE_RDEV(mddev,rdev,tmp) {
3011 sector_t avail;
3012 int fit = (info->size == 0);
3013 if (rdev->sb_offset > rdev->data_offset)
3014 avail = (rdev->sb_offset*2) - rdev->data_offset;
3015 else
3016 avail = get_capacity(rdev->bdev->bd_disk)
3017 - rdev->data_offset;
3018 if (fit && (info->size == 0 || info->size > avail/2))
3019 info->size = avail/2;
3020 if (avail < ((sector_t)info->size << 1))
3021 return -ENOSPC;
3022 }
3023 rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
3024 if (!rv) {
3025 struct block_device *bdev;
3026
3027 bdev = bdget_disk(mddev->gendisk, 0);
3028 if (bdev) {
3029 down(&bdev->bd_inode->i_sem);
3030 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3031 up(&bdev->bd_inode->i_sem);
3032 bdput(bdev);
3033 }
3034 }
3035 }
3036 if (mddev->raid_disks != info->raid_disks) {
3037 /* change the number of raid disks */
3038 if (mddev->pers->reshape == NULL)
3039 return -EINVAL;
3040 if (info->raid_disks <= 0 ||
3041 info->raid_disks >= mddev->max_disks)
3042 return -EINVAL;
3043 if (mddev->sync_thread)
3044 return -EBUSY;
3045 rv = mddev->pers->reshape(mddev, info->raid_disks);
3046 if (!rv) {
3047 struct block_device *bdev;
3048
3049 bdev = bdget_disk(mddev->gendisk, 0);
3050 if (bdev) {
3051 down(&bdev->bd_inode->i_sem);
3052 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3053 up(&bdev->bd_inode->i_sem);
3054 bdput(bdev);
3055 }
3056 }
3057 }
3058 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 3553 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
3059 if (mddev->pers->quiesce == NULL) 3554 if (mddev->pers->quiesce == NULL)
3060 return -EINVAL; 3555 return -EINVAL;
@@ -3476,11 +3971,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
3476{ 3971{
3477 mdk_thread_t *thread; 3972 mdk_thread_t *thread;
3478 3973
3479 thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3974 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
3480 if (!thread) 3975 if (!thread)
3481 return NULL; 3976 return NULL;
3482 3977
3483 memset(thread, 0, sizeof(mdk_thread_t));
3484 init_waitqueue_head(&thread->wqueue); 3978 init_waitqueue_head(&thread->wqueue);
3485 3979
3486 thread->run = run; 3980 thread->run = run;
@@ -3524,6 +4018,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
3524 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4018 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3525 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4019 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3526 md_wakeup_thread(mddev->thread); 4020 md_wakeup_thread(mddev->thread);
4021 md_new_event(mddev);
3527} 4022}
3528 4023
3529/* seq_file implementation /proc/mdstat */ 4024/* seq_file implementation /proc/mdstat */
@@ -3664,24 +4159,29 @@ static void md_seq_stop(struct seq_file *seq, void *v)
3664 mddev_put(mddev); 4159 mddev_put(mddev);
3665} 4160}
3666 4161
4162struct mdstat_info {
4163 int event;
4164};
4165
3667static int md_seq_show(struct seq_file *seq, void *v) 4166static int md_seq_show(struct seq_file *seq, void *v)
3668{ 4167{
3669 mddev_t *mddev = v; 4168 mddev_t *mddev = v;
3670 sector_t size; 4169 sector_t size;
3671 struct list_head *tmp2; 4170 struct list_head *tmp2;
3672 mdk_rdev_t *rdev; 4171 mdk_rdev_t *rdev;
3673 int i; 4172 struct mdstat_info *mi = seq->private;
3674 struct bitmap *bitmap; 4173 struct bitmap *bitmap;
3675 4174
3676 if (v == (void*)1) { 4175 if (v == (void*)1) {
4176 struct mdk_personality *pers;
3677 seq_printf(seq, "Personalities : "); 4177 seq_printf(seq, "Personalities : ");
3678 spin_lock(&pers_lock); 4178 spin_lock(&pers_lock);
3679 for (i = 0; i < MAX_PERSONALITY; i++) 4179 list_for_each_entry(pers, &pers_list, list)
3680 if (pers[i]) 4180 seq_printf(seq, "[%s] ", pers->name);
3681 seq_printf(seq, "[%s] ", pers[i]->name);
3682 4181
3683 spin_unlock(&pers_lock); 4182 spin_unlock(&pers_lock);
3684 seq_printf(seq, "\n"); 4183 seq_printf(seq, "\n");
4184 mi->event = atomic_read(&md_event_count);
3685 return 0; 4185 return 0;
3686 } 4186 }
3687 if (v == (void*)2) { 4187 if (v == (void*)2) {
@@ -3790,47 +4290,68 @@ static struct seq_operations md_seq_ops = {
3790static int md_seq_open(struct inode *inode, struct file *file) 4290static int md_seq_open(struct inode *inode, struct file *file)
3791{ 4291{
3792 int error; 4292 int error;
4293 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
4294 if (mi == NULL)
4295 return -ENOMEM;
3793 4296
3794 error = seq_open(file, &md_seq_ops); 4297 error = seq_open(file, &md_seq_ops);
4298 if (error)
4299 kfree(mi);
4300 else {
4301 struct seq_file *p = file->private_data;
4302 p->private = mi;
4303 mi->event = atomic_read(&md_event_count);
4304 }
3795 return error; 4305 return error;
3796} 4306}
3797 4307
4308static int md_seq_release(struct inode *inode, struct file *file)
4309{
4310 struct seq_file *m = file->private_data;
4311 struct mdstat_info *mi = m->private;
4312 m->private = NULL;
4313 kfree(mi);
4314 return seq_release(inode, file);
4315}
4316
4317static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4318{
4319 struct seq_file *m = filp->private_data;
4320 struct mdstat_info *mi = m->private;
4321 int mask;
4322
4323 poll_wait(filp, &md_event_waiters, wait);
4324
4325 /* always allow read */
4326 mask = POLLIN | POLLRDNORM;
4327
4328 if (mi->event != atomic_read(&md_event_count))
4329 mask |= POLLERR | POLLPRI;
4330 return mask;
4331}
4332
3798static struct file_operations md_seq_fops = { 4333static struct file_operations md_seq_fops = {
3799 .open = md_seq_open, 4334 .open = md_seq_open,
3800 .read = seq_read, 4335 .read = seq_read,
3801 .llseek = seq_lseek, 4336 .llseek = seq_lseek,
3802 .release = seq_release, 4337 .release = md_seq_release,
4338 .poll = mdstat_poll,
3803}; 4339};
3804 4340
3805int register_md_personality(int pnum, mdk_personality_t *p) 4341int register_md_personality(struct mdk_personality *p)
3806{ 4342{
3807 if (pnum >= MAX_PERSONALITY) {
3808 printk(KERN_ERR
3809 "md: tried to install personality %s as nr %d, but max is %lu\n",
3810 p->name, pnum, MAX_PERSONALITY-1);
3811 return -EINVAL;
3812 }
3813
3814 spin_lock(&pers_lock); 4343 spin_lock(&pers_lock);
3815 if (pers[pnum]) { 4344 list_add_tail(&p->list, &pers_list);
3816 spin_unlock(&pers_lock); 4345 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
3817 return -EBUSY;
3818 }
3819
3820 pers[pnum] = p;
3821 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3822 spin_unlock(&pers_lock); 4346 spin_unlock(&pers_lock);
3823 return 0; 4347 return 0;
3824} 4348}
3825 4349
3826int unregister_md_personality(int pnum) 4350int unregister_md_personality(struct mdk_personality *p)
3827{ 4351{
3828 if (pnum >= MAX_PERSONALITY) 4352 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
3829 return -EINVAL;
3830
3831 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3832 spin_lock(&pers_lock); 4353 spin_lock(&pers_lock);
3833 pers[pnum] = NULL; 4354 list_del_init(&p->list);
3834 spin_unlock(&pers_lock); 4355 spin_unlock(&pers_lock);
3835 return 0; 4356 return 0;
3836} 4357}
@@ -4012,10 +4533,10 @@ static void md_do_sync(mddev_t *mddev)
4012 4533
4013 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 4534 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
4014 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 4535 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
4015 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 4536 " %d KB/sec/disc.\n", speed_min(mddev));
4016 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 4537 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
4017 "(but not more than %d KB/sec) for reconstruction.\n", 4538 "(but not more than %d KB/sec) for reconstruction.\n",
4018 sysctl_speed_limit_max); 4539 speed_max(mddev));
4019 4540
4020 is_mddev_idle(mddev); /* this also initializes IO event counters */ 4541 is_mddev_idle(mddev); /* this also initializes IO event counters */
4021 /* we don't use the checkpoint if there's a bitmap */ 4542 /* we don't use the checkpoint if there's a bitmap */
@@ -4056,7 +4577,7 @@ static void md_do_sync(mddev_t *mddev)
4056 4577
4057 skipped = 0; 4578 skipped = 0;
4058 sectors = mddev->pers->sync_request(mddev, j, &skipped, 4579 sectors = mddev->pers->sync_request(mddev, j, &skipped,
4059 currspeed < sysctl_speed_limit_min); 4580 currspeed < speed_min(mddev));
4060 if (sectors == 0) { 4581 if (sectors == 0) {
4061 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4582 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
4062 goto out; 4583 goto out;
@@ -4069,7 +4590,11 @@ static void md_do_sync(mddev_t *mddev)
4069 4590
4070 j += sectors; 4591 j += sectors;
4071 if (j>1) mddev->curr_resync = j; 4592 if (j>1) mddev->curr_resync = j;
4072 4593 if (last_check == 0)
4594 /* this is the earliest that rebuild will be
4595 * visible in /proc/mdstat
4596 */
4597 md_new_event(mddev);
4073 4598
4074 if (last_check + window > io_sectors || j == max_sectors) 4599 if (last_check + window > io_sectors || j == max_sectors)
4075 continue; 4600 continue;
@@ -4117,8 +4642,8 @@ static void md_do_sync(mddev_t *mddev)
4117 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 4642 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
4118 /((jiffies-mddev->resync_mark)/HZ +1) +1; 4643 /((jiffies-mddev->resync_mark)/HZ +1) +1;
4119 4644
4120 if (currspeed > sysctl_speed_limit_min) { 4645 if (currspeed > speed_min(mddev)) {
4121 if ((currspeed > sysctl_speed_limit_max) || 4646 if ((currspeed > speed_max(mddev)) ||
4122 !is_mddev_idle(mddev)) { 4647 !is_mddev_idle(mddev)) {
4123 msleep(500); 4648 msleep(500);
4124 goto repeat; 4649 goto repeat;
@@ -4255,6 +4780,7 @@ void md_check_recovery(mddev_t *mddev)
4255 mddev->recovery = 0; 4780 mddev->recovery = 0;
4256 /* flag recovery needed just to double check */ 4781 /* flag recovery needed just to double check */
4257 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4782 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4783 md_new_event(mddev);
4258 goto unlock; 4784 goto unlock;
4259 } 4785 }
4260 /* Clear some bits that don't mean anything, but 4786 /* Clear some bits that don't mean anything, but
@@ -4292,6 +4818,7 @@ void md_check_recovery(mddev_t *mddev)
4292 sprintf(nm, "rd%d", rdev->raid_disk); 4818 sprintf(nm, "rd%d", rdev->raid_disk);
4293 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 4819 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
4294 spares++; 4820 spares++;
4821 md_new_event(mddev);
4295 } else 4822 } else
4296 break; 4823 break;
4297 } 4824 }
@@ -4324,9 +4851,9 @@ void md_check_recovery(mddev_t *mddev)
4324 mdname(mddev)); 4851 mdname(mddev));
4325 /* leave the spares where they are, it shouldn't hurt */ 4852 /* leave the spares where they are, it shouldn't hurt */
4326 mddev->recovery = 0; 4853 mddev->recovery = 0;
4327 } else { 4854 } else
4328 md_wakeup_thread(mddev->sync_thread); 4855 md_wakeup_thread(mddev->sync_thread);
4329 } 4856 md_new_event(mddev);
4330 } 4857 }
4331 unlock: 4858 unlock:
4332 mddev_unlock(mddev); 4859 mddev_unlock(mddev);
@@ -4503,12 +5030,14 @@ static int set_ro(const char *val, struct kernel_param *kp)
4503 int num = simple_strtoul(val, &e, 10); 5030 int num = simple_strtoul(val, &e, 10);
4504 if (*val && (*e == '\0' || *e == '\n')) { 5031 if (*val && (*e == '\0' || *e == '\n')) {
4505 start_readonly = num; 5032 start_readonly = num;
4506 return 0;; 5033 return 0;
4507 } 5034 }
4508 return -EINVAL; 5035 return -EINVAL;
4509} 5036}
4510 5037
4511module_param_call(start_ro, set_ro, get_ro, NULL, 0600); 5038module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
5039module_param(start_dirty_degraded, int, 0644);
5040
4512 5041
4513EXPORT_SYMBOL(register_md_personality); 5042EXPORT_SYMBOL(register_md_personality);
4514EXPORT_SYMBOL(unregister_md_personality); 5043EXPORT_SYMBOL(unregister_md_personality);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 145cdc5ad00..e6aa309a66d 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -35,15 +35,10 @@
35#define NR_RESERVED_BUFS 32 35#define NR_RESERVED_BUFS 32
36 36
37 37
38static mdk_personality_t multipath_personality;
39
40
41static void *mp_pool_alloc(gfp_t gfp_flags, void *data) 38static void *mp_pool_alloc(gfp_t gfp_flags, void *data)
42{ 39{
43 struct multipath_bh *mpb; 40 struct multipath_bh *mpb;
44 mpb = kmalloc(sizeof(*mpb), gfp_flags); 41 mpb = kzalloc(sizeof(*mpb), gfp_flags);
45 if (mpb)
46 memset(mpb, 0, sizeof(*mpb));
47 return mpb; 42 return mpb;
48} 43}
49 44
@@ -444,7 +439,7 @@ static int multipath_run (mddev_t *mddev)
444 * should be freed in multipath_stop()] 439 * should be freed in multipath_stop()]
445 */ 440 */
446 441
447 conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); 442 conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
448 mddev->private = conf; 443 mddev->private = conf;
449 if (!conf) { 444 if (!conf) {
450 printk(KERN_ERR 445 printk(KERN_ERR
@@ -452,9 +447,8 @@ static int multipath_run (mddev_t *mddev)
452 mdname(mddev)); 447 mdname(mddev));
453 goto out; 448 goto out;
454 } 449 }
455 memset(conf, 0, sizeof(*conf));
456 450
457 conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, 451 conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
458 GFP_KERNEL); 452 GFP_KERNEL);
459 if (!conf->multipaths) { 453 if (!conf->multipaths) {
460 printk(KERN_ERR 454 printk(KERN_ERR
@@ -462,7 +456,6 @@ static int multipath_run (mddev_t *mddev)
462 mdname(mddev)); 456 mdname(mddev));
463 goto out_free_conf; 457 goto out_free_conf;
464 } 458 }
465 memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
466 459
467 conf->working_disks = 0; 460 conf->working_disks = 0;
468 ITERATE_RDEV(mddev,rdev,tmp) { 461 ITERATE_RDEV(mddev,rdev,tmp) {
@@ -557,9 +550,10 @@ static int multipath_stop (mddev_t *mddev)
557 return 0; 550 return 0;
558} 551}
559 552
560static mdk_personality_t multipath_personality= 553static struct mdk_personality multipath_personality =
561{ 554{
562 .name = "multipath", 555 .name = "multipath",
556 .level = LEVEL_MULTIPATH,
563 .owner = THIS_MODULE, 557 .owner = THIS_MODULE,
564 .make_request = multipath_make_request, 558 .make_request = multipath_make_request,
565 .run = multipath_run, 559 .run = multipath_run,
@@ -572,15 +566,17 @@ static mdk_personality_t multipath_personality=
572 566
573static int __init multipath_init (void) 567static int __init multipath_init (void)
574{ 568{
575 return register_md_personality (MULTIPATH, &multipath_personality); 569 return register_md_personality (&multipath_personality);
576} 570}
577 571
578static void __exit multipath_exit (void) 572static void __exit multipath_exit (void)
579{ 573{
580 unregister_md_personality (MULTIPATH); 574 unregister_md_personality (&multipath_personality);
581} 575}
582 576
583module_init(multipath_init); 577module_init(multipath_init);
584module_exit(multipath_exit); 578module_exit(multipath_exit);
585MODULE_LICENSE("GPL"); 579MODULE_LICENSE("GPL");
586MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ 580MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
581MODULE_ALIAS("md-multipath");
582MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index fece3277c2a..abbca150202 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -113,21 +113,16 @@ static int create_strip_zones (mddev_t *mddev)
113 } 113 }
114 printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); 114 printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
115 115
116 conf->strip_zone = kmalloc(sizeof(struct strip_zone)* 116 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
117 conf->nr_strip_zones, GFP_KERNEL); 117 conf->nr_strip_zones, GFP_KERNEL);
118 if (!conf->strip_zone) 118 if (!conf->strip_zone)
119 return 1; 119 return 1;
120 conf->devlist = kmalloc(sizeof(mdk_rdev_t*)* 120 conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
121 conf->nr_strip_zones*mddev->raid_disks, 121 conf->nr_strip_zones*mddev->raid_disks,
122 GFP_KERNEL); 122 GFP_KERNEL);
123 if (!conf->devlist) 123 if (!conf->devlist)
124 return 1; 124 return 1;
125 125
126 memset(conf->strip_zone, 0,sizeof(struct strip_zone)*
127 conf->nr_strip_zones);
128 memset(conf->devlist, 0,
129 sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks);
130
131 /* The first zone must contain all devices, so here we check that 126 /* The first zone must contain all devices, so here we check that
132 * there is a proper alignment of slots to devices and find them all 127 * there is a proper alignment of slots to devices and find them all
133 */ 128 */
@@ -280,7 +275,11 @@ static int raid0_run (mddev_t *mddev)
280 mdk_rdev_t *rdev; 275 mdk_rdev_t *rdev;
281 struct list_head *tmp; 276 struct list_head *tmp;
282 277
283 printk("%s: setting max_sectors to %d, segment boundary to %d\n", 278 if (mddev->chunk_size == 0) {
279 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
280 return -EINVAL;
281 }
282 printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
284 mdname(mddev), 283 mdname(mddev),
285 mddev->chunk_size >> 9, 284 mddev->chunk_size >> 9,
286 (mddev->chunk_size>>1)-1); 285 (mddev->chunk_size>>1)-1);
@@ -361,7 +360,7 @@ static int raid0_run (mddev_t *mddev)
361 * chunksize should be used in that case. 360 * chunksize should be used in that case.
362 */ 361 */
363 { 362 {
364 int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; 363 int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
365 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 364 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
366 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 365 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
367 } 366 }
@@ -512,9 +511,10 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
512 return; 511 return;
513} 512}
514 513
515static mdk_personality_t raid0_personality= 514static struct mdk_personality raid0_personality=
516{ 515{
517 .name = "raid0", 516 .name = "raid0",
517 .level = 0,
518 .owner = THIS_MODULE, 518 .owner = THIS_MODULE,
519 .make_request = raid0_make_request, 519 .make_request = raid0_make_request,
520 .run = raid0_run, 520 .run = raid0_run,
@@ -524,15 +524,17 @@ static mdk_personality_t raid0_personality=
524 524
525static int __init raid0_init (void) 525static int __init raid0_init (void)
526{ 526{
527 return register_md_personality (RAID0, &raid0_personality); 527 return register_md_personality (&raid0_personality);
528} 528}
529 529
530static void raid0_exit (void) 530static void raid0_exit (void)
531{ 531{
532 unregister_md_personality (RAID0); 532 unregister_md_personality (&raid0_personality);
533} 533}
534 534
535module_init(raid0_init); 535module_init(raid0_init);
536module_exit(raid0_exit); 536module_exit(raid0_exit);
537MODULE_LICENSE("GPL"); 537MODULE_LICENSE("GPL");
538MODULE_ALIAS("md-personality-2"); /* RAID0 */ 538MODULE_ALIAS("md-personality-2"); /* RAID0 */
539MODULE_ALIAS("md-raid0");
540MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 229d7b20429..a06ff91f27e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -47,10 +47,11 @@
47 */ 47 */
48#define NR_RAID1_BIOS 256 48#define NR_RAID1_BIOS 256
49 49
50static mdk_personality_t raid1_personality;
51 50
52static void unplug_slaves(mddev_t *mddev); 51static void unplug_slaves(mddev_t *mddev);
53 52
53static void allow_barrier(conf_t *conf);
54static void lower_barrier(conf_t *conf);
54 55
55static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 56static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
56{ 57{
@@ -59,10 +60,8 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
59 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 60 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
60 61
61 /* allocate a r1bio with room for raid_disks entries in the bios array */ 62 /* allocate a r1bio with room for raid_disks entries in the bios array */
62 r1_bio = kmalloc(size, gfp_flags); 63 r1_bio = kzalloc(size, gfp_flags);
63 if (r1_bio) 64 if (!r1_bio)
64 memset(r1_bio, 0, size);
65 else
66 unplug_slaves(pi->mddev); 65 unplug_slaves(pi->mddev);
67 66
68 return r1_bio; 67 return r1_bio;
@@ -104,15 +103,30 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
104 } 103 }
105 /* 104 /*
106 * Allocate RESYNC_PAGES data pages and attach them to 105 * Allocate RESYNC_PAGES data pages and attach them to
107 * the first bio; 106 * the first bio.
107 * If this is a user-requested check/repair, allocate
108 * RESYNC_PAGES for each bio.
108 */ 109 */
109 bio = r1_bio->bios[0]; 110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
110 for (i = 0; i < RESYNC_PAGES; i++) { 111 j = pi->raid_disks;
111 page = alloc_page(gfp_flags); 112 else
112 if (unlikely(!page)) 113 j = 1;
113 goto out_free_pages; 114 while(j--) {
114 115 bio = r1_bio->bios[j];
115 bio->bi_io_vec[i].bv_page = page; 116 for (i = 0; i < RESYNC_PAGES; i++) {
117 page = alloc_page(gfp_flags);
118 if (unlikely(!page))
119 goto out_free_pages;
120
121 bio->bi_io_vec[i].bv_page = page;
122 }
123 }
124 /* If not user-requested, copy the page pointers to all bios */
125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
126 for (i=0; i<RESYNC_PAGES ; i++)
127 for (j=1; j<pi->raid_disks; j++)
128 r1_bio->bios[j]->bi_io_vec[i].bv_page =
129 r1_bio->bios[0]->bi_io_vec[i].bv_page;
116 } 130 }
117 131
118 r1_bio->master_bio = NULL; 132 r1_bio->master_bio = NULL;
@@ -120,8 +134,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
120 return r1_bio; 134 return r1_bio;
121 135
122out_free_pages: 136out_free_pages:
123 for ( ; i > 0 ; i--) 137 for (i=0; i < RESYNC_PAGES ; i++)
124 __free_page(bio->bi_io_vec[i-1].bv_page); 138 for (j=0 ; j < pi->raid_disks; j++)
139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
140 j = -1;
125out_free_bio: 141out_free_bio:
126 while ( ++j < pi->raid_disks ) 142 while ( ++j < pi->raid_disks )
127 bio_put(r1_bio->bios[j]); 143 bio_put(r1_bio->bios[j]);
@@ -132,14 +148,16 @@ out_free_bio:
132static void r1buf_pool_free(void *__r1_bio, void *data) 148static void r1buf_pool_free(void *__r1_bio, void *data)
133{ 149{
134 struct pool_info *pi = data; 150 struct pool_info *pi = data;
135 int i; 151 int i,j;
136 r1bio_t *r1bio = __r1_bio; 152 r1bio_t *r1bio = __r1_bio;
137 struct bio *bio = r1bio->bios[0];
138 153
139 for (i = 0; i < RESYNC_PAGES; i++) { 154 for (i = 0; i < RESYNC_PAGES; i++)
140 __free_page(bio->bi_io_vec[i].bv_page); 155 for (j = pi->raid_disks; j-- ;) {
141 bio->bi_io_vec[i].bv_page = NULL; 156 if (j == 0 ||
142 } 157 r1bio->bios[j]->bi_io_vec[i].bv_page !=
158 r1bio->bios[0]->bi_io_vec[i].bv_page)
159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
160 }
143 for (i=0 ; i < pi->raid_disks; i++) 161 for (i=0 ; i < pi->raid_disks; i++)
144 bio_put(r1bio->bios[i]); 162 bio_put(r1bio->bios[i]);
145 163
@@ -152,7 +170,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
152 170
153 for (i = 0; i < conf->raid_disks; i++) { 171 for (i = 0; i < conf->raid_disks; i++) {
154 struct bio **bio = r1_bio->bios + i; 172 struct bio **bio = r1_bio->bios + i;
155 if (*bio) 173 if (*bio && *bio != IO_BLOCKED)
156 bio_put(*bio); 174 bio_put(*bio);
157 *bio = NULL; 175 *bio = NULL;
158 } 176 }
@@ -160,20 +178,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
160 178
161static inline void free_r1bio(r1bio_t *r1_bio) 179static inline void free_r1bio(r1bio_t *r1_bio)
162{ 180{
163 unsigned long flags;
164
165 conf_t *conf = mddev_to_conf(r1_bio->mddev); 181 conf_t *conf = mddev_to_conf(r1_bio->mddev);
166 182
167 /* 183 /*
168 * Wake up any possible resync thread that waits for the device 184 * Wake up any possible resync thread that waits for the device
169 * to go idle. 185 * to go idle.
170 */ 186 */
171 spin_lock_irqsave(&conf->resync_lock, flags); 187 allow_barrier(conf);
172 if (!--conf->nr_pending) {
173 wake_up(&conf->wait_idle);
174 wake_up(&conf->wait_resume);
175 }
176 spin_unlock_irqrestore(&conf->resync_lock, flags);
177 188
178 put_all_bios(conf, r1_bio); 189 put_all_bios(conf, r1_bio);
179 mempool_free(r1_bio, conf->r1bio_pool); 190 mempool_free(r1_bio, conf->r1bio_pool);
@@ -182,22 +193,17 @@ static inline void free_r1bio(r1bio_t *r1_bio)
182static inline void put_buf(r1bio_t *r1_bio) 193static inline void put_buf(r1bio_t *r1_bio)
183{ 194{
184 conf_t *conf = mddev_to_conf(r1_bio->mddev); 195 conf_t *conf = mddev_to_conf(r1_bio->mddev);
185 unsigned long flags; 196 int i;
186 197
187 mempool_free(r1_bio, conf->r1buf_pool); 198 for (i=0; i<conf->raid_disks; i++) {
199 struct bio *bio = r1_bio->bios[i];
200 if (bio->bi_end_io)
201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
202 }
188 203
189 spin_lock_irqsave(&conf->resync_lock, flags); 204 mempool_free(r1_bio, conf->r1buf_pool);
190 if (!conf->barrier)
191 BUG();
192 --conf->barrier;
193 wake_up(&conf->wait_resume);
194 wake_up(&conf->wait_idle);
195 205
196 if (!--conf->nr_pending) { 206 lower_barrier(conf);
197 wake_up(&conf->wait_idle);
198 wake_up(&conf->wait_resume);
199 }
200 spin_unlock_irqrestore(&conf->resync_lock, flags);
201} 207}
202 208
203static void reschedule_retry(r1bio_t *r1_bio) 209static void reschedule_retry(r1bio_t *r1_bio)
@@ -208,8 +214,10 @@ static void reschedule_retry(r1bio_t *r1_bio)
208 214
209 spin_lock_irqsave(&conf->device_lock, flags); 215 spin_lock_irqsave(&conf->device_lock, flags);
210 list_add(&r1_bio->retry_list, &conf->retry_list); 216 list_add(&r1_bio->retry_list, &conf->retry_list);
217 conf->nr_queued ++;
211 spin_unlock_irqrestore(&conf->device_lock, flags); 218 spin_unlock_irqrestore(&conf->device_lock, flags);
212 219
220 wake_up(&conf->wait_barrier);
213 md_wakeup_thread(mddev->thread); 221 md_wakeup_thread(mddev->thread);
214} 222}
215 223
@@ -261,9 +269,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
261 /* 269 /*
262 * this branch is our 'one mirror IO has finished' event handler: 270 * this branch is our 'one mirror IO has finished' event handler:
263 */ 271 */
264 if (!uptodate) 272 update_head_pos(mirror, r1_bio);
265 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 273
266 else 274 if (uptodate || conf->working_disks <= 1) {
267 /* 275 /*
268 * Set R1BIO_Uptodate in our master bio, so that 276 * Set R1BIO_Uptodate in our master bio, so that
269 * we will return a good error code for to the higher 277 * we will return a good error code for to the higher
@@ -273,16 +281,11 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
273 * user-side. So if something waits for IO, then it will 281 * user-side. So if something waits for IO, then it will
274 * wait for the 'master' bio. 282 * wait for the 'master' bio.
275 */ 283 */
276 set_bit(R1BIO_Uptodate, &r1_bio->state); 284 if (uptodate)
277 285 set_bit(R1BIO_Uptodate, &r1_bio->state);
278 update_head_pos(mirror, r1_bio);
279 286
280 /*
281 * we have only one bio on the read side
282 */
283 if (uptodate)
284 raid_end_bio_io(r1_bio); 287 raid_end_bio_io(r1_bio);
285 else { 288 } else {
286 /* 289 /*
287 * oops, read error: 290 * oops, read error:
288 */ 291 */
@@ -378,7 +381,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
378 /* free extra copy of the data pages */ 381 /* free extra copy of the data pages */
379 int i = bio->bi_vcnt; 382 int i = bio->bi_vcnt;
380 while (i--) 383 while (i--)
381 __free_page(bio->bi_io_vec[i].bv_page); 384 safe_put_page(bio->bi_io_vec[i].bv_page);
382 } 385 }
383 /* clear the bitmap if all writes complete successfully */ 386 /* clear the bitmap if all writes complete successfully */
384 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 387 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -433,11 +436,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
433 new_disk = 0; 436 new_disk = 0;
434 437
435 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 438 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
439 r1_bio->bios[new_disk] == IO_BLOCKED ||
436 !rdev || !test_bit(In_sync, &rdev->flags) 440 !rdev || !test_bit(In_sync, &rdev->flags)
437 || test_bit(WriteMostly, &rdev->flags); 441 || test_bit(WriteMostly, &rdev->flags);
438 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { 442 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
439 443
440 if (rdev && test_bit(In_sync, &rdev->flags)) 444 if (rdev && test_bit(In_sync, &rdev->flags) &&
445 r1_bio->bios[new_disk] != IO_BLOCKED)
441 wonly_disk = new_disk; 446 wonly_disk = new_disk;
442 447
443 if (new_disk == conf->raid_disks - 1) { 448 if (new_disk == conf->raid_disks - 1) {
@@ -451,11 +456,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
451 456
452 /* make sure the disk is operational */ 457 /* make sure the disk is operational */
453 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 458 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
459 r1_bio->bios[new_disk] == IO_BLOCKED ||
454 !rdev || !test_bit(In_sync, &rdev->flags) || 460 !rdev || !test_bit(In_sync, &rdev->flags) ||
455 test_bit(WriteMostly, &rdev->flags); 461 test_bit(WriteMostly, &rdev->flags);
456 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { 462 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
457 463
458 if (rdev && test_bit(In_sync, &rdev->flags)) 464 if (rdev && test_bit(In_sync, &rdev->flags) &&
465 r1_bio->bios[new_disk] != IO_BLOCKED)
459 wonly_disk = new_disk; 466 wonly_disk = new_disk;
460 467
461 if (new_disk <= 0) 468 if (new_disk <= 0)
@@ -492,7 +499,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
492 499
493 rdev = rcu_dereference(conf->mirrors[disk].rdev); 500 rdev = rcu_dereference(conf->mirrors[disk].rdev);
494 501
495 if (!rdev || 502 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
496 !test_bit(In_sync, &rdev->flags) || 503 !test_bit(In_sync, &rdev->flags) ||
497 test_bit(WriteMostly, &rdev->flags)) 504 test_bit(WriteMostly, &rdev->flags))
498 continue; 505 continue;
@@ -520,7 +527,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
520 /* cannot risk returning a device that failed 527 /* cannot risk returning a device that failed
521 * before we inc'ed nr_pending 528 * before we inc'ed nr_pending
522 */ 529 */
523 atomic_dec(&rdev->nr_pending); 530 rdev_dec_pending(rdev, conf->mddev);
524 goto retry; 531 goto retry;
525 } 532 }
526 conf->next_seq_sect = this_sector + sectors; 533 conf->next_seq_sect = this_sector + sectors;
@@ -593,42 +600,119 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
593 return ret; 600 return ret;
594} 601}
595 602
596/* 603/* Barriers....
597 * Throttle resync depth, so that we can both get proper overlapping of 604 * Sometimes we need to suspend IO while we do something else,
598 * requests, but are still able to handle normal requests quickly. 605 * either some resync/recovery, or reconfigure the array.
606 * To do this we raise a 'barrier'.
607 * The 'barrier' is a counter that can be raised multiple times
608 * to count how many activities are happening which preclude
609 * normal IO.
610 * We can only raise the barrier if there is no pending IO.
611 * i.e. if nr_pending == 0.
612 * We choose only to raise the barrier if no-one is waiting for the
613 * barrier to go down. This means that as soon as an IO request
614 * is ready, no other operations which require a barrier will start
615 * until the IO request has had a chance.
616 *
617 * So: regular IO calls 'wait_barrier'. When that returns there
618 * is no background IO happening. It must arrange to call
619 * allow_barrier when it has finished its IO.
620 * background IO calls must call raise_barrier. Once that returns
621 * there is no normal IO happening. It must arrange to call
622 * lower_barrier when the particular background IO completes.
599 */ 623 */
600#define RESYNC_DEPTH 32 624#define RESYNC_DEPTH 32
601 625
602static void device_barrier(conf_t *conf, sector_t sect) 626static void raise_barrier(conf_t *conf)
603{ 627{
604 spin_lock_irq(&conf->resync_lock); 628 spin_lock_irq(&conf->resync_lock);
605 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 629
606 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 630 /* Wait until no block IO is waiting */
607 631 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
608 if (!conf->barrier++) { 632 conf->resync_lock,
609 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 633 raid1_unplug(conf->mddev->queue));
610 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 634
611 if (conf->nr_pending) 635 /* block any new IO from starting */
612 BUG(); 636 conf->barrier++;
637
638 /* Now wait for all pending IO to complete */
639 wait_event_lock_irq(conf->wait_barrier,
640 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
641 conf->resync_lock,
642 raid1_unplug(conf->mddev->queue));
643
644 spin_unlock_irq(&conf->resync_lock);
645}
646
647static void lower_barrier(conf_t *conf)
648{
649 unsigned long flags;
650 spin_lock_irqsave(&conf->resync_lock, flags);
651 conf->barrier--;
652 spin_unlock_irqrestore(&conf->resync_lock, flags);
653 wake_up(&conf->wait_barrier);
654}
655
656static void wait_barrier(conf_t *conf)
657{
658 spin_lock_irq(&conf->resync_lock);
659 if (conf->barrier) {
660 conf->nr_waiting++;
661 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
662 conf->resync_lock,
663 raid1_unplug(conf->mddev->queue));
664 conf->nr_waiting--;
613 } 665 }
614 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 666 conf->nr_pending++;
615 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 667 spin_unlock_irq(&conf->resync_lock);
616 conf->next_resync = sect; 668}
669
670static void allow_barrier(conf_t *conf)
671{
672 unsigned long flags;
673 spin_lock_irqsave(&conf->resync_lock, flags);
674 conf->nr_pending--;
675 spin_unlock_irqrestore(&conf->resync_lock, flags);
676 wake_up(&conf->wait_barrier);
677}
678
679static void freeze_array(conf_t *conf)
680{
681 /* stop syncio and normal IO and wait for everything to
682 * go quiet.
683 * We increment barrier and nr_waiting, and then
684 * wait until barrier+nr_pending match nr_queued+2
685 */
686 spin_lock_irq(&conf->resync_lock);
687 conf->barrier++;
688 conf->nr_waiting++;
689 wait_event_lock_irq(conf->wait_barrier,
690 conf->barrier+conf->nr_pending == conf->nr_queued+2,
691 conf->resync_lock,
692 raid1_unplug(conf->mddev->queue));
693 spin_unlock_irq(&conf->resync_lock);
694}
695static void unfreeze_array(conf_t *conf)
696{
697 /* reverse the effect of the freeze */
698 spin_lock_irq(&conf->resync_lock);
699 conf->barrier--;
700 conf->nr_waiting--;
701 wake_up(&conf->wait_barrier);
617 spin_unlock_irq(&conf->resync_lock); 702 spin_unlock_irq(&conf->resync_lock);
618} 703}
619 704
705
620/* duplicate the data pages for behind I/O */ 706/* duplicate the data pages for behind I/O */
621static struct page **alloc_behind_pages(struct bio *bio) 707static struct page **alloc_behind_pages(struct bio *bio)
622{ 708{
623 int i; 709 int i;
624 struct bio_vec *bvec; 710 struct bio_vec *bvec;
625 struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), 711 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
626 GFP_NOIO); 712 GFP_NOIO);
627 if (unlikely(!pages)) 713 if (unlikely(!pages))
628 goto do_sync_io; 714 goto do_sync_io;
629 715
630 memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
631
632 bio_for_each_segment(bvec, bio, i) { 716 bio_for_each_segment(bvec, bio, i) {
633 pages[i] = alloc_page(GFP_NOIO); 717 pages[i] = alloc_page(GFP_NOIO);
634 if (unlikely(!pages[i])) 718 if (unlikely(!pages[i]))
@@ -644,7 +728,7 @@ static struct page **alloc_behind_pages(struct bio *bio)
644do_sync_io: 728do_sync_io:
645 if (pages) 729 if (pages)
646 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 730 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
647 __free_page(pages[i]); 731 put_page(pages[i]);
648 kfree(pages); 732 kfree(pages);
649 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 733 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
650 return NULL; 734 return NULL;
@@ -678,10 +762,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
678 */ 762 */
679 md_write_start(mddev, bio); /* wait on superblock update early */ 763 md_write_start(mddev, bio); /* wait on superblock update early */
680 764
681 spin_lock_irq(&conf->resync_lock); 765 wait_barrier(conf);
682 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
683 conf->nr_pending++;
684 spin_unlock_irq(&conf->resync_lock);
685 766
686 disk_stat_inc(mddev->gendisk, ios[rw]); 767 disk_stat_inc(mddev->gendisk, ios[rw]);
687 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 768 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -749,7 +830,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
749 !test_bit(Faulty, &rdev->flags)) { 830 !test_bit(Faulty, &rdev->flags)) {
750 atomic_inc(&rdev->nr_pending); 831 atomic_inc(&rdev->nr_pending);
751 if (test_bit(Faulty, &rdev->flags)) { 832 if (test_bit(Faulty, &rdev->flags)) {
752 atomic_dec(&rdev->nr_pending); 833 rdev_dec_pending(rdev, mddev);
753 r1_bio->bios[i] = NULL; 834 r1_bio->bios[i] = NULL;
754 } else 835 } else
755 r1_bio->bios[i] = bio; 836 r1_bio->bios[i] = bio;
@@ -909,13 +990,8 @@ static void print_conf(conf_t *conf)
909 990
910static void close_sync(conf_t *conf) 991static void close_sync(conf_t *conf)
911{ 992{
912 spin_lock_irq(&conf->resync_lock); 993 wait_barrier(conf);
913 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 994 allow_barrier(conf);
914 conf->resync_lock, raid1_unplug(conf->mddev->queue));
915 spin_unlock_irq(&conf->resync_lock);
916
917 if (conf->barrier) BUG();
918 if (waitqueue_active(&conf->wait_idle)) BUG();
919 995
920 mempool_destroy(conf->r1buf_pool); 996 mempool_destroy(conf->r1buf_pool);
921 conf->r1buf_pool = NULL; 997 conf->r1buf_pool = NULL;
@@ -1015,28 +1091,27 @@ abort:
1015 1091
1016static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1092static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1017{ 1093{
1018 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1019 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1094 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1020 conf_t *conf = mddev_to_conf(r1_bio->mddev); 1095 int i;
1021 1096
1022 if (bio->bi_size) 1097 if (bio->bi_size)
1023 return 1; 1098 return 1;
1024 1099
1025 if (r1_bio->bios[r1_bio->read_disk] != bio) 1100 for (i=r1_bio->mddev->raid_disks; i--; )
1026 BUG(); 1101 if (r1_bio->bios[i] == bio)
1027 update_head_pos(r1_bio->read_disk, r1_bio); 1102 break;
1103 BUG_ON(i < 0);
1104 update_head_pos(i, r1_bio);
1028 /* 1105 /*
1029 * we have read a block, now it needs to be re-written, 1106 * we have read a block, now it needs to be re-written,
1030 * or re-read if the read failed. 1107 * or re-read if the read failed.
1031 * We don't do much here, just schedule handling by raid1d 1108 * We don't do much here, just schedule handling by raid1d
1032 */ 1109 */
1033 if (!uptodate) { 1110 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1034 md_error(r1_bio->mddev,
1035 conf->mirrors[r1_bio->read_disk].rdev);
1036 } else
1037 set_bit(R1BIO_Uptodate, &r1_bio->state); 1111 set_bit(R1BIO_Uptodate, &r1_bio->state);
1038 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 1112
1039 reschedule_retry(r1_bio); 1113 if (atomic_dec_and_test(&r1_bio->remaining))
1114 reschedule_retry(r1_bio);
1040 return 0; 1115 return 0;
1041} 1116}
1042 1117
@@ -1066,7 +1141,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1066 md_done_sync(mddev, r1_bio->sectors, uptodate); 1141 md_done_sync(mddev, r1_bio->sectors, uptodate);
1067 put_buf(r1_bio); 1142 put_buf(r1_bio);
1068 } 1143 }
1069 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1070 return 0; 1144 return 0;
1071} 1145}
1072 1146
@@ -1079,34 +1153,173 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1079 1153
1080 bio = r1_bio->bios[r1_bio->read_disk]; 1154 bio = r1_bio->bios[r1_bio->read_disk];
1081 1155
1082/* 1156
1083 if (r1_bio->sector == 0) printk("First sync write startss\n"); 1157 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1084*/ 1158 /* We have read all readable devices. If we haven't
1085 /* 1159 * got the block, then there is no hope left.
1086 * schedule writes 1160 * If we have, then we want to do a comparison
1087 */ 1161 * and skip the write if everything is the same.
1162 * If any blocks failed to read, then we need to
1163 * attempt an over-write
1164 */
1165 int primary;
1166 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1167 for (i=0; i<mddev->raid_disks; i++)
1168 if (r1_bio->bios[i]->bi_end_io == end_sync_read)
1169 md_error(mddev, conf->mirrors[i].rdev);
1170
1171 md_done_sync(mddev, r1_bio->sectors, 1);
1172 put_buf(r1_bio);
1173 return;
1174 }
1175 for (primary=0; primary<mddev->raid_disks; primary++)
1176 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1177 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1178 r1_bio->bios[primary]->bi_end_io = NULL;
1179 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1180 break;
1181 }
1182 r1_bio->read_disk = primary;
1183 for (i=0; i<mddev->raid_disks; i++)
1184 if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
1185 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
1186 int j;
1187 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1188 struct bio *pbio = r1_bio->bios[primary];
1189 struct bio *sbio = r1_bio->bios[i];
1190 for (j = vcnt; j-- ; )
1191 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
1192 page_address(sbio->bi_io_vec[j].bv_page),
1193 PAGE_SIZE))
1194 break;
1195 if (j >= 0)
1196 mddev->resync_mismatches += r1_bio->sectors;
1197 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
1198 sbio->bi_end_io = NULL;
1199 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1200 } else {
1201 /* fixup the bio for reuse */
1202 sbio->bi_vcnt = vcnt;
1203 sbio->bi_size = r1_bio->sectors << 9;
1204 sbio->bi_idx = 0;
1205 sbio->bi_phys_segments = 0;
1206 sbio->bi_hw_segments = 0;
1207 sbio->bi_hw_front_size = 0;
1208 sbio->bi_hw_back_size = 0;
1209 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1210 sbio->bi_flags |= 1 << BIO_UPTODATE;
1211 sbio->bi_next = NULL;
1212 sbio->bi_sector = r1_bio->sector +
1213 conf->mirrors[i].rdev->data_offset;
1214 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1215 }
1216 }
1217 }
1088 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1218 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1089 /* 1219 /* ouch - failed to read all of that.
1090 * There is no point trying a read-for-reconstruct as 1220 * Try some synchronous reads of other devices to get
1091 * reconstruct is about to be aborted 1221 * good data, much like with normal read errors. Only
1222 * read into the pages we already have so they we don't
1223 * need to re-issue the read request.
1224 * We don't need to freeze the array, because being in an
1225 * active sync request, there is no normal IO, and
1226 * no overlapping syncs.
1092 */ 1227 */
1093 char b[BDEVNAME_SIZE]; 1228 sector_t sect = r1_bio->sector;
1094 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1229 int sectors = r1_bio->sectors;
1095 " for block %llu\n", 1230 int idx = 0;
1096 bdevname(bio->bi_bdev,b), 1231
1097 (unsigned long long)r1_bio->sector); 1232 while(sectors) {
1098 md_done_sync(mddev, r1_bio->sectors, 0); 1233 int s = sectors;
1099 put_buf(r1_bio); 1234 int d = r1_bio->read_disk;
1100 return; 1235 int success = 0;
1236 mdk_rdev_t *rdev;
1237
1238 if (s > (PAGE_SIZE>>9))
1239 s = PAGE_SIZE >> 9;
1240 do {
1241 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1242 rdev = conf->mirrors[d].rdev;
1243 if (sync_page_io(rdev->bdev,
1244 sect + rdev->data_offset,
1245 s<<9,
1246 bio->bi_io_vec[idx].bv_page,
1247 READ)) {
1248 success = 1;
1249 break;
1250 }
1251 }
1252 d++;
1253 if (d == conf->raid_disks)
1254 d = 0;
1255 } while (!success && d != r1_bio->read_disk);
1256
1257 if (success) {
1258 int start = d;
1259 /* write it back and re-read */
1260 set_bit(R1BIO_Uptodate, &r1_bio->state);
1261 while (d != r1_bio->read_disk) {
1262 if (d == 0)
1263 d = conf->raid_disks;
1264 d--;
1265 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1266 continue;
1267 rdev = conf->mirrors[d].rdev;
1268 atomic_add(s, &rdev->corrected_errors);
1269 if (sync_page_io(rdev->bdev,
1270 sect + rdev->data_offset,
1271 s<<9,
1272 bio->bi_io_vec[idx].bv_page,
1273 WRITE) == 0)
1274 md_error(mddev, rdev);
1275 }
1276 d = start;
1277 while (d != r1_bio->read_disk) {
1278 if (d == 0)
1279 d = conf->raid_disks;
1280 d--;
1281 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1282 continue;
1283 rdev = conf->mirrors[d].rdev;
1284 if (sync_page_io(rdev->bdev,
1285 sect + rdev->data_offset,
1286 s<<9,
1287 bio->bi_io_vec[idx].bv_page,
1288 READ) == 0)
1289 md_error(mddev, rdev);
1290 }
1291 } else {
1292 char b[BDEVNAME_SIZE];
1293 /* Cannot read from anywhere, array is toast */
1294 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1295 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
1296 " for block %llu\n",
1297 bdevname(bio->bi_bdev,b),
1298 (unsigned long long)r1_bio->sector);
1299 md_done_sync(mddev, r1_bio->sectors, 0);
1300 put_buf(r1_bio);
1301 return;
1302 }
1303 sectors -= s;
1304 sect += s;
1305 idx ++;
1306 }
1101 } 1307 }
1102 1308
1309 /*
1310 * schedule writes
1311 */
1103 atomic_set(&r1_bio->remaining, 1); 1312 atomic_set(&r1_bio->remaining, 1);
1104 for (i = 0; i < disks ; i++) { 1313 for (i = 0; i < disks ; i++) {
1105 wbio = r1_bio->bios[i]; 1314 wbio = r1_bio->bios[i];
1106 if (wbio->bi_end_io != end_sync_write) 1315 if (wbio->bi_end_io == NULL ||
1316 (wbio->bi_end_io == end_sync_read &&
1317 (i == r1_bio->read_disk ||
1318 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1107 continue; 1319 continue;
1108 1320
1109 atomic_inc(&conf->mirrors[i].rdev->nr_pending); 1321 wbio->bi_rw = WRITE;
1322 wbio->bi_end_io = end_sync_write;
1110 atomic_inc(&r1_bio->remaining); 1323 atomic_inc(&r1_bio->remaining);
1111 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1324 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1112 1325
@@ -1167,6 +1380,7 @@ static void raid1d(mddev_t *mddev)
1167 break; 1380 break;
1168 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1381 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1169 list_del(head->prev); 1382 list_del(head->prev);
1383 conf->nr_queued--;
1170 spin_unlock_irqrestore(&conf->device_lock, flags); 1384 spin_unlock_irqrestore(&conf->device_lock, flags);
1171 1385
1172 mddev = r1_bio->mddev; 1386 mddev = r1_bio->mddev;
@@ -1206,6 +1420,86 @@ static void raid1d(mddev_t *mddev)
1206 } 1420 }
1207 } else { 1421 } else {
1208 int disk; 1422 int disk;
1423
1424 /* we got a read error. Maybe the drive is bad. Maybe just
1425 * the block and we can fix it.
1426 * We freeze all other IO, and try reading the block from
1427 * other devices. When we find one, we re-write
1428 * and check it that fixes the read error.
1429 * This is all done synchronously while the array is
1430 * frozen
1431 */
1432 sector_t sect = r1_bio->sector;
1433 int sectors = r1_bio->sectors;
1434 freeze_array(conf);
1435 if (mddev->ro == 0) while(sectors) {
1436 int s = sectors;
1437 int d = r1_bio->read_disk;
1438 int success = 0;
1439
1440 if (s > (PAGE_SIZE>>9))
1441 s = PAGE_SIZE >> 9;
1442
1443 do {
1444 rdev = conf->mirrors[d].rdev;
1445 if (rdev &&
1446 test_bit(In_sync, &rdev->flags) &&
1447 sync_page_io(rdev->bdev,
1448 sect + rdev->data_offset,
1449 s<<9,
1450 conf->tmppage, READ))
1451 success = 1;
1452 else {
1453 d++;
1454 if (d == conf->raid_disks)
1455 d = 0;
1456 }
1457 } while (!success && d != r1_bio->read_disk);
1458
1459 if (success) {
1460 /* write it back and re-read */
1461 int start = d;
1462 while (d != r1_bio->read_disk) {
1463 if (d==0)
1464 d = conf->raid_disks;
1465 d--;
1466 rdev = conf->mirrors[d].rdev;
1467 atomic_add(s, &rdev->corrected_errors);
1468 if (rdev &&
1469 test_bit(In_sync, &rdev->flags)) {
1470 if (sync_page_io(rdev->bdev,
1471 sect + rdev->data_offset,
1472 s<<9, conf->tmppage, WRITE) == 0)
1473 /* Well, this device is dead */
1474 md_error(mddev, rdev);
1475 }
1476 }
1477 d = start;
1478 while (d != r1_bio->read_disk) {
1479 if (d==0)
1480 d = conf->raid_disks;
1481 d--;
1482 rdev = conf->mirrors[d].rdev;
1483 if (rdev &&
1484 test_bit(In_sync, &rdev->flags)) {
1485 if (sync_page_io(rdev->bdev,
1486 sect + rdev->data_offset,
1487 s<<9, conf->tmppage, READ) == 0)
1488 /* Well, this device is dead */
1489 md_error(mddev, rdev);
1490 }
1491 }
1492 } else {
1493 /* Cannot read from anywhere -- bye bye array */
1494 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1495 break;
1496 }
1497 sectors -= s;
1498 sect += s;
1499 }
1500
1501 unfreeze_array(conf);
1502
1209 bio = r1_bio->bios[r1_bio->read_disk]; 1503 bio = r1_bio->bios[r1_bio->read_disk];
1210 if ((disk=read_balance(conf, r1_bio)) == -1) { 1504 if ((disk=read_balance(conf, r1_bio)) == -1) {
1211 printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1505 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1214,7 +1508,8 @@ static void raid1d(mddev_t *mddev)
1214 (unsigned long long)r1_bio->sector); 1508 (unsigned long long)r1_bio->sector);
1215 raid_end_bio_io(r1_bio); 1509 raid_end_bio_io(r1_bio);
1216 } else { 1510 } else {
1217 r1_bio->bios[r1_bio->read_disk] = NULL; 1511 r1_bio->bios[r1_bio->read_disk] =
1512 mddev->ro ? IO_BLOCKED : NULL;
1218 r1_bio->read_disk = disk; 1513 r1_bio->read_disk = disk;
1219 bio_put(bio); 1514 bio_put(bio);
1220 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1515 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
@@ -1269,14 +1564,13 @@ static int init_resync(conf_t *conf)
1269static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1564static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1270{ 1565{
1271 conf_t *conf = mddev_to_conf(mddev); 1566 conf_t *conf = mddev_to_conf(mddev);
1272 mirror_info_t *mirror;
1273 r1bio_t *r1_bio; 1567 r1bio_t *r1_bio;
1274 struct bio *bio; 1568 struct bio *bio;
1275 sector_t max_sector, nr_sectors; 1569 sector_t max_sector, nr_sectors;
1276 int disk; 1570 int disk = -1;
1277 int i; 1571 int i;
1278 int wonly; 1572 int wonly = -1;
1279 int write_targets = 0; 1573 int write_targets = 0, read_targets = 0;
1280 int sync_blocks; 1574 int sync_blocks;
1281 int still_degraded = 0; 1575 int still_degraded = 0;
1282 1576
@@ -1317,55 +1611,35 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1317 return sync_blocks; 1611 return sync_blocks;
1318 } 1612 }
1319 /* 1613 /*
1320 * If there is non-resync activity waiting for us then 1614 * If there is non-resync activity waiting for a turn,
1321 * put in a delay to throttle resync. 1615 * and resync is going fast enough,
1616 * then let it though before starting on this new sync request.
1322 */ 1617 */
1323 if (!go_faster && waitqueue_active(&conf->wait_resume)) 1618 if (!go_faster && conf->nr_waiting)
1324 msleep_interruptible(1000); 1619 msleep_interruptible(1000);
1325 device_barrier(conf, sector_nr + RESYNC_SECTORS);
1326
1327 /*
1328 * If reconstructing, and >1 working disc,
1329 * could dedicate one to rebuild and others to
1330 * service read requests ..
1331 */
1332 disk = conf->last_used;
1333 /* make sure disk is operational */
1334 wonly = disk;
1335 while (conf->mirrors[disk].rdev == NULL ||
1336 !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) ||
1337 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
1338 ) {
1339 if (conf->mirrors[disk].rdev &&
1340 test_bit(In_sync, &conf->mirrors[disk].rdev->flags))
1341 wonly = disk;
1342 if (disk <= 0)
1343 disk = conf->raid_disks;
1344 disk--;
1345 if (disk == conf->last_used) {
1346 disk = wonly;
1347 break;
1348 }
1349 }
1350 conf->last_used = disk;
1351 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
1352 1620
1621 raise_barrier(conf);
1353 1622
1354 mirror = conf->mirrors + disk; 1623 conf->next_resync = sector_nr;
1355 1624
1356 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 1625 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1357 1626 rcu_read_lock();
1358 spin_lock_irq(&conf->resync_lock); 1627 /*
1359 conf->nr_pending++; 1628 * If we get a correctably read error during resync or recovery,
1360 spin_unlock_irq(&conf->resync_lock); 1629 * we might want to read from a different device. So we
1630 * flag all drives that could conceivably be read from for READ,
1631 * and any others (which will be non-In_sync devices) for WRITE.
1632 * If a read fails, we try reading from something else for which READ
1633 * is OK.
1634 */
1361 1635
1362 r1_bio->mddev = mddev; 1636 r1_bio->mddev = mddev;
1363 r1_bio->sector = sector_nr; 1637 r1_bio->sector = sector_nr;
1364 r1_bio->state = 0; 1638 r1_bio->state = 0;
1365 set_bit(R1BIO_IsSync, &r1_bio->state); 1639 set_bit(R1BIO_IsSync, &r1_bio->state);
1366 r1_bio->read_disk = disk;
1367 1640
1368 for (i=0; i < conf->raid_disks; i++) { 1641 for (i=0; i < conf->raid_disks; i++) {
1642 mdk_rdev_t *rdev;
1369 bio = r1_bio->bios[i]; 1643 bio = r1_bio->bios[i];
1370 1644
1371 /* take from bio_init */ 1645 /* take from bio_init */
@@ -1380,35 +1654,49 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1380 bio->bi_end_io = NULL; 1654 bio->bi_end_io = NULL;
1381 bio->bi_private = NULL; 1655 bio->bi_private = NULL;
1382 1656
1383 if (i == disk) { 1657 rdev = rcu_dereference(conf->mirrors[i].rdev);
1384 bio->bi_rw = READ; 1658 if (rdev == NULL ||
1385 bio->bi_end_io = end_sync_read; 1659 test_bit(Faulty, &rdev->flags)) {
1386 } else if (conf->mirrors[i].rdev == NULL ||
1387 test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
1388 still_degraded = 1; 1660 still_degraded = 1;
1389 continue; 1661 continue;
1390 } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) || 1662 } else if (!test_bit(In_sync, &rdev->flags)) {
1391 sector_nr + RESYNC_SECTORS > mddev->recovery_cp ||
1392 test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1393 bio->bi_rw = WRITE; 1663 bio->bi_rw = WRITE;
1394 bio->bi_end_io = end_sync_write; 1664 bio->bi_end_io = end_sync_write;
1395 write_targets ++; 1665 write_targets ++;
1396 } else 1666 } else {
1397 /* no need to read or write here */ 1667 /* may need to read from here */
1398 continue; 1668 bio->bi_rw = READ;
1399 bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; 1669 bio->bi_end_io = end_sync_read;
1400 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1670 if (test_bit(WriteMostly, &rdev->flags)) {
1671 if (wonly < 0)
1672 wonly = i;
1673 } else {
1674 if (disk < 0)
1675 disk = i;
1676 }
1677 read_targets++;
1678 }
1679 atomic_inc(&rdev->nr_pending);
1680 bio->bi_sector = sector_nr + rdev->data_offset;
1681 bio->bi_bdev = rdev->bdev;
1401 bio->bi_private = r1_bio; 1682 bio->bi_private = r1_bio;
1402 } 1683 }
1684 rcu_read_unlock();
1685 if (disk < 0)
1686 disk = wonly;
1687 r1_bio->read_disk = disk;
1688
1689 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1690 /* extra read targets are also write targets */
1691 write_targets += read_targets-1;
1403 1692
1404 if (write_targets == 0) { 1693 if (write_targets == 0 || read_targets == 0) {
1405 /* There is nowhere to write, so all non-sync 1694 /* There is nowhere to write, so all non-sync
1406 * drives must be failed - so we are finished 1695 * drives must be failed - so we are finished
1407 */ 1696 */
1408 sector_t rv = max_sector - sector_nr; 1697 sector_t rv = max_sector - sector_nr;
1409 *skipped = 1; 1698 *skipped = 1;
1410 put_buf(r1_bio); 1699 put_buf(r1_bio);
1411 rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
1412 return rv; 1700 return rv;
1413 } 1701 }
1414 1702
@@ -1436,10 +1724,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1436 for (i=0 ; i < conf->raid_disks; i++) { 1724 for (i=0 ; i < conf->raid_disks; i++) {
1437 bio = r1_bio->bios[i]; 1725 bio = r1_bio->bios[i];
1438 if (bio->bi_end_io) { 1726 if (bio->bi_end_io) {
1439 page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; 1727 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1440 if (bio_add_page(bio, page, len, 0) == 0) { 1728 if (bio_add_page(bio, page, len, 0) == 0) {
1441 /* stop here */ 1729 /* stop here */
1442 r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; 1730 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1443 while (i > 0) { 1731 while (i > 0) {
1444 i--; 1732 i--;
1445 bio = r1_bio->bios[i]; 1733 bio = r1_bio->bios[i];
@@ -1459,12 +1747,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1459 sync_blocks -= (len>>9); 1747 sync_blocks -= (len>>9);
1460 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1748 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1461 bio_full: 1749 bio_full:
1462 bio = r1_bio->bios[disk];
1463 r1_bio->sectors = nr_sectors; 1750 r1_bio->sectors = nr_sectors;
1464 1751
1465 md_sync_acct(mirror->rdev->bdev, nr_sectors); 1752 /* For a user-requested sync, we read all readable devices and do a
1753 * compare
1754 */
1755 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1756 atomic_set(&r1_bio->remaining, read_targets);
1757 for (i=0; i<conf->raid_disks; i++) {
1758 bio = r1_bio->bios[i];
1759 if (bio->bi_end_io == end_sync_read) {
1760 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
1761 generic_make_request(bio);
1762 }
1763 }
1764 } else {
1765 atomic_set(&r1_bio->remaining, 1);
1766 bio = r1_bio->bios[r1_bio->read_disk];
1767 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
1768 nr_sectors);
1769 generic_make_request(bio);
1466 1770
1467 generic_make_request(bio); 1771 }
1468 1772
1469 return nr_sectors; 1773 return nr_sectors;
1470} 1774}
@@ -1487,18 +1791,19 @@ static int run(mddev_t *mddev)
1487 * bookkeeping area. [whatever we allocate in run(), 1791 * bookkeeping area. [whatever we allocate in run(),
1488 * should be freed in stop()] 1792 * should be freed in stop()]
1489 */ 1793 */
1490 conf = kmalloc(sizeof(conf_t), GFP_KERNEL); 1794 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1491 mddev->private = conf; 1795 mddev->private = conf;
1492 if (!conf) 1796 if (!conf)
1493 goto out_no_mem; 1797 goto out_no_mem;
1494 1798
1495 memset(conf, 0, sizeof(*conf)); 1799 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1496 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1497 GFP_KERNEL); 1800 GFP_KERNEL);
1498 if (!conf->mirrors) 1801 if (!conf->mirrors)
1499 goto out_no_mem; 1802 goto out_no_mem;
1500 1803
1501 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); 1804 conf->tmppage = alloc_page(GFP_KERNEL);
1805 if (!conf->tmppage)
1806 goto out_no_mem;
1502 1807
1503 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1808 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1504 if (!conf->poolinfo) 1809 if (!conf->poolinfo)
@@ -1542,8 +1847,7 @@ static int run(mddev_t *mddev)
1542 mddev->recovery_cp = MaxSector; 1847 mddev->recovery_cp = MaxSector;
1543 1848
1544 spin_lock_init(&conf->resync_lock); 1849 spin_lock_init(&conf->resync_lock);
1545 init_waitqueue_head(&conf->wait_idle); 1850 init_waitqueue_head(&conf->wait_barrier);
1546 init_waitqueue_head(&conf->wait_resume);
1547 1851
1548 bio_list_init(&conf->pending_bio_list); 1852 bio_list_init(&conf->pending_bio_list);
1549 bio_list_init(&conf->flushing_bio_list); 1853 bio_list_init(&conf->flushing_bio_list);
@@ -1583,7 +1887,6 @@ static int run(mddev_t *mddev)
1583 mdname(mddev)); 1887 mdname(mddev));
1584 goto out_free_conf; 1888 goto out_free_conf;
1585 } 1889 }
1586 if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1587 1890
1588 printk(KERN_INFO 1891 printk(KERN_INFO
1589 "raid1: raid set %s active with %d out of %d mirrors\n", 1892 "raid1: raid set %s active with %d out of %d mirrors\n",
@@ -1608,6 +1911,7 @@ out_free_conf:
1608 if (conf->r1bio_pool) 1911 if (conf->r1bio_pool)
1609 mempool_destroy(conf->r1bio_pool); 1912 mempool_destroy(conf->r1bio_pool);
1610 kfree(conf->mirrors); 1913 kfree(conf->mirrors);
1914 safe_put_page(conf->tmppage);
1611 kfree(conf->poolinfo); 1915 kfree(conf->poolinfo);
1612 kfree(conf); 1916 kfree(conf);
1613 mddev->private = NULL; 1917 mddev->private = NULL;
@@ -1706,19 +2010,14 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1706 kfree(newpoolinfo); 2010 kfree(newpoolinfo);
1707 return -ENOMEM; 2011 return -ENOMEM;
1708 } 2012 }
1709 newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); 2013 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
1710 if (!newmirrors) { 2014 if (!newmirrors) {
1711 kfree(newpoolinfo); 2015 kfree(newpoolinfo);
1712 mempool_destroy(newpool); 2016 mempool_destroy(newpool);
1713 return -ENOMEM; 2017 return -ENOMEM;
1714 } 2018 }
1715 memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
1716 2019
1717 spin_lock_irq(&conf->resync_lock); 2020 raise_barrier(conf);
1718 conf->barrier++;
1719 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1720 conf->resync_lock, raid1_unplug(mddev->queue));
1721 spin_unlock_irq(&conf->resync_lock);
1722 2021
1723 /* ok, everything is stopped */ 2022 /* ok, everything is stopped */
1724 oldpool = conf->r1bio_pool; 2023 oldpool = conf->r1bio_pool;
@@ -1738,12 +2037,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1738 conf->raid_disks = mddev->raid_disks = raid_disks; 2037 conf->raid_disks = mddev->raid_disks = raid_disks;
1739 2038
1740 conf->last_used = 0; /* just make sure it is in-range */ 2039 conf->last_used = 0; /* just make sure it is in-range */
1741 spin_lock_irq(&conf->resync_lock); 2040 lower_barrier(conf);
1742 conf->barrier--;
1743 spin_unlock_irq(&conf->resync_lock);
1744 wake_up(&conf->wait_resume);
1745 wake_up(&conf->wait_idle);
1746
1747 2041
1748 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2042 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1749 md_wakeup_thread(mddev->thread); 2043 md_wakeup_thread(mddev->thread);
@@ -1758,33 +2052,19 @@ static void raid1_quiesce(mddev_t *mddev, int state)
1758 2052
1759 switch(state) { 2053 switch(state) {
1760 case 1: 2054 case 1:
1761 spin_lock_irq(&conf->resync_lock); 2055 raise_barrier(conf);
1762 conf->barrier++;
1763 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1764 conf->resync_lock, raid1_unplug(mddev->queue));
1765 spin_unlock_irq(&conf->resync_lock);
1766 break; 2056 break;
1767 case 0: 2057 case 0:
1768 spin_lock_irq(&conf->resync_lock); 2058 lower_barrier(conf);
1769 conf->barrier--;
1770 spin_unlock_irq(&conf->resync_lock);
1771 wake_up(&conf->wait_resume);
1772 wake_up(&conf->wait_idle);
1773 break; 2059 break;
1774 } 2060 }
1775 if (mddev->thread) {
1776 if (mddev->bitmap)
1777 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1778 else
1779 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1780 md_wakeup_thread(mddev->thread);
1781 }
1782} 2061}
1783 2062
1784 2063
1785static mdk_personality_t raid1_personality = 2064static struct mdk_personality raid1_personality =
1786{ 2065{
1787 .name = "raid1", 2066 .name = "raid1",
2067 .level = 1,
1788 .owner = THIS_MODULE, 2068 .owner = THIS_MODULE,
1789 .make_request = make_request, 2069 .make_request = make_request,
1790 .run = run, 2070 .run = run,
@@ -1802,15 +2082,17 @@ static mdk_personality_t raid1_personality =
1802 2082
1803static int __init raid_init(void) 2083static int __init raid_init(void)
1804{ 2084{
1805 return register_md_personality(RAID1, &raid1_personality); 2085 return register_md_personality(&raid1_personality);
1806} 2086}
1807 2087
1808static void raid_exit(void) 2088static void raid_exit(void)
1809{ 2089{
1810 unregister_md_personality(RAID1); 2090 unregister_md_personality(&raid1_personality);
1811} 2091}
1812 2092
1813module_init(raid_init); 2093module_init(raid_init);
1814module_exit(raid_exit); 2094module_exit(raid_exit);
1815MODULE_LICENSE("GPL"); 2095MODULE_LICENSE("GPL");
1816MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2096MODULE_ALIAS("md-personality-3"); /* RAID1 */
2097MODULE_ALIAS("md-raid1");
2098MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 713dc9c2c73..9e658e519a2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,7 +18,9 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include "dm-bio-list.h"
21#include <linux/raid/raid10.h> 22#include <linux/raid/raid10.h>
23#include <linux/raid/bitmap.h>
22 24
23/* 25/*
24 * RAID10 provides a combination of RAID0 and RAID1 functionality. 26 * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -47,6 +49,9 @@
47 49
48static void unplug_slaves(mddev_t *mddev); 50static void unplug_slaves(mddev_t *mddev);
49 51
52static void allow_barrier(conf_t *conf);
53static void lower_barrier(conf_t *conf);
54
50static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 55static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
51{ 56{
52 conf_t *conf = data; 57 conf_t *conf = data;
@@ -54,10 +59,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
54 int size = offsetof(struct r10bio_s, devs[conf->copies]); 59 int size = offsetof(struct r10bio_s, devs[conf->copies]);
55 60
56 /* allocate a r10bio with room for raid_disks entries in the bios array */ 61 /* allocate a r10bio with room for raid_disks entries in the bios array */
57 r10_bio = kmalloc(size, gfp_flags); 62 r10_bio = kzalloc(size, gfp_flags);
58 if (r10_bio) 63 if (!r10_bio)
59 memset(r10_bio, 0, size);
60 else
61 unplug_slaves(conf->mddev); 64 unplug_slaves(conf->mddev);
62 65
63 return r10_bio; 66 return r10_bio;
@@ -129,10 +132,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
129 132
130out_free_pages: 133out_free_pages:
131 for ( ; i > 0 ; i--) 134 for ( ; i > 0 ; i--)
132 __free_page(bio->bi_io_vec[i-1].bv_page); 135 safe_put_page(bio->bi_io_vec[i-1].bv_page);
133 while (j--) 136 while (j--)
134 for (i = 0; i < RESYNC_PAGES ; i++) 137 for (i = 0; i < RESYNC_PAGES ; i++)
135 __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 138 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
136 j = -1; 139 j = -1;
137out_free_bio: 140out_free_bio:
138 while ( ++j < nalloc ) 141 while ( ++j < nalloc )
@@ -152,7 +155,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
152 struct bio *bio = r10bio->devs[j].bio; 155 struct bio *bio = r10bio->devs[j].bio;
153 if (bio) { 156 if (bio) {
154 for (i = 0; i < RESYNC_PAGES; i++) { 157 for (i = 0; i < RESYNC_PAGES; i++) {
155 __free_page(bio->bi_io_vec[i].bv_page); 158 safe_put_page(bio->bi_io_vec[i].bv_page);
156 bio->bi_io_vec[i].bv_page = NULL; 159 bio->bi_io_vec[i].bv_page = NULL;
157 } 160 }
158 bio_put(bio); 161 bio_put(bio);
@@ -167,7 +170,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
167 170
168 for (i = 0; i < conf->copies; i++) { 171 for (i = 0; i < conf->copies; i++) {
169 struct bio **bio = & r10_bio->devs[i].bio; 172 struct bio **bio = & r10_bio->devs[i].bio;
170 if (*bio) 173 if (*bio && *bio != IO_BLOCKED)
171 bio_put(*bio); 174 bio_put(*bio);
172 *bio = NULL; 175 *bio = NULL;
173 } 176 }
@@ -175,20 +178,13 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
175 178
176static inline void free_r10bio(r10bio_t *r10_bio) 179static inline void free_r10bio(r10bio_t *r10_bio)
177{ 180{
178 unsigned long flags;
179
180 conf_t *conf = mddev_to_conf(r10_bio->mddev); 181 conf_t *conf = mddev_to_conf(r10_bio->mddev);
181 182
182 /* 183 /*
183 * Wake up any possible resync thread that waits for the device 184 * Wake up any possible resync thread that waits for the device
184 * to go idle. 185 * to go idle.
185 */ 186 */
186 spin_lock_irqsave(&conf->resync_lock, flags); 187 allow_barrier(conf);
187 if (!--conf->nr_pending) {
188 wake_up(&conf->wait_idle);
189 wake_up(&conf->wait_resume);
190 }
191 spin_unlock_irqrestore(&conf->resync_lock, flags);
192 188
193 put_all_bios(conf, r10_bio); 189 put_all_bios(conf, r10_bio);
194 mempool_free(r10_bio, conf->r10bio_pool); 190 mempool_free(r10_bio, conf->r10bio_pool);
@@ -197,22 +193,10 @@ static inline void free_r10bio(r10bio_t *r10_bio)
197static inline void put_buf(r10bio_t *r10_bio) 193static inline void put_buf(r10bio_t *r10_bio)
198{ 194{
199 conf_t *conf = mddev_to_conf(r10_bio->mddev); 195 conf_t *conf = mddev_to_conf(r10_bio->mddev);
200 unsigned long flags;
201 196
202 mempool_free(r10_bio, conf->r10buf_pool); 197 mempool_free(r10_bio, conf->r10buf_pool);
203 198
204 spin_lock_irqsave(&conf->resync_lock, flags); 199 lower_barrier(conf);
205 if (!conf->barrier)
206 BUG();
207 --conf->barrier;
208 wake_up(&conf->wait_resume);
209 wake_up(&conf->wait_idle);
210
211 if (!--conf->nr_pending) {
212 wake_up(&conf->wait_idle);
213 wake_up(&conf->wait_resume);
214 }
215 spin_unlock_irqrestore(&conf->resync_lock, flags);
216} 200}
217 201
218static void reschedule_retry(r10bio_t *r10_bio) 202static void reschedule_retry(r10bio_t *r10_bio)
@@ -223,6 +207,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
223 207
224 spin_lock_irqsave(&conf->device_lock, flags); 208 spin_lock_irqsave(&conf->device_lock, flags);
225 list_add(&r10_bio->retry_list, &conf->retry_list); 209 list_add(&r10_bio->retry_list, &conf->retry_list);
210 conf->nr_queued ++;
226 spin_unlock_irqrestore(&conf->device_lock, flags); 211 spin_unlock_irqrestore(&conf->device_lock, flags);
227 212
228 md_wakeup_thread(mddev->thread); 213 md_wakeup_thread(mddev->thread);
@@ -268,9 +253,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
268 /* 253 /*
269 * this branch is our 'one mirror IO has finished' event handler: 254 * this branch is our 'one mirror IO has finished' event handler:
270 */ 255 */
271 if (!uptodate) 256 update_head_pos(slot, r10_bio);
272 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 257
273 else 258 if (uptodate) {
274 /* 259 /*
275 * Set R10BIO_Uptodate in our master bio, so that 260 * Set R10BIO_Uptodate in our master bio, so that
276 * we will return a good error code to the higher 261 * we will return a good error code to the higher
@@ -281,15 +266,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
281 * wait for the 'master' bio. 266 * wait for the 'master' bio.
282 */ 267 */
283 set_bit(R10BIO_Uptodate, &r10_bio->state); 268 set_bit(R10BIO_Uptodate, &r10_bio->state);
284
285 update_head_pos(slot, r10_bio);
286
287 /*
288 * we have only one bio on the read side
289 */
290 if (uptodate)
291 raid_end_bio_io(r10_bio); 269 raid_end_bio_io(r10_bio);
292 else { 270 } else {
293 /* 271 /*
294 * oops, read error: 272 * oops, read error:
295 */ 273 */
@@ -322,9 +300,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
322 /* 300 /*
323 * this branch is our 'one mirror IO has finished' event handler: 301 * this branch is our 'one mirror IO has finished' event handler:
324 */ 302 */
325 if (!uptodate) 303 if (!uptodate) {
326 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 304 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
327 else 305 /* an I/O failed, we can't clear the bitmap */
306 set_bit(R10BIO_Degraded, &r10_bio->state);
307 } else
328 /* 308 /*
329 * Set R10BIO_Uptodate in our master bio, so that 309 * Set R10BIO_Uptodate in our master bio, so that
330 * we will return a good error code for to the higher 310 * we will return a good error code for to the higher
@@ -344,6 +324,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
344 * already. 324 * already.
345 */ 325 */
346 if (atomic_dec_and_test(&r10_bio->remaining)) { 326 if (atomic_dec_and_test(&r10_bio->remaining)) {
327 /* clear the bitmap if all writes complete successfully */
328 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
329 r10_bio->sectors,
330 !test_bit(R10BIO_Degraded, &r10_bio->state),
331 0);
347 md_write_end(r10_bio->mddev); 332 md_write_end(r10_bio->mddev);
348 raid_end_bio_io(r10_bio); 333 raid_end_bio_io(r10_bio);
349 } 334 }
@@ -502,8 +487,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
502 rcu_read_lock(); 487 rcu_read_lock();
503 /* 488 /*
504 * Check if we can balance. We can balance on the whole 489 * Check if we can balance. We can balance on the whole
505 * device if no resync is going on, or below the resync window. 490 * device if no resync is going on (recovery is ok), or below
506 * We take the first readable disk when above the resync window. 491 * the resync window. We take the first readable disk when
492 * above the resync window.
507 */ 493 */
508 if (conf->mddev->recovery_cp < MaxSector 494 if (conf->mddev->recovery_cp < MaxSector
509 && (this_sector + sectors >= conf->next_resync)) { 495 && (this_sector + sectors >= conf->next_resync)) {
@@ -512,6 +498,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
512 disk = r10_bio->devs[slot].devnum; 498 disk = r10_bio->devs[slot].devnum;
513 499
514 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 500 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
501 r10_bio->devs[slot].bio == IO_BLOCKED ||
515 !test_bit(In_sync, &rdev->flags)) { 502 !test_bit(In_sync, &rdev->flags)) {
516 slot++; 503 slot++;
517 if (slot == conf->copies) { 504 if (slot == conf->copies) {
@@ -529,6 +516,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
529 slot = 0; 516 slot = 0;
530 disk = r10_bio->devs[slot].devnum; 517 disk = r10_bio->devs[slot].devnum;
531 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 518 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
519 r10_bio->devs[slot].bio == IO_BLOCKED ||
532 !test_bit(In_sync, &rdev->flags)) { 520 !test_bit(In_sync, &rdev->flags)) {
533 slot ++; 521 slot ++;
534 if (slot == conf->copies) { 522 if (slot == conf->copies) {
@@ -549,6 +537,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
549 537
550 538
551 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || 539 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
540 r10_bio->devs[nslot].bio == IO_BLOCKED ||
552 !test_bit(In_sync, &rdev->flags)) 541 !test_bit(In_sync, &rdev->flags))
553 continue; 542 continue;
554 543
@@ -607,7 +596,10 @@ static void unplug_slaves(mddev_t *mddev)
607 596
608static void raid10_unplug(request_queue_t *q) 597static void raid10_unplug(request_queue_t *q)
609{ 598{
599 mddev_t *mddev = q->queuedata;
600
610 unplug_slaves(q->queuedata); 601 unplug_slaves(q->queuedata);
602 md_wakeup_thread(mddev->thread);
611} 603}
612 604
613static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, 605static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -640,27 +632,107 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
640 return ret; 632 return ret;
641} 633}
642 634
643/* 635/* Barriers....
644 * Throttle resync depth, so that we can both get proper overlapping of 636 * Sometimes we need to suspend IO while we do something else,
645 * requests, but are still able to handle normal requests quickly. 637 * either some resync/recovery, or reconfigure the array.
638 * To do this we raise a 'barrier'.
639 * The 'barrier' is a counter that can be raised multiple times
640 * to count how many activities are happening which preclude
641 * normal IO.
642 * We can only raise the barrier if there is no pending IO.
643 * i.e. if nr_pending == 0.
644 * We choose only to raise the barrier if no-one is waiting for the
645 * barrier to go down. This means that as soon as an IO request
646 * is ready, no other operations which require a barrier will start
647 * until the IO request has had a chance.
648 *
649 * So: regular IO calls 'wait_barrier'. When that returns there
650 * is no background IO happening. It must arrange to call
651 * allow_barrier when it has finished its IO.
652 * background IO calls must call raise_barrier. Once that returns
653 * there is no normal IO happening. It must arrange to call
654 * lower_barrier when the particular background IO completes.
646 */ 655 */
647#define RESYNC_DEPTH 32 656#define RESYNC_DEPTH 32
648 657
649static void device_barrier(conf_t *conf, sector_t sect) 658static void raise_barrier(conf_t *conf, int force)
659{
660 BUG_ON(force && !conf->barrier);
661 spin_lock_irq(&conf->resync_lock);
662
663 /* Wait until no block IO is waiting (unless 'force') */
664 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
665 conf->resync_lock,
666 raid10_unplug(conf->mddev->queue));
667
668 /* block any new IO from starting */
669 conf->barrier++;
670
671 /* Now wait for all pending IO to complete */
672 wait_event_lock_irq(conf->wait_barrier,
673 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
674 conf->resync_lock,
675 raid10_unplug(conf->mddev->queue));
676
677 spin_unlock_irq(&conf->resync_lock);
678}
679
680static void lower_barrier(conf_t *conf)
681{
682 unsigned long flags;
683 spin_lock_irqsave(&conf->resync_lock, flags);
684 conf->barrier--;
685 spin_unlock_irqrestore(&conf->resync_lock, flags);
686 wake_up(&conf->wait_barrier);
687}
688
689static void wait_barrier(conf_t *conf)
650{ 690{
651 spin_lock_irq(&conf->resync_lock); 691 spin_lock_irq(&conf->resync_lock);
652 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 692 if (conf->barrier) {
653 conf->resync_lock, unplug_slaves(conf->mddev)); 693 conf->nr_waiting++;
654 694 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
655 if (!conf->barrier++) { 695 conf->resync_lock,
656 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 696 raid10_unplug(conf->mddev->queue));
657 conf->resync_lock, unplug_slaves(conf->mddev)); 697 conf->nr_waiting--;
658 if (conf->nr_pending)
659 BUG();
660 } 698 }
661 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 699 conf->nr_pending++;
662 conf->resync_lock, unplug_slaves(conf->mddev)); 700 spin_unlock_irq(&conf->resync_lock);
663 conf->next_resync = sect; 701}
702
703static void allow_barrier(conf_t *conf)
704{
705 unsigned long flags;
706 spin_lock_irqsave(&conf->resync_lock, flags);
707 conf->nr_pending--;
708 spin_unlock_irqrestore(&conf->resync_lock, flags);
709 wake_up(&conf->wait_barrier);
710}
711
712static void freeze_array(conf_t *conf)
713{
714 /* stop syncio and normal IO and wait for everything to
715 * go quiet.
716 * We increment barrier and nr_waiting, and then
717 * wait until barrier+nr_pending match nr_queued+2
718 */
719 spin_lock_irq(&conf->resync_lock);
720 conf->barrier++;
721 conf->nr_waiting++;
722 wait_event_lock_irq(conf->wait_barrier,
723 conf->barrier+conf->nr_pending == conf->nr_queued+2,
724 conf->resync_lock,
725 raid10_unplug(conf->mddev->queue));
726 spin_unlock_irq(&conf->resync_lock);
727}
728
729static void unfreeze_array(conf_t *conf)
730{
731 /* reverse the effect of the freeze */
732 spin_lock_irq(&conf->resync_lock);
733 conf->barrier--;
734 conf->nr_waiting--;
735 wake_up(&conf->wait_barrier);
664 spin_unlock_irq(&conf->resync_lock); 736 spin_unlock_irq(&conf->resync_lock);
665} 737}
666 738
@@ -674,6 +746,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
674 int i; 746 int i;
675 int chunk_sects = conf->chunk_mask + 1; 747 int chunk_sects = conf->chunk_mask + 1;
676 const int rw = bio_data_dir(bio); 748 const int rw = bio_data_dir(bio);
749 struct bio_list bl;
750 unsigned long flags;
677 751
678 if (unlikely(bio_barrier(bio))) { 752 if (unlikely(bio_barrier(bio))) {
679 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 753 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -719,10 +793,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
719 * thread has put up a bar for new requests. 793 * thread has put up a bar for new requests.
720 * Continue immediately if no resync is active currently. 794 * Continue immediately if no resync is active currently.
721 */ 795 */
722 spin_lock_irq(&conf->resync_lock); 796 wait_barrier(conf);
723 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
724 conf->nr_pending++;
725 spin_unlock_irq(&conf->resync_lock);
726 797
727 disk_stat_inc(mddev->gendisk, ios[rw]); 798 disk_stat_inc(mddev->gendisk, ios[rw]);
728 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 799 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -734,6 +805,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
734 805
735 r10_bio->mddev = mddev; 806 r10_bio->mddev = mddev;
736 r10_bio->sector = bio->bi_sector; 807 r10_bio->sector = bio->bi_sector;
808 r10_bio->state = 0;
737 809
738 if (rw == READ) { 810 if (rw == READ) {
739 /* 811 /*
@@ -778,13 +850,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
778 !test_bit(Faulty, &rdev->flags)) { 850 !test_bit(Faulty, &rdev->flags)) {
779 atomic_inc(&rdev->nr_pending); 851 atomic_inc(&rdev->nr_pending);
780 r10_bio->devs[i].bio = bio; 852 r10_bio->devs[i].bio = bio;
781 } else 853 } else {
782 r10_bio->devs[i].bio = NULL; 854 r10_bio->devs[i].bio = NULL;
855 set_bit(R10BIO_Degraded, &r10_bio->state);
856 }
783 } 857 }
784 rcu_read_unlock(); 858 rcu_read_unlock();
785 859
786 atomic_set(&r10_bio->remaining, 1); 860 atomic_set(&r10_bio->remaining, 0);
787 861
862 bio_list_init(&bl);
788 for (i = 0; i < conf->copies; i++) { 863 for (i = 0; i < conf->copies; i++) {
789 struct bio *mbio; 864 struct bio *mbio;
790 int d = r10_bio->devs[i].devnum; 865 int d = r10_bio->devs[i].devnum;
@@ -802,13 +877,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
802 mbio->bi_private = r10_bio; 877 mbio->bi_private = r10_bio;
803 878
804 atomic_inc(&r10_bio->remaining); 879 atomic_inc(&r10_bio->remaining);
805 generic_make_request(mbio); 880 bio_list_add(&bl, mbio);
806 } 881 }
807 882
808 if (atomic_dec_and_test(&r10_bio->remaining)) { 883 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
809 md_write_end(mddev); 884 spin_lock_irqsave(&conf->device_lock, flags);
810 raid_end_bio_io(r10_bio); 885 bio_list_merge(&conf->pending_bio_list, &bl);
811 } 886 blk_plug_device(mddev->queue);
887 spin_unlock_irqrestore(&conf->device_lock, flags);
812 888
813 return 0; 889 return 0;
814} 890}
@@ -897,13 +973,8 @@ static void print_conf(conf_t *conf)
897 973
898static void close_sync(conf_t *conf) 974static void close_sync(conf_t *conf)
899{ 975{
900 spin_lock_irq(&conf->resync_lock); 976 wait_barrier(conf);
901 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 977 allow_barrier(conf);
902 conf->resync_lock, unplug_slaves(conf->mddev));
903 spin_unlock_irq(&conf->resync_lock);
904
905 if (conf->barrier) BUG();
906 if (waitqueue_active(&conf->wait_idle)) BUG();
907 978
908 mempool_destroy(conf->r10buf_pool); 979 mempool_destroy(conf->r10buf_pool);
909 conf->r10buf_pool = NULL; 980 conf->r10buf_pool = NULL;
@@ -971,7 +1042,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
971 if (!enough(conf)) 1042 if (!enough(conf))
972 return 0; 1043 return 0;
973 1044
974 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1045 if (rdev->saved_raid_disk >= 0 &&
1046 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1047 mirror = rdev->saved_raid_disk;
1048 else
1049 mirror = 0;
1050 for ( ; mirror < mddev->raid_disks; mirror++)
975 if ( !(p=conf->mirrors+mirror)->rdev) { 1051 if ( !(p=conf->mirrors+mirror)->rdev) {
976 1052
977 blk_queue_stack_limits(mddev->queue, 1053 blk_queue_stack_limits(mddev->queue,
@@ -987,6 +1063,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
987 p->head_position = 0; 1063 p->head_position = 0;
988 rdev->raid_disk = mirror; 1064 rdev->raid_disk = mirror;
989 found = 1; 1065 found = 1;
1066 if (rdev->saved_raid_disk != mirror)
1067 conf->fullsync = 1;
990 rcu_assign_pointer(p->rdev, rdev); 1068 rcu_assign_pointer(p->rdev, rdev);
991 break; 1069 break;
992 } 1070 }
@@ -1027,7 +1105,6 @@ abort:
1027 1105
1028static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1106static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1029{ 1107{
1030 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1031 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1108 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1032 conf_t *conf = mddev_to_conf(r10_bio->mddev); 1109 conf_t *conf = mddev_to_conf(r10_bio->mddev);
1033 int i,d; 1110 int i,d;
@@ -1042,9 +1119,16 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1042 BUG(); 1119 BUG();
1043 update_head_pos(i, r10_bio); 1120 update_head_pos(i, r10_bio);
1044 d = r10_bio->devs[i].devnum; 1121 d = r10_bio->devs[i].devnum;
1045 if (!uptodate) 1122
1046 md_error(r10_bio->mddev, 1123 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1047 conf->mirrors[d].rdev); 1124 set_bit(R10BIO_Uptodate, &r10_bio->state);
1125 else {
1126 atomic_add(r10_bio->sectors,
1127 &conf->mirrors[d].rdev->corrected_errors);
1128 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1129 md_error(r10_bio->mddev,
1130 conf->mirrors[d].rdev);
1131 }
1048 1132
1049 /* for reconstruct, we always reschedule after a read. 1133 /* for reconstruct, we always reschedule after a read.
1050 * for resync, only after all reads 1134 * for resync, only after all reads
@@ -1132,23 +1216,32 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1132 fbio = r10_bio->devs[i].bio; 1216 fbio = r10_bio->devs[i].bio;
1133 1217
1134 /* now find blocks with errors */ 1218 /* now find blocks with errors */
1135 for (i=first+1 ; i < conf->copies ; i++) { 1219 for (i=0 ; i < conf->copies ; i++) {
1136 int vcnt, j, d; 1220 int j, d;
1221 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1137 1222
1138 if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1139 continue;
1140 /* We know that the bi_io_vec layout is the same for
1141 * both 'first' and 'i', so we just compare them.
1142 * All vec entries are PAGE_SIZE;
1143 */
1144 tbio = r10_bio->devs[i].bio; 1223 tbio = r10_bio->devs[i].bio;
1145 vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); 1224
1146 for (j = 0; j < vcnt; j++) 1225 if (tbio->bi_end_io != end_sync_read)
1147 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), 1226 continue;
1148 page_address(tbio->bi_io_vec[j].bv_page), 1227 if (i == first)
1149 PAGE_SIZE)) 1228 continue;
1150 break; 1229 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1151 if (j == vcnt) 1230 /* We know that the bi_io_vec layout is the same for
1231 * both 'first' and 'i', so we just compare them.
1232 * All vec entries are PAGE_SIZE;
1233 */
1234 for (j = 0; j < vcnt; j++)
1235 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1236 page_address(tbio->bi_io_vec[j].bv_page),
1237 PAGE_SIZE))
1238 break;
1239 if (j == vcnt)
1240 continue;
1241 mddev->resync_mismatches += r10_bio->sectors;
1242 }
1243 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1244 /* Don't fix anything. */
1152 continue; 1245 continue;
1153 /* Ok, we need to write this bio 1246 /* Ok, we need to write this bio
1154 * First we need to fixup bv_offset, bv_len and 1247 * First we need to fixup bv_offset, bv_len and
@@ -1227,7 +1320,10 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1227 1320
1228 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1321 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1229 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1322 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1230 generic_make_request(wbio); 1323 if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1324 generic_make_request(wbio);
1325 else
1326 bio_endio(wbio, wbio->bi_size, -EIO);
1231} 1327}
1232 1328
1233 1329
@@ -1254,10 +1350,31 @@ static void raid10d(mddev_t *mddev)
1254 for (;;) { 1350 for (;;) {
1255 char b[BDEVNAME_SIZE]; 1351 char b[BDEVNAME_SIZE];
1256 spin_lock_irqsave(&conf->device_lock, flags); 1352 spin_lock_irqsave(&conf->device_lock, flags);
1353
1354 if (conf->pending_bio_list.head) {
1355 bio = bio_list_get(&conf->pending_bio_list);
1356 blk_remove_plug(mddev->queue);
1357 spin_unlock_irqrestore(&conf->device_lock, flags);
1358 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1359 if (bitmap_unplug(mddev->bitmap) != 0)
1360 printk("%s: bitmap file write failed!\n", mdname(mddev));
1361
1362 while (bio) { /* submit pending writes */
1363 struct bio *next = bio->bi_next;
1364 bio->bi_next = NULL;
1365 generic_make_request(bio);
1366 bio = next;
1367 }
1368 unplug = 1;
1369
1370 continue;
1371 }
1372
1257 if (list_empty(head)) 1373 if (list_empty(head))
1258 break; 1374 break;
1259 r10_bio = list_entry(head->prev, r10bio_t, retry_list); 1375 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1260 list_del(head->prev); 1376 list_del(head->prev);
1377 conf->nr_queued--;
1261 spin_unlock_irqrestore(&conf->device_lock, flags); 1378 spin_unlock_irqrestore(&conf->device_lock, flags);
1262 1379
1263 mddev = r10_bio->mddev; 1380 mddev = r10_bio->mddev;
@@ -1270,8 +1387,96 @@ static void raid10d(mddev_t *mddev)
1270 unplug = 1; 1387 unplug = 1;
1271 } else { 1388 } else {
1272 int mirror; 1389 int mirror;
1390 /* we got a read error. Maybe the drive is bad. Maybe just
1391 * the block and we can fix it.
1392 * We freeze all other IO, and try reading the block from
1393 * other devices. When we find one, we re-write
1394 * and check if that fixes the read error.
1395 * This is all done synchronously while the array is
1396 * frozen.
1397 */
1398 int sect = 0; /* Offset from r10_bio->sector */
1399 int sectors = r10_bio->sectors;
1400 freeze_array(conf);
1401 if (mddev->ro == 0) while(sectors) {
1402 int s = sectors;
1403 int sl = r10_bio->read_slot;
1404 int success = 0;
1405
1406 if (s > (PAGE_SIZE>>9))
1407 s = PAGE_SIZE >> 9;
1408
1409 do {
1410 int d = r10_bio->devs[sl].devnum;
1411 rdev = conf->mirrors[d].rdev;
1412 if (rdev &&
1413 test_bit(In_sync, &rdev->flags) &&
1414 sync_page_io(rdev->bdev,
1415 r10_bio->devs[sl].addr +
1416 sect + rdev->data_offset,
1417 s<<9,
1418 conf->tmppage, READ))
1419 success = 1;
1420 else {
1421 sl++;
1422 if (sl == conf->copies)
1423 sl = 0;
1424 }
1425 } while (!success && sl != r10_bio->read_slot);
1426
1427 if (success) {
1428 int start = sl;
1429 /* write it back and re-read */
1430 while (sl != r10_bio->read_slot) {
1431 int d;
1432 if (sl==0)
1433 sl = conf->copies;
1434 sl--;
1435 d = r10_bio->devs[sl].devnum;
1436 rdev = conf->mirrors[d].rdev;
1437 atomic_add(s, &rdev->corrected_errors);
1438 if (rdev &&
1439 test_bit(In_sync, &rdev->flags)) {
1440 if (sync_page_io(rdev->bdev,
1441 r10_bio->devs[sl].addr +
1442 sect + rdev->data_offset,
1443 s<<9, conf->tmppage, WRITE) == 0)
1444 /* Well, this device is dead */
1445 md_error(mddev, rdev);
1446 }
1447 }
1448 sl = start;
1449 while (sl != r10_bio->read_slot) {
1450 int d;
1451 if (sl==0)
1452 sl = conf->copies;
1453 sl--;
1454 d = r10_bio->devs[sl].devnum;
1455 rdev = conf->mirrors[d].rdev;
1456 if (rdev &&
1457 test_bit(In_sync, &rdev->flags)) {
1458 if (sync_page_io(rdev->bdev,
1459 r10_bio->devs[sl].addr +
1460 sect + rdev->data_offset,
1461 s<<9, conf->tmppage, READ) == 0)
1462 /* Well, this device is dead */
1463 md_error(mddev, rdev);
1464 }
1465 }
1466 } else {
1467 /* Cannot read from anywhere -- bye bye array */
1468 md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
1469 break;
1470 }
1471 sectors -= s;
1472 sect += s;
1473 }
1474
1475 unfreeze_array(conf);
1476
1273 bio = r10_bio->devs[r10_bio->read_slot].bio; 1477 bio = r10_bio->devs[r10_bio->read_slot].bio;
1274 r10_bio->devs[r10_bio->read_slot].bio = NULL; 1478 r10_bio->devs[r10_bio->read_slot].bio =
1479 mddev->ro ? IO_BLOCKED : NULL;
1275 bio_put(bio); 1480 bio_put(bio);
1276 mirror = read_balance(conf, r10_bio); 1481 mirror = read_balance(conf, r10_bio);
1277 if (mirror == -1) { 1482 if (mirror == -1) {
@@ -1360,6 +1565,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1360 sector_t max_sector, nr_sectors; 1565 sector_t max_sector, nr_sectors;
1361 int disk; 1566 int disk;
1362 int i; 1567 int i;
1568 int max_sync;
1569 int sync_blocks;
1363 1570
1364 sector_t sectors_skipped = 0; 1571 sector_t sectors_skipped = 0;
1365 int chunks_skipped = 0; 1572 int chunks_skipped = 0;
@@ -1373,6 +1580,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1373 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 1580 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1374 max_sector = mddev->resync_max_sectors; 1581 max_sector = mddev->resync_max_sectors;
1375 if (sector_nr >= max_sector) { 1582 if (sector_nr >= max_sector) {
1583 /* If we aborted, we need to abort the
1584 * sync on the 'current' bitmap chunks (there can
1585 * be several when recovering multiple devices).
1586 * as we may have started syncing it but not finished.
1587 * We can find the current address in
1588 * mddev->curr_resync, but for recovery,
1589 * we need to convert that to several
1590 * virtual addresses.
1591 */
1592 if (mddev->curr_resync < max_sector) { /* aborted */
1593 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1594 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1595 &sync_blocks, 1);
1596 else for (i=0; i<conf->raid_disks; i++) {
1597 sector_t sect =
1598 raid10_find_virt(conf, mddev->curr_resync, i);
1599 bitmap_end_sync(mddev->bitmap, sect,
1600 &sync_blocks, 1);
1601 }
1602 } else /* completed sync */
1603 conf->fullsync = 0;
1604
1605 bitmap_close_sync(mddev->bitmap);
1376 close_sync(conf); 1606 close_sync(conf);
1377 *skipped = 1; 1607 *skipped = 1;
1378 return sectors_skipped; 1608 return sectors_skipped;
@@ -1395,9 +1625,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1395 * If there is non-resync activity waiting for us then 1625 * If there is non-resync activity waiting for us then
1396 * put in a delay to throttle resync. 1626 * put in a delay to throttle resync.
1397 */ 1627 */
1398 if (!go_faster && waitqueue_active(&conf->wait_resume)) 1628 if (!go_faster && conf->nr_waiting)
1399 msleep_interruptible(1000); 1629 msleep_interruptible(1000);
1400 device_barrier(conf, sector_nr + RESYNC_SECTORS);
1401 1630
1402 /* Again, very different code for resync and recovery. 1631 /* Again, very different code for resync and recovery.
1403 * Both must result in an r10bio with a list of bios that 1632 * Both must result in an r10bio with a list of bios that
@@ -1414,6 +1643,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1414 * end_sync_write if we will want to write. 1643 * end_sync_write if we will want to write.
1415 */ 1644 */
1416 1645
1646 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1417 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1647 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1418 /* recovery... the complicated one */ 1648 /* recovery... the complicated one */
1419 int i, j, k; 1649 int i, j, k;
@@ -1422,14 +1652,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1422 for (i=0 ; i<conf->raid_disks; i++) 1652 for (i=0 ; i<conf->raid_disks; i++)
1423 if (conf->mirrors[i].rdev && 1653 if (conf->mirrors[i].rdev &&
1424 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { 1654 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1655 int still_degraded = 0;
1425 /* want to reconstruct this device */ 1656 /* want to reconstruct this device */
1426 r10bio_t *rb2 = r10_bio; 1657 r10bio_t *rb2 = r10_bio;
1658 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1659 int must_sync;
1660 /* Unless we are doing a full sync, we only need
1661 * to recover the block if it is set in the bitmap
1662 */
1663 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1664 &sync_blocks, 1);
1665 if (sync_blocks < max_sync)
1666 max_sync = sync_blocks;
1667 if (!must_sync &&
1668 !conf->fullsync) {
1669 /* yep, skip the sync_blocks here, but don't assume
1670 * that there will never be anything to do here
1671 */
1672 chunks_skipped = -1;
1673 continue;
1674 }
1427 1675
1428 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 1676 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1429 spin_lock_irq(&conf->resync_lock); 1677 raise_barrier(conf, rb2 != NULL);
1430 conf->nr_pending++;
1431 if (rb2) conf->barrier++;
1432 spin_unlock_irq(&conf->resync_lock);
1433 atomic_set(&r10_bio->remaining, 0); 1678 atomic_set(&r10_bio->remaining, 0);
1434 1679
1435 r10_bio->master_bio = (struct bio*)rb2; 1680 r10_bio->master_bio = (struct bio*)rb2;
@@ -1437,8 +1682,23 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1437 atomic_inc(&rb2->remaining); 1682 atomic_inc(&rb2->remaining);
1438 r10_bio->mddev = mddev; 1683 r10_bio->mddev = mddev;
1439 set_bit(R10BIO_IsRecover, &r10_bio->state); 1684 set_bit(R10BIO_IsRecover, &r10_bio->state);
1440 r10_bio->sector = raid10_find_virt(conf, sector_nr, i); 1685 r10_bio->sector = sect;
1686
1441 raid10_find_phys(conf, r10_bio); 1687 raid10_find_phys(conf, r10_bio);
1688 /* Need to check if this section will still be
1689 * degraded
1690 */
1691 for (j=0; j<conf->copies;j++) {
1692 int d = r10_bio->devs[j].devnum;
1693 if (conf->mirrors[d].rdev == NULL ||
1694 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
1695 still_degraded = 1;
1696 break;
1697 }
1698 }
1699 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1700 &sync_blocks, still_degraded);
1701
1442 for (j=0; j<conf->copies;j++) { 1702 for (j=0; j<conf->copies;j++) {
1443 int d = r10_bio->devs[j].devnum; 1703 int d = r10_bio->devs[j].devnum;
1444 if (conf->mirrors[d].rdev && 1704 if (conf->mirrors[d].rdev &&
@@ -1498,14 +1758,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1498 } else { 1758 } else {
1499 /* resync. Schedule a read for every block at this virt offset */ 1759 /* resync. Schedule a read for every block at this virt offset */
1500 int count = 0; 1760 int count = 0;
1501 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1502 1761
1503 spin_lock_irq(&conf->resync_lock); 1762 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1504 conf->nr_pending++; 1763 &sync_blocks, mddev->degraded) &&
1505 spin_unlock_irq(&conf->resync_lock); 1764 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1765 /* We can skip this block */
1766 *skipped = 1;
1767 return sync_blocks + sectors_skipped;
1768 }
1769 if (sync_blocks < max_sync)
1770 max_sync = sync_blocks;
1771 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1506 1772
1507 r10_bio->mddev = mddev; 1773 r10_bio->mddev = mddev;
1508 atomic_set(&r10_bio->remaining, 0); 1774 atomic_set(&r10_bio->remaining, 0);
1775 raise_barrier(conf, 0);
1776 conf->next_resync = sector_nr;
1509 1777
1510 r10_bio->master_bio = NULL; 1778 r10_bio->master_bio = NULL;
1511 r10_bio->sector = sector_nr; 1779 r10_bio->sector = sector_nr;
@@ -1558,6 +1826,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1558 } 1826 }
1559 1827
1560 nr_sectors = 0; 1828 nr_sectors = 0;
1829 if (sector_nr + max_sync < max_sector)
1830 max_sector = sector_nr + max_sync;
1561 do { 1831 do {
1562 struct page *page; 1832 struct page *page;
1563 int len = PAGE_SIZE; 1833 int len = PAGE_SIZE;
@@ -1632,11 +1902,11 @@ static int run(mddev_t *mddev)
1632 int nc, fc; 1902 int nc, fc;
1633 sector_t stride, size; 1903 sector_t stride, size;
1634 1904
1635 if (mddev->level != 10) { 1905 if (mddev->chunk_size == 0) {
1636 printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n", 1906 printk(KERN_ERR "md/raid10: non-zero chunk size required.\n");
1637 mdname(mddev), mddev->level); 1907 return -EINVAL;
1638 goto out;
1639 } 1908 }
1909
1640 nc = mddev->layout & 255; 1910 nc = mddev->layout & 255;
1641 fc = (mddev->layout >> 8) & 255; 1911 fc = (mddev->layout >> 8) & 255;
1642 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 1912 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
@@ -1650,22 +1920,24 @@ static int run(mddev_t *mddev)
1650 * bookkeeping area. [whatever we allocate in run(), 1920 * bookkeeping area. [whatever we allocate in run(),
1651 * should be freed in stop()] 1921 * should be freed in stop()]
1652 */ 1922 */
1653 conf = kmalloc(sizeof(conf_t), GFP_KERNEL); 1923 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1654 mddev->private = conf; 1924 mddev->private = conf;
1655 if (!conf) { 1925 if (!conf) {
1656 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 1926 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1657 mdname(mddev)); 1927 mdname(mddev));
1658 goto out; 1928 goto out;
1659 } 1929 }
1660 memset(conf, 0, sizeof(*conf)); 1930 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1661 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1662 GFP_KERNEL); 1931 GFP_KERNEL);
1663 if (!conf->mirrors) { 1932 if (!conf->mirrors) {
1664 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 1933 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1665 mdname(mddev)); 1934 mdname(mddev));
1666 goto out_free_conf; 1935 goto out_free_conf;
1667 } 1936 }
1668 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); 1937
1938 conf->tmppage = alloc_page(GFP_KERNEL);
1939 if (!conf->tmppage)
1940 goto out_free_conf;
1669 1941
1670 conf->near_copies = nc; 1942 conf->near_copies = nc;
1671 conf->far_copies = fc; 1943 conf->far_copies = fc;
@@ -1713,8 +1985,7 @@ static int run(mddev_t *mddev)
1713 INIT_LIST_HEAD(&conf->retry_list); 1985 INIT_LIST_HEAD(&conf->retry_list);
1714 1986
1715 spin_lock_init(&conf->resync_lock); 1987 spin_lock_init(&conf->resync_lock);
1716 init_waitqueue_head(&conf->wait_idle); 1988 init_waitqueue_head(&conf->wait_barrier);
1717 init_waitqueue_head(&conf->wait_resume);
1718 1989
1719 /* need to check that every block has at least one working mirror */ 1990 /* need to check that every block has at least one working mirror */
1720 if (!enough(conf)) { 1991 if (!enough(conf)) {
@@ -1763,7 +2034,7 @@ static int run(mddev_t *mddev)
1763 * maybe... 2034 * maybe...
1764 */ 2035 */
1765 { 2036 {
1766 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; 2037 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE;
1767 stripe /= conf->near_copies; 2038 stripe /= conf->near_copies;
1768 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 2039 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
1769 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2040 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
@@ -1776,6 +2047,7 @@ static int run(mddev_t *mddev)
1776out_free_conf: 2047out_free_conf:
1777 if (conf->r10bio_pool) 2048 if (conf->r10bio_pool)
1778 mempool_destroy(conf->r10bio_pool); 2049 mempool_destroy(conf->r10bio_pool);
2050 safe_put_page(conf->tmppage);
1779 kfree(conf->mirrors); 2051 kfree(conf->mirrors);
1780 kfree(conf); 2052 kfree(conf);
1781 mddev->private = NULL; 2053 mddev->private = NULL;
@@ -1798,10 +2070,31 @@ static int stop(mddev_t *mddev)
1798 return 0; 2070 return 0;
1799} 2071}
1800 2072
2073static void raid10_quiesce(mddev_t *mddev, int state)
2074{
2075 conf_t *conf = mddev_to_conf(mddev);
2076
2077 switch(state) {
2078 case 1:
2079 raise_barrier(conf, 0);
2080 break;
2081 case 0:
2082 lower_barrier(conf);
2083 break;
2084 }
2085 if (mddev->thread) {
2086 if (mddev->bitmap)
2087 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2088 else
2089 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2090 md_wakeup_thread(mddev->thread);
2091 }
2092}
1801 2093
1802static mdk_personality_t raid10_personality = 2094static struct mdk_personality raid10_personality =
1803{ 2095{
1804 .name = "raid10", 2096 .name = "raid10",
2097 .level = 10,
1805 .owner = THIS_MODULE, 2098 .owner = THIS_MODULE,
1806 .make_request = make_request, 2099 .make_request = make_request,
1807 .run = run, 2100 .run = run,
@@ -1812,19 +2105,22 @@ static mdk_personality_t raid10_personality =
1812 .hot_remove_disk= raid10_remove_disk, 2105 .hot_remove_disk= raid10_remove_disk,
1813 .spare_active = raid10_spare_active, 2106 .spare_active = raid10_spare_active,
1814 .sync_request = sync_request, 2107 .sync_request = sync_request,
2108 .quiesce = raid10_quiesce,
1815}; 2109};
1816 2110
1817static int __init raid_init(void) 2111static int __init raid_init(void)
1818{ 2112{
1819 return register_md_personality(RAID10, &raid10_personality); 2113 return register_md_personality(&raid10_personality);
1820} 2114}
1821 2115
1822static void raid_exit(void) 2116static void raid_exit(void)
1823{ 2117{
1824 unregister_md_personality(RAID10); 2118 unregister_md_personality(&raid10_personality);
1825} 2119}
1826 2120
1827module_init(raid_init); 2121module_init(raid_init);
1828module_exit(raid_exit); 2122module_exit(raid_exit);
1829MODULE_LICENSE("GPL"); 2123MODULE_LICENSE("GPL");
1830MODULE_ALIAS("md-personality-9"); /* RAID10 */ 2124MODULE_ALIAS("md-personality-9"); /* RAID10 */
2125MODULE_ALIAS("md-raid10");
2126MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fafc4bc045f..54f4a9847e3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -35,12 +35,10 @@
35#define STRIPE_SHIFT (PAGE_SHIFT - 9) 35#define STRIPE_SHIFT (PAGE_SHIFT - 9)
36#define STRIPE_SECTORS (STRIPE_SIZE>>9) 36#define STRIPE_SECTORS (STRIPE_SIZE>>9)
37#define IO_THRESHOLD 1 37#define IO_THRESHOLD 1
38#define HASH_PAGES 1 38#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
39#define HASH_PAGES_ORDER 0
40#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
41#define HASH_MASK (NR_HASH - 1) 39#define HASH_MASK (NR_HASH - 1)
42 40
43#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) 41#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
44 42
45/* bio's attached to a stripe+device for I/O are linked together in bi_sector 43/* bio's attached to a stripe+device for I/O are linked together in bi_sector
46 * order without overlap. There may be several bio's per stripe+device, and 44 * order without overlap. There may be several bio's per stripe+device, and
@@ -113,29 +111,21 @@ static void release_stripe(struct stripe_head *sh)
113 spin_unlock_irqrestore(&conf->device_lock, flags); 111 spin_unlock_irqrestore(&conf->device_lock, flags);
114} 112}
115 113
116static void remove_hash(struct stripe_head *sh) 114static inline void remove_hash(struct stripe_head *sh)
117{ 115{
118 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); 116 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
119 117
120 if (sh->hash_pprev) { 118 hlist_del_init(&sh->hash);
121 if (sh->hash_next)
122 sh->hash_next->hash_pprev = sh->hash_pprev;
123 *sh->hash_pprev = sh->hash_next;
124 sh->hash_pprev = NULL;
125 }
126} 119}
127 120
128static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 121static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
129{ 122{
130 struct stripe_head **shp = &stripe_hash(conf, sh->sector); 123 struct hlist_head *hp = stripe_hash(conf, sh->sector);
131 124
132 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); 125 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
133 126
134 CHECK_DEVLOCK(); 127 CHECK_DEVLOCK();
135 if ((sh->hash_next = *shp) != NULL) 128 hlist_add_head(&sh->hash, hp);
136 (*shp)->hash_pprev = &sh->hash_next;
137 *shp = sh;
138 sh->hash_pprev = shp;
139} 129}
140 130
141 131
@@ -167,7 +157,7 @@ static void shrink_buffers(struct stripe_head *sh, int num)
167 if (!p) 157 if (!p)
168 continue; 158 continue;
169 sh->dev[i].page = NULL; 159 sh->dev[i].page = NULL;
170 page_cache_release(p); 160 put_page(p);
171 } 161 }
172} 162}
173 163
@@ -228,10 +218,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
228static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) 218static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
229{ 219{
230 struct stripe_head *sh; 220 struct stripe_head *sh;
221 struct hlist_node *hn;
231 222
232 CHECK_DEVLOCK(); 223 CHECK_DEVLOCK();
233 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 224 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
234 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) 225 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
235 if (sh->sector == sector) 226 if (sh->sector == sector)
236 return sh; 227 return sh;
237 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 228 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -417,7 +408,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
417 set_bit(R5_UPTODATE, &sh->dev[i].flags); 408 set_bit(R5_UPTODATE, &sh->dev[i].flags);
418#endif 409#endif
419 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 410 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
420 printk("R5: read error corrected!!\n"); 411 printk(KERN_INFO "raid5: read error corrected!!\n");
421 clear_bit(R5_ReadError, &sh->dev[i].flags); 412 clear_bit(R5_ReadError, &sh->dev[i].flags);
422 clear_bit(R5_ReWrite, &sh->dev[i].flags); 413 clear_bit(R5_ReWrite, &sh->dev[i].flags);
423 } 414 }
@@ -428,13 +419,14 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
428 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 419 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
429 atomic_inc(&conf->disks[i].rdev->read_errors); 420 atomic_inc(&conf->disks[i].rdev->read_errors);
430 if (conf->mddev->degraded) 421 if (conf->mddev->degraded)
431 printk("R5: read error not correctable.\n"); 422 printk(KERN_WARNING "raid5: read error not correctable.\n");
432 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 423 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
433 /* Oh, no!!! */ 424 /* Oh, no!!! */
434 printk("R5: read error NOT corrected!!\n"); 425 printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
435 else if (atomic_read(&conf->disks[i].rdev->read_errors) 426 else if (atomic_read(&conf->disks[i].rdev->read_errors)
436 > conf->max_nr_stripes) 427 > conf->max_nr_stripes)
437 printk("raid5: Too many read errors, failing device.\n"); 428 printk(KERN_WARNING
429 "raid5: Too many read errors, failing device.\n");
438 else 430 else
439 retry = 1; 431 retry = 1;
440 if (retry) 432 if (retry)
@@ -604,7 +596,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
604 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 596 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
605 break; 597 break;
606 default: 598 default:
607 printk("raid5: unsupported algorithm %d\n", 599 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
608 conf->algorithm); 600 conf->algorithm);
609 } 601 }
610 602
@@ -645,7 +637,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
645 i -= (sh->pd_idx + 1); 637 i -= (sh->pd_idx + 1);
646 break; 638 break;
647 default: 639 default:
648 printk("raid5: unsupported algorithm %d\n", 640 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
649 conf->algorithm); 641 conf->algorithm);
650 } 642 }
651 643
@@ -654,7 +646,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
654 646
655 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); 647 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
656 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { 648 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
657 printk("compute_blocknr: map not correct\n"); 649 printk(KERN_ERR "compute_blocknr: map not correct\n");
658 return 0; 650 return 0;
659 } 651 }
660 return r_sector; 652 return r_sector;
@@ -737,7 +729,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
737 if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) 729 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
738 ptr[count++] = p; 730 ptr[count++] = p;
739 else 731 else
740 printk("compute_block() %d, stripe %llu, %d" 732 printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
741 " not present\n", dd_idx, 733 " not present\n", dd_idx,
742 (unsigned long long)sh->sector, i); 734 (unsigned long long)sh->sector, i);
743 735
@@ -960,11 +952,11 @@ static void handle_stripe(struct stripe_head *sh)
960 syncing = test_bit(STRIPE_SYNCING, &sh->state); 952 syncing = test_bit(STRIPE_SYNCING, &sh->state);
961 /* Now to look around and see what can be done */ 953 /* Now to look around and see what can be done */
962 954
955 rcu_read_lock();
963 for (i=disks; i--; ) { 956 for (i=disks; i--; ) {
964 mdk_rdev_t *rdev; 957 mdk_rdev_t *rdev;
965 dev = &sh->dev[i]; 958 dev = &sh->dev[i];
966 clear_bit(R5_Insync, &dev->flags); 959 clear_bit(R5_Insync, &dev->flags);
967 clear_bit(R5_Syncio, &dev->flags);
968 960
969 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 961 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
970 i, dev->flags, dev->toread, dev->towrite, dev->written); 962 i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1003,9 +995,9 @@ static void handle_stripe(struct stripe_head *sh)
1003 non_overwrite++; 995 non_overwrite++;
1004 } 996 }
1005 if (dev->written) written++; 997 if (dev->written) written++;
1006 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ 998 rdev = rcu_dereference(conf->disks[i].rdev);
1007 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 999 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1008 /* The ReadError flag wil just be confusing now */ 1000 /* The ReadError flag will just be confusing now */
1009 clear_bit(R5_ReadError, &dev->flags); 1001 clear_bit(R5_ReadError, &dev->flags);
1010 clear_bit(R5_ReWrite, &dev->flags); 1002 clear_bit(R5_ReWrite, &dev->flags);
1011 } 1003 }
@@ -1016,6 +1008,7 @@ static void handle_stripe(struct stripe_head *sh)
1016 } else 1008 } else
1017 set_bit(R5_Insync, &dev->flags); 1009 set_bit(R5_Insync, &dev->flags);
1018 } 1010 }
1011 rcu_read_unlock();
1019 PRINTK("locked=%d uptodate=%d to_read=%d" 1012 PRINTK("locked=%d uptodate=%d to_read=%d"
1020 " to_write=%d failed=%d failed_num=%d\n", 1013 " to_write=%d failed=%d failed_num=%d\n",
1021 locked, uptodate, to_read, to_write, failed, failed_num); 1014 locked, uptodate, to_read, to_write, failed, failed_num);
@@ -1027,10 +1020,13 @@ static void handle_stripe(struct stripe_head *sh)
1027 int bitmap_end = 0; 1020 int bitmap_end = 0;
1028 1021
1029 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1022 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1030 mdk_rdev_t *rdev = conf->disks[i].rdev; 1023 mdk_rdev_t *rdev;
1024 rcu_read_lock();
1025 rdev = rcu_dereference(conf->disks[i].rdev);
1031 if (rdev && test_bit(In_sync, &rdev->flags)) 1026 if (rdev && test_bit(In_sync, &rdev->flags))
1032 /* multiple read failures in one stripe */ 1027 /* multiple read failures in one stripe */
1033 md_error(conf->mddev, rdev); 1028 md_error(conf->mddev, rdev);
1029 rcu_read_unlock();
1034 } 1030 }
1035 1031
1036 spin_lock_irq(&conf->device_lock); 1032 spin_lock_irq(&conf->device_lock);
@@ -1179,9 +1175,6 @@ static void handle_stripe(struct stripe_head *sh)
1179 locked++; 1175 locked++;
1180 PRINTK("Reading block %d (sync=%d)\n", 1176 PRINTK("Reading block %d (sync=%d)\n",
1181 i, syncing); 1177 i, syncing);
1182 if (syncing)
1183 md_sync_acct(conf->disks[i].rdev->bdev,
1184 STRIPE_SECTORS);
1185 } 1178 }
1186 } 1179 }
1187 } 1180 }
@@ -1288,7 +1281,7 @@ static void handle_stripe(struct stripe_head *sh)
1288 * is available 1281 * is available
1289 */ 1282 */
1290 if (syncing && locked == 0 && 1283 if (syncing && locked == 0 &&
1291 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { 1284 !test_bit(STRIPE_INSYNC, &sh->state)) {
1292 set_bit(STRIPE_HANDLE, &sh->state); 1285 set_bit(STRIPE_HANDLE, &sh->state);
1293 if (failed == 0) { 1286 if (failed == 0) {
1294 char *pagea; 1287 char *pagea;
@@ -1306,27 +1299,25 @@ static void handle_stripe(struct stripe_head *sh)
1306 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 1299 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1307 /* don't try to repair!! */ 1300 /* don't try to repair!! */
1308 set_bit(STRIPE_INSYNC, &sh->state); 1301 set_bit(STRIPE_INSYNC, &sh->state);
1302 else {
1303 compute_block(sh, sh->pd_idx);
1304 uptodate++;
1305 }
1309 } 1306 }
1310 } 1307 }
1311 if (!test_bit(STRIPE_INSYNC, &sh->state)) { 1308 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1309 /* either failed parity check, or recovery is happening */
1312 if (failed==0) 1310 if (failed==0)
1313 failed_num = sh->pd_idx; 1311 failed_num = sh->pd_idx;
1314 /* should be able to compute the missing block and write it to spare */
1315 if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
1316 if (uptodate+1 != disks)
1317 BUG();
1318 compute_block(sh, failed_num);
1319 uptodate++;
1320 }
1321 if (uptodate != disks)
1322 BUG();
1323 dev = &sh->dev[failed_num]; 1312 dev = &sh->dev[failed_num];
1313 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
1314 BUG_ON(uptodate != disks);
1315
1324 set_bit(R5_LOCKED, &dev->flags); 1316 set_bit(R5_LOCKED, &dev->flags);
1325 set_bit(R5_Wantwrite, &dev->flags); 1317 set_bit(R5_Wantwrite, &dev->flags);
1326 clear_bit(STRIPE_DEGRADED, &sh->state); 1318 clear_bit(STRIPE_DEGRADED, &sh->state);
1327 locked++; 1319 locked++;
1328 set_bit(STRIPE_INSYNC, &sh->state); 1320 set_bit(STRIPE_INSYNC, &sh->state);
1329 set_bit(R5_Syncio, &dev->flags);
1330 } 1321 }
1331 } 1322 }
1332 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 1323 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -1392,7 +1383,7 @@ static void handle_stripe(struct stripe_head *sh)
1392 rcu_read_unlock(); 1383 rcu_read_unlock();
1393 1384
1394 if (rdev) { 1385 if (rdev) {
1395 if (test_bit(R5_Syncio, &sh->dev[i].flags)) 1386 if (syncing)
1396 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1387 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1397 1388
1398 bi->bi_bdev = rdev->bdev; 1389 bi->bi_bdev = rdev->bdev;
@@ -1409,6 +1400,9 @@ static void handle_stripe(struct stripe_head *sh)
1409 bi->bi_io_vec[0].bv_offset = 0; 1400 bi->bi_io_vec[0].bv_offset = 0;
1410 bi->bi_size = STRIPE_SIZE; 1401 bi->bi_size = STRIPE_SIZE;
1411 bi->bi_next = NULL; 1402 bi->bi_next = NULL;
1403 if (rw == WRITE &&
1404 test_bit(R5_ReWrite, &sh->dev[i].flags))
1405 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1412 generic_make_request(bi); 1406 generic_make_request(bi);
1413 } else { 1407 } else {
1414 if (rw == 1) 1408 if (rw == 1)
@@ -1822,21 +1816,21 @@ static int run(mddev_t *mddev)
1822 struct list_head *tmp; 1816 struct list_head *tmp;
1823 1817
1824 if (mddev->level != 5 && mddev->level != 4) { 1818 if (mddev->level != 5 && mddev->level != 4) {
1825 printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level); 1819 printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n",
1820 mdname(mddev), mddev->level);
1826 return -EIO; 1821 return -EIO;
1827 } 1822 }
1828 1823
1829 mddev->private = kmalloc (sizeof (raid5_conf_t) 1824 mddev->private = kzalloc(sizeof (raid5_conf_t)
1830 + mddev->raid_disks * sizeof(struct disk_info), 1825 + mddev->raid_disks * sizeof(struct disk_info),
1831 GFP_KERNEL); 1826 GFP_KERNEL);
1832 if ((conf = mddev->private) == NULL) 1827 if ((conf = mddev->private) == NULL)
1833 goto abort; 1828 goto abort;
1834 memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); 1829
1835 conf->mddev = mddev; 1830 conf->mddev = mddev;
1836 1831
1837 if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) 1832 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
1838 goto abort; 1833 goto abort;
1839 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1840 1834
1841 spin_lock_init(&conf->device_lock); 1835 spin_lock_init(&conf->device_lock);
1842 init_waitqueue_head(&conf->wait_for_stripe); 1836 init_waitqueue_head(&conf->wait_for_stripe);
@@ -1903,10 +1897,17 @@ static int run(mddev_t *mddev)
1903 1897
1904 if (mddev->degraded == 1 && 1898 if (mddev->degraded == 1 &&
1905 mddev->recovery_cp != MaxSector) { 1899 mddev->recovery_cp != MaxSector) {
1906 printk(KERN_ERR 1900 if (mddev->ok_start_degraded)
1907 "raid5: cannot start dirty degraded array for %s\n", 1901 printk(KERN_WARNING
1908 mdname(mddev)); 1902 "raid5: starting dirty degraded array: %s"
1909 goto abort; 1903 "- data corruption possible.\n",
1904 mdname(mddev));
1905 else {
1906 printk(KERN_ERR
1907 "raid5: cannot start dirty degraded array for %s\n",
1908 mdname(mddev));
1909 goto abort;
1910 }
1910 } 1911 }
1911 1912
1912 { 1913 {
@@ -1948,7 +1949,7 @@ static int run(mddev_t *mddev)
1948 */ 1949 */
1949 { 1950 {
1950 int stripe = (mddev->raid_disks-1) * mddev->chunk_size 1951 int stripe = (mddev->raid_disks-1) * mddev->chunk_size
1951 / PAGE_CACHE_SIZE; 1952 / PAGE_SIZE;
1952 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 1953 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
1953 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 1954 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
1954 } 1955 }
@@ -1956,9 +1957,6 @@ static int run(mddev_t *mddev)
1956 /* Ok, everything is just fine now */ 1957 /* Ok, everything is just fine now */
1957 sysfs_create_group(&mddev->kobj, &raid5_attrs_group); 1958 sysfs_create_group(&mddev->kobj, &raid5_attrs_group);
1958 1959
1959 if (mddev->bitmap)
1960 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1961
1962 mddev->queue->unplug_fn = raid5_unplug_device; 1960 mddev->queue->unplug_fn = raid5_unplug_device;
1963 mddev->queue->issue_flush_fn = raid5_issue_flush; 1961 mddev->queue->issue_flush_fn = raid5_issue_flush;
1964 1962
@@ -1967,9 +1965,7 @@ static int run(mddev_t *mddev)
1967abort: 1965abort:
1968 if (conf) { 1966 if (conf) {
1969 print_raid5_conf(conf); 1967 print_raid5_conf(conf);
1970 if (conf->stripe_hashtbl) 1968 kfree(conf->stripe_hashtbl);
1971 free_pages((unsigned long) conf->stripe_hashtbl,
1972 HASH_PAGES_ORDER);
1973 kfree(conf); 1969 kfree(conf);
1974 } 1970 }
1975 mddev->private = NULL; 1971 mddev->private = NULL;
@@ -1986,7 +1982,7 @@ static int stop(mddev_t *mddev)
1986 md_unregister_thread(mddev->thread); 1982 md_unregister_thread(mddev->thread);
1987 mddev->thread = NULL; 1983 mddev->thread = NULL;
1988 shrink_stripes(conf); 1984 shrink_stripes(conf);
1989 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); 1985 kfree(conf->stripe_hashtbl);
1990 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 1986 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1991 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 1987 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
1992 kfree(conf); 1988 kfree(conf);
@@ -2014,12 +2010,12 @@ static void print_sh (struct stripe_head *sh)
2014static void printall (raid5_conf_t *conf) 2010static void printall (raid5_conf_t *conf)
2015{ 2011{
2016 struct stripe_head *sh; 2012 struct stripe_head *sh;
2013 struct hlist_node *hn;
2017 int i; 2014 int i;
2018 2015
2019 spin_lock_irq(&conf->device_lock); 2016 spin_lock_irq(&conf->device_lock);
2020 for (i = 0; i < NR_HASH; i++) { 2017 for (i = 0; i < NR_HASH; i++) {
2021 sh = conf->stripe_hashtbl[i]; 2018 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2022 for (; sh; sh = sh->hash_next) {
2023 if (sh->raid_conf != conf) 2019 if (sh->raid_conf != conf)
2024 continue; 2020 continue;
2025 print_sh(sh); 2021 print_sh(sh);
@@ -2192,17 +2188,12 @@ static void raid5_quiesce(mddev_t *mddev, int state)
2192 spin_unlock_irq(&conf->device_lock); 2188 spin_unlock_irq(&conf->device_lock);
2193 break; 2189 break;
2194 } 2190 }
2195 if (mddev->thread) {
2196 if (mddev->bitmap)
2197 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2198 else
2199 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2200 md_wakeup_thread(mddev->thread);
2201 }
2202} 2191}
2203static mdk_personality_t raid5_personality= 2192
2193static struct mdk_personality raid5_personality =
2204{ 2194{
2205 .name = "raid5", 2195 .name = "raid5",
2196 .level = 5,
2206 .owner = THIS_MODULE, 2197 .owner = THIS_MODULE,
2207 .make_request = make_request, 2198 .make_request = make_request,
2208 .run = run, 2199 .run = run,
@@ -2217,17 +2208,42 @@ static mdk_personality_t raid5_personality=
2217 .quiesce = raid5_quiesce, 2208 .quiesce = raid5_quiesce,
2218}; 2209};
2219 2210
2220static int __init raid5_init (void) 2211static struct mdk_personality raid4_personality =
2221{ 2212{
2222 return register_md_personality (RAID5, &raid5_personality); 2213 .name = "raid4",
2214 .level = 4,
2215 .owner = THIS_MODULE,
2216 .make_request = make_request,
2217 .run = run,
2218 .stop = stop,
2219 .status = status,
2220 .error_handler = error,
2221 .hot_add_disk = raid5_add_disk,
2222 .hot_remove_disk= raid5_remove_disk,
2223 .spare_active = raid5_spare_active,
2224 .sync_request = sync_request,
2225 .resize = raid5_resize,
2226 .quiesce = raid5_quiesce,
2227};
2228
2229static int __init raid5_init(void)
2230{
2231 register_md_personality(&raid5_personality);
2232 register_md_personality(&raid4_personality);
2233 return 0;
2223} 2234}
2224 2235
2225static void raid5_exit (void) 2236static void raid5_exit(void)
2226{ 2237{
2227 unregister_md_personality (RAID5); 2238 unregister_md_personality(&raid5_personality);
2239 unregister_md_personality(&raid4_personality);
2228} 2240}
2229 2241
2230module_init(raid5_init); 2242module_init(raid5_init);
2231module_exit(raid5_exit); 2243module_exit(raid5_exit);
2232MODULE_LICENSE("GPL"); 2244MODULE_LICENSE("GPL");
2233MODULE_ALIAS("md-personality-4"); /* RAID5 */ 2245MODULE_ALIAS("md-personality-4"); /* RAID5 */
2246MODULE_ALIAS("md-raid5");
2247MODULE_ALIAS("md-raid4");
2248MODULE_ALIAS("md-level-5");
2249MODULE_ALIAS("md-level-4");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 0000d162d19..8c823d686a6 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -40,12 +40,10 @@
40#define STRIPE_SHIFT (PAGE_SHIFT - 9) 40#define STRIPE_SHIFT (PAGE_SHIFT - 9)
41#define STRIPE_SECTORS (STRIPE_SIZE>>9) 41#define STRIPE_SECTORS (STRIPE_SIZE>>9)
42#define IO_THRESHOLD 1 42#define IO_THRESHOLD 1
43#define HASH_PAGES 1 43#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
44#define HASH_PAGES_ORDER 0
45#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
46#define HASH_MASK (NR_HASH - 1) 44#define HASH_MASK (NR_HASH - 1)
47 45
48#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) 46#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
49 47
50/* bio's attached to a stripe+device for I/O are linked together in bi_sector 48/* bio's attached to a stripe+device for I/O are linked together in bi_sector
51 * order without overlap. There may be several bio's per stripe+device, and 49 * order without overlap. There may be several bio's per stripe+device, and
@@ -132,29 +130,21 @@ static void release_stripe(struct stripe_head *sh)
132 spin_unlock_irqrestore(&conf->device_lock, flags); 130 spin_unlock_irqrestore(&conf->device_lock, flags);
133} 131}
134 132
135static void remove_hash(struct stripe_head *sh) 133static inline void remove_hash(struct stripe_head *sh)
136{ 134{
137 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); 135 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
138 136
139 if (sh->hash_pprev) { 137 hlist_del_init(&sh->hash);
140 if (sh->hash_next)
141 sh->hash_next->hash_pprev = sh->hash_pprev;
142 *sh->hash_pprev = sh->hash_next;
143 sh->hash_pprev = NULL;
144 }
145} 138}
146 139
147static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) 140static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
148{ 141{
149 struct stripe_head **shp = &stripe_hash(conf, sh->sector); 142 struct hlist_head *hp = stripe_hash(conf, sh->sector);
150 143
151 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); 144 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
152 145
153 CHECK_DEVLOCK(); 146 CHECK_DEVLOCK();
154 if ((sh->hash_next = *shp) != NULL) 147 hlist_add_head(&sh->hash, hp);
155 (*shp)->hash_pprev = &sh->hash_next;
156 *shp = sh;
157 sh->hash_pprev = shp;
158} 148}
159 149
160 150
@@ -186,7 +176,7 @@ static void shrink_buffers(struct stripe_head *sh, int num)
186 if (!p) 176 if (!p)
187 continue; 177 continue;
188 sh->dev[i].page = NULL; 178 sh->dev[i].page = NULL;
189 page_cache_release(p); 179 put_page(p);
190 } 180 }
191} 181}
192 182
@@ -247,10 +237,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
247static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) 237static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
248{ 238{
249 struct stripe_head *sh; 239 struct stripe_head *sh;
240 struct hlist_node *hn;
250 241
251 CHECK_DEVLOCK(); 242 CHECK_DEVLOCK();
252 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 243 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
253 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) 244 hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash)
254 if (sh->sector == sector) 245 if (sh->sector == sector)
255 return sh; 246 return sh;
256 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 247 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -367,8 +358,8 @@ static void shrink_stripes(raid6_conf_t *conf)
367 conf->slab_cache = NULL; 358 conf->slab_cache = NULL;
368} 359}
369 360
370static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, 361static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
371 int error) 362 int error)
372{ 363{
373 struct stripe_head *sh = bi->bi_private; 364 struct stripe_head *sh = bi->bi_private;
374 raid6_conf_t *conf = sh->raid_conf; 365 raid6_conf_t *conf = sh->raid_conf;
@@ -420,9 +411,35 @@ static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done,
420#else 411#else
421 set_bit(R5_UPTODATE, &sh->dev[i].flags); 412 set_bit(R5_UPTODATE, &sh->dev[i].flags);
422#endif 413#endif
414 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
415 printk(KERN_INFO "raid6: read error corrected!!\n");
416 clear_bit(R5_ReadError, &sh->dev[i].flags);
417 clear_bit(R5_ReWrite, &sh->dev[i].flags);
418 }
419 if (atomic_read(&conf->disks[i].rdev->read_errors))
420 atomic_set(&conf->disks[i].rdev->read_errors, 0);
423 } else { 421 } else {
424 md_error(conf->mddev, conf->disks[i].rdev); 422 int retry = 0;
425 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 423 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
424 atomic_inc(&conf->disks[i].rdev->read_errors);
425 if (conf->mddev->degraded)
426 printk(KERN_WARNING "raid6: read error not correctable.\n");
427 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
428 /* Oh, no!!! */
429 printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
430 else if (atomic_read(&conf->disks[i].rdev->read_errors)
431 > conf->max_nr_stripes)
432 printk(KERN_WARNING
433 "raid6: Too many read errors, failing device.\n");
434 else
435 retry = 1;
436 if (retry)
437 set_bit(R5_ReadError, &sh->dev[i].flags);
438 else {
439 clear_bit(R5_ReadError, &sh->dev[i].flags);
440 clear_bit(R5_ReWrite, &sh->dev[i].flags);
441 md_error(conf->mddev, conf->disks[i].rdev);
442 }
426 } 443 }
427 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 444 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
428#if 0 445#if 0
@@ -805,7 +822,7 @@ static void compute_parity(struct stripe_head *sh, int method)
805} 822}
806 823
807/* Compute one missing block */ 824/* Compute one missing block */
808static void compute_block_1(struct stripe_head *sh, int dd_idx) 825static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
809{ 826{
810 raid6_conf_t *conf = sh->raid_conf; 827 raid6_conf_t *conf = sh->raid_conf;
811 int i, count, disks = conf->raid_disks; 828 int i, count, disks = conf->raid_disks;
@@ -821,7 +838,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
821 compute_parity(sh, UPDATE_PARITY); 838 compute_parity(sh, UPDATE_PARITY);
822 } else { 839 } else {
823 ptr[0] = page_address(sh->dev[dd_idx].page); 840 ptr[0] = page_address(sh->dev[dd_idx].page);
824 memset(ptr[0], 0, STRIPE_SIZE); 841 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
825 count = 1; 842 count = 1;
826 for (i = disks ; i--; ) { 843 for (i = disks ; i--; ) {
827 if (i == dd_idx || i == qd_idx) 844 if (i == dd_idx || i == qd_idx)
@@ -838,7 +855,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
838 } 855 }
839 if (count != 1) 856 if (count != 1)
840 xor_block(count, STRIPE_SIZE, ptr); 857 xor_block(count, STRIPE_SIZE, ptr);
841 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 858 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
859 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
842 } 860 }
843} 861}
844 862
@@ -871,7 +889,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
871 return; 889 return;
872 } else { 890 } else {
873 /* We're missing D+Q; recompute D from P */ 891 /* We're missing D+Q; recompute D from P */
874 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); 892 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
875 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ 893 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
876 return; 894 return;
877 } 895 }
@@ -982,6 +1000,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
982} 1000}
983 1001
984 1002
1003static int page_is_zero(struct page *p)
1004{
1005 char *a = page_address(p);
1006 return ((*(u32*)a) == 0 &&
1007 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1008}
985/* 1009/*
986 * handle_stripe - do things to a stripe. 1010 * handle_stripe - do things to a stripe.
987 * 1011 *
@@ -1000,7 +1024,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1000 * 1024 *
1001 */ 1025 */
1002 1026
1003static void handle_stripe(struct stripe_head *sh) 1027static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
1004{ 1028{
1005 raid6_conf_t *conf = sh->raid_conf; 1029 raid6_conf_t *conf = sh->raid_conf;
1006 int disks = conf->raid_disks; 1030 int disks = conf->raid_disks;
@@ -1027,11 +1051,11 @@ static void handle_stripe(struct stripe_head *sh)
1027 syncing = test_bit(STRIPE_SYNCING, &sh->state); 1051 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1028 /* Now to look around and see what can be done */ 1052 /* Now to look around and see what can be done */
1029 1053
1054 rcu_read_lock();
1030 for (i=disks; i--; ) { 1055 for (i=disks; i--; ) {
1031 mdk_rdev_t *rdev; 1056 mdk_rdev_t *rdev;
1032 dev = &sh->dev[i]; 1057 dev = &sh->dev[i];
1033 clear_bit(R5_Insync, &dev->flags); 1058 clear_bit(R5_Insync, &dev->flags);
1034 clear_bit(R5_Syncio, &dev->flags);
1035 1059
1036 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 1060 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1037 i, dev->flags, dev->toread, dev->towrite, dev->written); 1061 i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1070,14 +1094,21 @@ static void handle_stripe(struct stripe_head *sh)
1070 non_overwrite++; 1094 non_overwrite++;
1071 } 1095 }
1072 if (dev->written) written++; 1096 if (dev->written) written++;
1073 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ 1097 rdev = rcu_dereference(conf->disks[i].rdev);
1074 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 1098 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1099 /* The ReadError flag will just be confusing now */
1100 clear_bit(R5_ReadError, &dev->flags);
1101 clear_bit(R5_ReWrite, &dev->flags);
1102 }
1103 if (!rdev || !test_bit(In_sync, &rdev->flags)
1104 || test_bit(R5_ReadError, &dev->flags)) {
1075 if ( failed < 2 ) 1105 if ( failed < 2 )
1076 failed_num[failed] = i; 1106 failed_num[failed] = i;
1077 failed++; 1107 failed++;
1078 } else 1108 } else
1079 set_bit(R5_Insync, &dev->flags); 1109 set_bit(R5_Insync, &dev->flags);
1080 } 1110 }
1111 rcu_read_unlock();
1081 PRINTK("locked=%d uptodate=%d to_read=%d" 1112 PRINTK("locked=%d uptodate=%d to_read=%d"
1082 " to_write=%d failed=%d failed_num=%d,%d\n", 1113 " to_write=%d failed=%d failed_num=%d,%d\n",
1083 locked, uptodate, to_read, to_write, failed, 1114 locked, uptodate, to_read, to_write, failed,
@@ -1088,6 +1119,17 @@ static void handle_stripe(struct stripe_head *sh)
1088 if (failed > 2 && to_read+to_write+written) { 1119 if (failed > 2 && to_read+to_write+written) {
1089 for (i=disks; i--; ) { 1120 for (i=disks; i--; ) {
1090 int bitmap_end = 0; 1121 int bitmap_end = 0;
1122
1123 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1124 mdk_rdev_t *rdev;
1125 rcu_read_lock();
1126 rdev = rcu_dereference(conf->disks[i].rdev);
1127 if (rdev && test_bit(In_sync, &rdev->flags))
1128 /* multiple read failures in one stripe */
1129 md_error(conf->mddev, rdev);
1130 rcu_read_unlock();
1131 }
1132
1091 spin_lock_irq(&conf->device_lock); 1133 spin_lock_irq(&conf->device_lock);
1092 /* fail all writes first */ 1134 /* fail all writes first */
1093 bi = sh->dev[i].towrite; 1135 bi = sh->dev[i].towrite;
@@ -1123,7 +1165,8 @@ static void handle_stripe(struct stripe_head *sh)
1123 } 1165 }
1124 1166
1125 /* fail any reads if this device is non-operational */ 1167 /* fail any reads if this device is non-operational */
1126 if (!test_bit(R5_Insync, &sh->dev[i].flags)) { 1168 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1169 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1127 bi = sh->dev[i].toread; 1170 bi = sh->dev[i].toread;
1128 sh->dev[i].toread = NULL; 1171 sh->dev[i].toread = NULL;
1129 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1172 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -1228,7 +1271,7 @@ static void handle_stripe(struct stripe_head *sh)
1228 if (uptodate == disks-1) { 1271 if (uptodate == disks-1) {
1229 PRINTK("Computing stripe %llu block %d\n", 1272 PRINTK("Computing stripe %llu block %d\n",
1230 (unsigned long long)sh->sector, i); 1273 (unsigned long long)sh->sector, i);
1231 compute_block_1(sh, i); 1274 compute_block_1(sh, i, 0);
1232 uptodate++; 1275 uptodate++;
1233 } else if ( uptodate == disks-2 && failed >= 2 ) { 1276 } else if ( uptodate == disks-2 && failed >= 2 ) {
1234 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ 1277 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
@@ -1259,9 +1302,6 @@ static void handle_stripe(struct stripe_head *sh)
1259 locked++; 1302 locked++;
1260 PRINTK("Reading block %d (sync=%d)\n", 1303 PRINTK("Reading block %d (sync=%d)\n",
1261 i, syncing); 1304 i, syncing);
1262 if (syncing)
1263 md_sync_acct(conf->disks[i].rdev->bdev,
1264 STRIPE_SECTORS);
1265 } 1305 }
1266 } 1306 }
1267 } 1307 }
@@ -1323,7 +1363,7 @@ static void handle_stripe(struct stripe_head *sh)
1323 /* We have failed blocks and need to compute them */ 1363 /* We have failed blocks and need to compute them */
1324 switch ( failed ) { 1364 switch ( failed ) {
1325 case 0: BUG(); 1365 case 0: BUG();
1326 case 1: compute_block_1(sh, failed_num[0]); break; 1366 case 1: compute_block_1(sh, failed_num[0], 0); break;
1327 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; 1367 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1328 default: BUG(); /* This request should have been failed? */ 1368 default: BUG(); /* This request should have been failed? */
1329 } 1369 }
@@ -1338,12 +1378,10 @@ static void handle_stripe(struct stripe_head *sh)
1338 (unsigned long long)sh->sector, i); 1378 (unsigned long long)sh->sector, i);
1339 locked++; 1379 locked++;
1340 set_bit(R5_Wantwrite, &sh->dev[i].flags); 1380 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1341#if 0 /**** FIX: I don't understand the logic here... ****/
1342 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1343 || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */
1344 set_bit(STRIPE_INSYNC, &sh->state);
1345#endif
1346 } 1381 }
1382 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1383 set_bit(STRIPE_INSYNC, &sh->state);
1384
1347 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 1385 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1348 atomic_dec(&conf->preread_active_stripes); 1386 atomic_dec(&conf->preread_active_stripes);
1349 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 1387 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -1356,84 +1394,119 @@ static void handle_stripe(struct stripe_head *sh)
1356 * Any reads will already have been scheduled, so we just see if enough data 1394 * Any reads will already have been scheduled, so we just see if enough data
1357 * is available 1395 * is available
1358 */ 1396 */
1359 if (syncing && locked == 0 && 1397 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
1360 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { 1398 int update_p = 0, update_q = 0;
1361 set_bit(STRIPE_HANDLE, &sh->state); 1399 struct r5dev *dev;
1362#if 0 /* RAID-6: Don't support CHECK PARITY yet */
1363 if (failed == 0) {
1364 char *pagea;
1365 if (uptodate != disks)
1366 BUG();
1367 compute_parity(sh, CHECK_PARITY);
1368 uptodate--;
1369 pagea = page_address(sh->dev[pd_idx].page);
1370 if ((*(u32*)pagea) == 0 &&
1371 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1372 /* parity is correct (on disc, not in buffer any more) */
1373 set_bit(STRIPE_INSYNC, &sh->state);
1374 }
1375 }
1376#endif
1377 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1378 int failed_needupdate[2];
1379 struct r5dev *adev, *bdev;
1380
1381 if ( failed < 1 )
1382 failed_num[0] = pd_idx;
1383 if ( failed < 2 )
1384 failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
1385 1400
1386 failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); 1401 set_bit(STRIPE_HANDLE, &sh->state);
1387 failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
1388 1402
1389 PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", 1403 BUG_ON(failed>2);
1390 failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); 1404 BUG_ON(uptodate < disks);
1405 /* Want to check and possibly repair P and Q.
1406 * However there could be one 'failed' device, in which
1407 * case we can only check one of them, possibly using the
1408 * other to generate missing data
1409 */
1391 1410
1392#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ 1411 /* If !tmp_page, we cannot do the calculations,
1393 /* should be able to compute the missing block(s) and write to spare */ 1412 * but as we have set STRIPE_HANDLE, we will soon be called
1394 if ( failed_needupdate[0] ^ failed_needupdate[1] ) { 1413 * by stripe_handle with a tmp_page - just wait until then.
1395 if (uptodate+1 != disks) 1414 */
1396 BUG(); 1415 if (tmp_page) {
1397 compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); 1416 if (failed == q_failed) {
1398 uptodate++; 1417 /* The only possible failed device holds 'Q', so it makes
1399 } else if ( failed_needupdate[0] & failed_needupdate[1] ) { 1418 * sense to check P (If anything else were failed, we would
1400 if (uptodate+2 != disks) 1419 * have used P to recreate it).
1401 BUG(); 1420 */
1402 compute_block_2(sh, failed_num[0], failed_num[1]); 1421 compute_block_1(sh, pd_idx, 1);
1403 uptodate += 2; 1422 if (!page_is_zero(sh->dev[pd_idx].page)) {
1423 compute_block_1(sh,pd_idx,0);
1424 update_p = 1;
1425 }
1426 }
1427 if (!q_failed && failed < 2) {
1428 /* q is not failed, and we didn't use it to generate
1429 * anything, so it makes sense to check it
1430 */
1431 memcpy(page_address(tmp_page),
1432 page_address(sh->dev[qd_idx].page),
1433 STRIPE_SIZE);
1434 compute_parity(sh, UPDATE_PARITY);
1435 if (memcmp(page_address(tmp_page),
1436 page_address(sh->dev[qd_idx].page),
1437 STRIPE_SIZE)!= 0) {
1438 clear_bit(STRIPE_INSYNC, &sh->state);
1439 update_q = 1;
1440 }
1441 }
1442 if (update_p || update_q) {
1443 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1444 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1445 /* don't try to repair!! */
1446 update_p = update_q = 0;
1404 } 1447 }
1405#else
1406 compute_block_2(sh, failed_num[0], failed_num[1]);
1407 uptodate += failed_needupdate[0] + failed_needupdate[1];
1408#endif
1409 1448
1410 if (uptodate != disks) 1449 /* now write out any block on a failed drive,
1411 BUG(); 1450 * or P or Q if they need it
1451 */
1412 1452
1413 PRINTK("Marking for sync stripe %llu blocks %d,%d\n", 1453 if (failed == 2) {
1414 (unsigned long long)sh->sector, failed_num[0], failed_num[1]); 1454 dev = &sh->dev[failed_num[1]];
1455 locked++;
1456 set_bit(R5_LOCKED, &dev->flags);
1457 set_bit(R5_Wantwrite, &dev->flags);
1458 }
1459 if (failed >= 1) {
1460 dev = &sh->dev[failed_num[0]];
1461 locked++;
1462 set_bit(R5_LOCKED, &dev->flags);
1463 set_bit(R5_Wantwrite, &dev->flags);
1464 }
1415 1465
1416 /**** FIX: Should we really do both of these unconditionally? ****/ 1466 if (update_p) {
1417 adev = &sh->dev[failed_num[0]]; 1467 dev = &sh->dev[pd_idx];
1418 locked += !test_bit(R5_LOCKED, &adev->flags); 1468 locked ++;
1419 set_bit(R5_LOCKED, &adev->flags); 1469 set_bit(R5_LOCKED, &dev->flags);
1420 set_bit(R5_Wantwrite, &adev->flags); 1470 set_bit(R5_Wantwrite, &dev->flags);
1421 bdev = &sh->dev[failed_num[1]]; 1471 }
1422 locked += !test_bit(R5_LOCKED, &bdev->flags); 1472 if (update_q) {
1423 set_bit(R5_LOCKED, &bdev->flags); 1473 dev = &sh->dev[qd_idx];
1474 locked++;
1475 set_bit(R5_LOCKED, &dev->flags);
1476 set_bit(R5_Wantwrite, &dev->flags);
1477 }
1424 clear_bit(STRIPE_DEGRADED, &sh->state); 1478 clear_bit(STRIPE_DEGRADED, &sh->state);
1425 set_bit(R5_Wantwrite, &bdev->flags);
1426 1479
1427 set_bit(STRIPE_INSYNC, &sh->state); 1480 set_bit(STRIPE_INSYNC, &sh->state);
1428 set_bit(R5_Syncio, &adev->flags);
1429 set_bit(R5_Syncio, &bdev->flags);
1430 } 1481 }
1431 } 1482 }
1483
1432 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 1484 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1433 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 1485 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1434 clear_bit(STRIPE_SYNCING, &sh->state); 1486 clear_bit(STRIPE_SYNCING, &sh->state);
1435 } 1487 }
1436 1488
1489 /* If the failed drives are just a ReadError, then we might need
1490 * to progress the repair/check process
1491 */
1492 if (failed <= 2 && ! conf->mddev->ro)
1493 for (i=0; i<failed;i++) {
1494 dev = &sh->dev[failed_num[i]];
1495 if (test_bit(R5_ReadError, &dev->flags)
1496 && !test_bit(R5_LOCKED, &dev->flags)
1497 && test_bit(R5_UPTODATE, &dev->flags)
1498 ) {
1499 if (!test_bit(R5_ReWrite, &dev->flags)) {
1500 set_bit(R5_Wantwrite, &dev->flags);
1501 set_bit(R5_ReWrite, &dev->flags);
1502 set_bit(R5_LOCKED, &dev->flags);
1503 } else {
1504 /* let's read it back */
1505 set_bit(R5_Wantread, &dev->flags);
1506 set_bit(R5_LOCKED, &dev->flags);
1507 }
1508 }
1509 }
1437 spin_unlock(&sh->lock); 1510 spin_unlock(&sh->lock);
1438 1511
1439 while ((bi=return_bi)) { 1512 while ((bi=return_bi)) {
@@ -1472,7 +1545,7 @@ static void handle_stripe(struct stripe_head *sh)
1472 rcu_read_unlock(); 1545 rcu_read_unlock();
1473 1546
1474 if (rdev) { 1547 if (rdev) {
1475 if (test_bit(R5_Syncio, &sh->dev[i].flags)) 1548 if (syncing)
1476 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1549 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1477 1550
1478 bi->bi_bdev = rdev->bdev; 1551 bi->bi_bdev = rdev->bdev;
@@ -1489,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh)
1489 bi->bi_io_vec[0].bv_offset = 0; 1562 bi->bi_io_vec[0].bv_offset = 0;
1490 bi->bi_size = STRIPE_SIZE; 1563 bi->bi_size = STRIPE_SIZE;
1491 bi->bi_next = NULL; 1564 bi->bi_next = NULL;
1565 if (rw == WRITE &&
1566 test_bit(R5_ReWrite, &sh->dev[i].flags))
1567 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1492 generic_make_request(bi); 1568 generic_make_request(bi);
1493 } else { 1569 } else {
1494 if (rw == 1) 1570 if (rw == 1)
@@ -1664,7 +1740,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
1664 } 1740 }
1665 finish_wait(&conf->wait_for_overlap, &w); 1741 finish_wait(&conf->wait_for_overlap, &w);
1666 raid6_plug_device(conf); 1742 raid6_plug_device(conf);
1667 handle_stripe(sh); 1743 handle_stripe(sh, NULL);
1668 release_stripe(sh); 1744 release_stripe(sh);
1669 } else { 1745 } else {
1670 /* cannot get stripe for read-ahead, just give-up */ 1746 /* cannot get stripe for read-ahead, just give-up */
@@ -1728,6 +1804,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1728 return rv; 1804 return rv;
1729 } 1805 }
1730 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1806 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1807 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1731 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 1808 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1732 /* we can skip this block, and probably more */ 1809 /* we can skip this block, and probably more */
1733 sync_blocks /= STRIPE_SECTORS; 1810 sync_blocks /= STRIPE_SECTORS;
@@ -1765,7 +1842,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1765 clear_bit(STRIPE_INSYNC, &sh->state); 1842 clear_bit(STRIPE_INSYNC, &sh->state);
1766 spin_unlock(&sh->lock); 1843 spin_unlock(&sh->lock);
1767 1844
1768 handle_stripe(sh); 1845 handle_stripe(sh, NULL);
1769 release_stripe(sh); 1846 release_stripe(sh);
1770 1847
1771 return STRIPE_SECTORS; 1848 return STRIPE_SECTORS;
@@ -1821,7 +1898,7 @@ static void raid6d (mddev_t *mddev)
1821 spin_unlock_irq(&conf->device_lock); 1898 spin_unlock_irq(&conf->device_lock);
1822 1899
1823 handled++; 1900 handled++;
1824 handle_stripe(sh); 1901 handle_stripe(sh, conf->spare_page);
1825 release_stripe(sh); 1902 release_stripe(sh);
1826 1903
1827 spin_lock_irq(&conf->device_lock); 1904 spin_lock_irq(&conf->device_lock);
@@ -1848,17 +1925,19 @@ static int run(mddev_t *mddev)
1848 return -EIO; 1925 return -EIO;
1849 } 1926 }
1850 1927
1851 mddev->private = kmalloc (sizeof (raid6_conf_t) 1928 mddev->private = kzalloc(sizeof (raid6_conf_t)
1852 + mddev->raid_disks * sizeof(struct disk_info), 1929 + mddev->raid_disks * sizeof(struct disk_info),
1853 GFP_KERNEL); 1930 GFP_KERNEL);
1854 if ((conf = mddev->private) == NULL) 1931 if ((conf = mddev->private) == NULL)
1855 goto abort; 1932 goto abort;
1856 memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
1857 conf->mddev = mddev; 1933 conf->mddev = mddev;
1858 1934
1859 if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) 1935 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
1936 goto abort;
1937
1938 conf->spare_page = alloc_page(GFP_KERNEL);
1939 if (!conf->spare_page)
1860 goto abort; 1940 goto abort;
1861 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1862 1941
1863 spin_lock_init(&conf->device_lock); 1942 spin_lock_init(&conf->device_lock);
1864 init_waitqueue_head(&conf->wait_for_stripe); 1943 init_waitqueue_head(&conf->wait_for_stripe);
@@ -1929,13 +2008,18 @@ static int run(mddev_t *mddev)
1929 goto abort; 2008 goto abort;
1930 } 2009 }
1931 2010
1932#if 0 /* FIX: For now */
1933 if (mddev->degraded > 0 && 2011 if (mddev->degraded > 0 &&
1934 mddev->recovery_cp != MaxSector) { 2012 mddev->recovery_cp != MaxSector) {
1935 printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); 2013 if (mddev->ok_start_degraded)
1936 goto abort; 2014 printk(KERN_WARNING "raid6: starting dirty degraded array:%s"
2015 "- data corruption possible.\n",
2016 mdname(mddev));
2017 else {
2018 printk(KERN_ERR "raid6: cannot start dirty degraded array"
2019 " for %s\n", mdname(mddev));
2020 goto abort;
2021 }
1937 } 2022 }
1938#endif
1939 2023
1940 { 2024 {
1941 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); 2025 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
@@ -1977,7 +2061,7 @@ static int run(mddev_t *mddev)
1977 */ 2061 */
1978 { 2062 {
1979 int stripe = (mddev->raid_disks-2) * mddev->chunk_size 2063 int stripe = (mddev->raid_disks-2) * mddev->chunk_size
1980 / PAGE_CACHE_SIZE; 2064 / PAGE_SIZE;
1981 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 2065 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
1982 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 2066 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
1983 } 2067 }
@@ -1985,18 +2069,14 @@ static int run(mddev_t *mddev)
1985 /* Ok, everything is just fine now */ 2069 /* Ok, everything is just fine now */
1986 mddev->array_size = mddev->size * (mddev->raid_disks - 2); 2070 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
1987 2071
1988 if (mddev->bitmap)
1989 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1990
1991 mddev->queue->unplug_fn = raid6_unplug_device; 2072 mddev->queue->unplug_fn = raid6_unplug_device;
1992 mddev->queue->issue_flush_fn = raid6_issue_flush; 2073 mddev->queue->issue_flush_fn = raid6_issue_flush;
1993 return 0; 2074 return 0;
1994abort: 2075abort:
1995 if (conf) { 2076 if (conf) {
1996 print_raid6_conf(conf); 2077 print_raid6_conf(conf);
1997 if (conf->stripe_hashtbl) 2078 safe_put_page(conf->spare_page);
1998 free_pages((unsigned long) conf->stripe_hashtbl, 2079 kfree(conf->stripe_hashtbl);
1999 HASH_PAGES_ORDER);
2000 kfree(conf); 2080 kfree(conf);
2001 } 2081 }
2002 mddev->private = NULL; 2082 mddev->private = NULL;
@@ -2013,7 +2093,7 @@ static int stop (mddev_t *mddev)
2013 md_unregister_thread(mddev->thread); 2093 md_unregister_thread(mddev->thread);
2014 mddev->thread = NULL; 2094 mddev->thread = NULL;
2015 shrink_stripes(conf); 2095 shrink_stripes(conf);
2016 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); 2096 kfree(conf->stripe_hashtbl);
2017 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2097 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2018 kfree(conf); 2098 kfree(conf);
2019 mddev->private = NULL; 2099 mddev->private = NULL;
@@ -2040,12 +2120,13 @@ static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2040static void printall (struct seq_file *seq, raid6_conf_t *conf) 2120static void printall (struct seq_file *seq, raid6_conf_t *conf)
2041{ 2121{
2042 struct stripe_head *sh; 2122 struct stripe_head *sh;
2123 struct hlist_node *hn;
2043 int i; 2124 int i;
2044 2125
2045 spin_lock_irq(&conf->device_lock); 2126 spin_lock_irq(&conf->device_lock);
2046 for (i = 0; i < NR_HASH; i++) { 2127 for (i = 0; i < NR_HASH; i++) {
2047 sh = conf->stripe_hashtbl[i]; 2128 sh = conf->stripe_hashtbl[i];
2048 for (; sh; sh = sh->hash_next) { 2129 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2049 if (sh->raid_conf != conf) 2130 if (sh->raid_conf != conf)
2050 continue; 2131 continue;
2051 print_sh(seq, sh); 2132 print_sh(seq, sh);
@@ -2223,17 +2304,12 @@ static void raid6_quiesce(mddev_t *mddev, int state)
2223 spin_unlock_irq(&conf->device_lock); 2304 spin_unlock_irq(&conf->device_lock);
2224 break; 2305 break;
2225 } 2306 }
2226 if (mddev->thread) {
2227 if (mddev->bitmap)
2228 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2229 else
2230 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2231 md_wakeup_thread(mddev->thread);
2232 }
2233} 2307}
2234static mdk_personality_t raid6_personality= 2308
2309static struct mdk_personality raid6_personality =
2235{ 2310{
2236 .name = "raid6", 2311 .name = "raid6",
2312 .level = 6,
2237 .owner = THIS_MODULE, 2313 .owner = THIS_MODULE,
2238 .make_request = make_request, 2314 .make_request = make_request,
2239 .run = run, 2315 .run = run,
@@ -2248,7 +2324,7 @@ static mdk_personality_t raid6_personality=
2248 .quiesce = raid6_quiesce, 2324 .quiesce = raid6_quiesce,
2249}; 2325};
2250 2326
2251static int __init raid6_init (void) 2327static int __init raid6_init(void)
2252{ 2328{
2253 int e; 2329 int e;
2254 2330
@@ -2256,15 +2332,17 @@ static int __init raid6_init (void)
2256 if ( e ) 2332 if ( e )
2257 return e; 2333 return e;
2258 2334
2259 return register_md_personality (RAID6, &raid6_personality); 2335 return register_md_personality(&raid6_personality);
2260} 2336}
2261 2337
2262static void raid6_exit (void) 2338static void raid6_exit (void)
2263{ 2339{
2264 unregister_md_personality (RAID6); 2340 unregister_md_personality(&raid6_personality);
2265} 2341}
2266 2342
2267module_init(raid6_init); 2343module_init(raid6_init);
2268module_exit(raid6_exit); 2344module_exit(raid6_exit);
2269MODULE_LICENSE("GPL"); 2345MODULE_LICENSE("GPL");
2270MODULE_ALIAS("md-personality-8"); /* RAID6 */ 2346MODULE_ALIAS("md-personality-8"); /* RAID6 */
2347MODULE_ALIAS("md-raid6");
2348MODULE_ALIAS("md-level-6");