author     Shaohua Li <shli@fb.com>    2016-12-13 15:40:15 -0500
committer  Shaohua Li <shli@fb.com>    2016-12-13 15:40:15 -0500
commit     20737738d397dfadbca1ea50dcc00d7259f500cf (patch)
tree       5765b1815331bac9ca32208963c850e60806d6de
parent     b78b499a67c3f77aeb6cd0b54724bc38b141255d (diff)
parent     2953079c692da067aeb6345659875b97378f9b0a (diff)
Merge branch 'md-next' into md-linus
-rw-r--r--  drivers/md/bitmap.c               166
-rw-r--r--  drivers/md/dm-raid.c                4
-rw-r--r--  drivers/md/linear.c                31
-rw-r--r--  drivers/md/md.c                   701
-rw-r--r--  drivers/md/md.h                   108
-rw-r--r--  drivers/md/multipath.c             92
-rw-r--r--  drivers/md/raid0.c                107
-rw-r--r--  drivers/md/raid1.c                247
-rw-r--r--  drivers/md/raid1.h                 19
-rw-r--r--  drivers/md/raid10.c               295
-rw-r--r--  drivers/md/raid10.h                 2
-rw-r--r--  drivers/md/raid5-cache.c         1833
-rw-r--r--  drivers/md/raid5.c                623
-rw-r--r--  drivers/md/raid5.h                172
-rw-r--r--  include/uapi/linux/raid/md_p.h      7
-rw-r--r--  lib/raid6/avx2.c                  232
16 files changed, 3403 insertions, 1236 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2d826927a3bf..9fb2ccac958a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -27,6 +27,7 @@
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/seq_file.h> 29#include <linux/seq_file.h>
30#include <trace/events/block.h>
30#include "md.h" 31#include "md.h"
31#include "bitmap.h" 32#include "bitmap.h"
32 33
@@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
208 209
209static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) 210static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
210{ 211{
211 struct md_rdev *rdev = NULL; 212 struct md_rdev *rdev;
212 struct block_device *bdev; 213 struct block_device *bdev;
213 struct mddev *mddev = bitmap->mddev; 214 struct mddev *mddev = bitmap->mddev;
214 struct bitmap_storage *store = &bitmap->storage; 215 struct bitmap_storage *store = &bitmap->storage;
215 216
217restart:
218 rdev = NULL;
216 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 219 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
217 int size = PAGE_SIZE; 220 int size = PAGE_SIZE;
218 loff_t offset = mddev->bitmap_info.offset; 221 loff_t offset = mddev->bitmap_info.offset;
@@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
268 page); 271 page);
269 } 272 }
270 273
271 if (wait) 274 if (wait && md_super_wait(mddev) < 0)
272 md_super_wait(mddev); 275 goto restart;
273 return 0; 276 return 0;
274 277
275 bad_alignment: 278 bad_alignment:
@@ -405,10 +408,10 @@ static int read_page(struct file *file, unsigned long index,
405 ret = -EIO; 408 ret = -EIO;
406out: 409out:
407 if (ret) 410 if (ret)
408 printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", 411 pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
409 (int)PAGE_SIZE, 412 (int)PAGE_SIZE,
410 (unsigned long long)index << PAGE_SHIFT, 413 (unsigned long long)index << PAGE_SHIFT,
411 ret); 414 ret);
412 return ret; 415 return ret;
413} 416}
414 417
@@ -416,6 +419,28 @@ out:
416 * bitmap file superblock operations 419 * bitmap file superblock operations
417 */ 420 */
418 421
422/*
423 * bitmap_wait_writes() should be called before writing any bitmap
424 * blocks, to ensure previous writes, particularly from
425 * bitmap_daemon_work(), have completed.
426 */
427static void bitmap_wait_writes(struct bitmap *bitmap)
428{
429 if (bitmap->storage.file)
430 wait_event(bitmap->write_wait,
431 atomic_read(&bitmap->pending_writes)==0);
432 else
433 /* Note that we ignore the return value. The writes
434 * might have failed, but that would just mean that
435 * some bits which should be cleared haven't been,
436 * which is safe. The relevant bitmap blocks will
437 * probably get written again, but there is no great
438 * loss if they aren't.
439 */
440 md_super_wait(bitmap->mddev);
441}
442
443
419/* update the event counter and sync the superblock to disk */ 444/* update the event counter and sync the superblock to disk */
420void bitmap_update_sb(struct bitmap *bitmap) 445void bitmap_update_sb(struct bitmap *bitmap)
421{ 446{
@@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap)
455 if (!bitmap || !bitmap->storage.sb_page) 480 if (!bitmap || !bitmap->storage.sb_page)
456 return; 481 return;
457 sb = kmap_atomic(bitmap->storage.sb_page); 482 sb = kmap_atomic(bitmap->storage.sb_page);
458 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 483 pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
459 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 484 pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
460 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 485 pr_debug(" version: %d\n", le32_to_cpu(sb->version));
461 printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", 486 pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
462 *(__u32 *)(sb->uuid+0), 487 *(__u32 *)(sb->uuid+0),
463 *(__u32 *)(sb->uuid+4), 488 *(__u32 *)(sb->uuid+4),
464 *(__u32 *)(sb->uuid+8), 489 *(__u32 *)(sb->uuid+8),
465 *(__u32 *)(sb->uuid+12)); 490 *(__u32 *)(sb->uuid+12));
466 printk(KERN_DEBUG " events: %llu\n", 491 pr_debug(" events: %llu\n",
467 (unsigned long long) le64_to_cpu(sb->events)); 492 (unsigned long long) le64_to_cpu(sb->events));
468 printk(KERN_DEBUG "events cleared: %llu\n", 493 pr_debug("events cleared: %llu\n",
469 (unsigned long long) le64_to_cpu(sb->events_cleared)); 494 (unsigned long long) le64_to_cpu(sb->events_cleared));
470 printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); 495 pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
471 printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); 496 pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
472 printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); 497 pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
473 printk(KERN_DEBUG " sync size: %llu KB\n", 498 pr_debug(" sync size: %llu KB\n",
474 (unsigned long long)le64_to_cpu(sb->sync_size)/2); 499 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
475 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); 500 pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
476 kunmap_atomic(sb); 501 kunmap_atomic(sb);
477} 502}
478 503
@@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
506 BUG_ON(!chunksize); 531 BUG_ON(!chunksize);
507 if (!is_power_of_2(chunksize)) { 532 if (!is_power_of_2(chunksize)) {
508 kunmap_atomic(sb); 533 kunmap_atomic(sb);
509 printk(KERN_ERR "bitmap chunksize not a power of 2\n"); 534 pr_warn("bitmap chunksize not a power of 2\n");
510 return -EINVAL; 535 return -EINVAL;
511 } 536 }
512 sb->chunksize = cpu_to_le32(chunksize); 537 sb->chunksize = cpu_to_le32(chunksize);
513 538
514 daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; 539 daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
515 if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { 540 if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
516 printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); 541 pr_debug("Choosing daemon_sleep default (5 sec)\n");
517 daemon_sleep = 5 * HZ; 542 daemon_sleep = 5 * HZ;
518 } 543 }
519 sb->daemon_sleep = cpu_to_le32(daemon_sleep); 544 sb->daemon_sleep = cpu_to_le32(daemon_sleep);
@@ -584,7 +609,7 @@ re_read:
584 /* to 4k blocks */ 609 /* to 4k blocks */
585 bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); 610 bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
586 offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); 611 offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
587 pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, 612 pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
588 bitmap->cluster_slot, offset); 613 bitmap->cluster_slot, offset);
589 } 614 }
590 615
@@ -634,7 +659,7 @@ re_read:
634 else if (write_behind > COUNTER_MAX) 659 else if (write_behind > COUNTER_MAX)
635 reason = "write-behind limit out of range (0 - 16383)"; 660 reason = "write-behind limit out of range (0 - 16383)";
636 if (reason) { 661 if (reason) {
637 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", 662 pr_warn("%s: invalid bitmap file superblock: %s\n",
638 bmname(bitmap), reason); 663 bmname(bitmap), reason);
639 goto out; 664 goto out;
640 } 665 }
@@ -648,18 +673,15 @@ re_read:
648 * bitmap's UUID and event counter to the mddev's 673 * bitmap's UUID and event counter to the mddev's
649 */ 674 */
650 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { 675 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
651 printk(KERN_INFO 676 pr_warn("%s: bitmap superblock UUID mismatch\n",
652 "%s: bitmap superblock UUID mismatch\n", 677 bmname(bitmap));
653 bmname(bitmap));
654 goto out; 678 goto out;
655 } 679 }
656 events = le64_to_cpu(sb->events); 680 events = le64_to_cpu(sb->events);
657 if (!nodes && (events < bitmap->mddev->events)) { 681 if (!nodes && (events < bitmap->mddev->events)) {
658 printk(KERN_INFO 682 pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
659 "%s: bitmap file is out of date (%llu < %llu) " 683 bmname(bitmap), events,
660 "-- forcing full recovery\n", 684 (unsigned long long) bitmap->mddev->events);
661 bmname(bitmap), events,
662 (unsigned long long) bitmap->mddev->events);
663 set_bit(BITMAP_STALE, &bitmap->flags); 685 set_bit(BITMAP_STALE, &bitmap->flags);
664 } 686 }
665 } 687 }
@@ -679,8 +701,8 @@ out:
679 if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { 701 if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
680 err = md_setup_cluster(bitmap->mddev, nodes); 702 err = md_setup_cluster(bitmap->mddev, nodes);
681 if (err) { 703 if (err) {
682 pr_err("%s: Could not setup cluster service (%d)\n", 704 pr_warn("%s: Could not setup cluster service (%d)\n",
683 bmname(bitmap), err); 705 bmname(bitmap), err);
684 goto out_no_sb; 706 goto out_no_sb;
685 } 707 }
686 bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); 708 bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
@@ -847,15 +869,13 @@ static void bitmap_file_kick(struct bitmap *bitmap)
847 ptr = file_path(bitmap->storage.file, 869 ptr = file_path(bitmap->storage.file,
848 path, PAGE_SIZE); 870 path, PAGE_SIZE);
849 871
850 printk(KERN_ALERT 872 pr_warn("%s: kicking failed bitmap file %s from array!\n",
851 "%s: kicking failed bitmap file %s from array!\n", 873 bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
852 bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
853 874
854 kfree(path); 875 kfree(path);
855 } else 876 } else
856 printk(KERN_ALERT 877 pr_warn("%s: disabling internal bitmap due to errors\n",
857 "%s: disabling internal bitmap due to errors\n", 878 bmname(bitmap));
858 bmname(bitmap));
859 } 879 }
860} 880}
861 881
@@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap)
983{ 1003{
984 unsigned long i; 1004 unsigned long i;
985 int dirty, need_write; 1005 int dirty, need_write;
1006 int writing = 0;
986 1007
987 if (!bitmap || !bitmap->storage.filemap || 1008 if (!bitmap || !bitmap->storage.filemap ||
988 test_bit(BITMAP_STALE, &bitmap->flags)) 1009 test_bit(BITMAP_STALE, &bitmap->flags))
@@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap)
997 need_write = test_and_clear_page_attr(bitmap, i, 1018 need_write = test_and_clear_page_attr(bitmap, i,
998 BITMAP_PAGE_NEEDWRITE); 1019 BITMAP_PAGE_NEEDWRITE);
999 if (dirty || need_write) { 1020 if (dirty || need_write) {
1021 if (!writing) {
1022 bitmap_wait_writes(bitmap);
1023 if (bitmap->mddev->queue)
1024 blk_add_trace_msg(bitmap->mddev->queue,
1025 "md bitmap_unplug");
1026 }
1000 clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); 1027 clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
1001 write_page(bitmap, bitmap->storage.filemap[i], 0); 1028 write_page(bitmap, bitmap->storage.filemap[i], 0);
1029 writing = 1;
1002 } 1030 }
1003 } 1031 }
1004 if (bitmap->storage.file) 1032 if (writing)
1005 wait_event(bitmap->write_wait, 1033 bitmap_wait_writes(bitmap);
1006 atomic_read(&bitmap->pending_writes)==0);
1007 else
1008 md_super_wait(bitmap->mddev);
1009 1034
1010 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 1035 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
1011 bitmap_file_kick(bitmap); 1036 bitmap_file_kick(bitmap);
@@ -1056,14 +1081,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1056 1081
1057 outofdate = test_bit(BITMAP_STALE, &bitmap->flags); 1082 outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
1058 if (outofdate) 1083 if (outofdate)
1059 printk(KERN_INFO "%s: bitmap file is out of date, doing full " 1084 pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap));
1060 "recovery\n", bmname(bitmap));
1061 1085
1062 if (file && i_size_read(file->f_mapping->host) < store->bytes) { 1086 if (file && i_size_read(file->f_mapping->host) < store->bytes) {
1063 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 1087 pr_warn("%s: bitmap file too short %lu < %lu\n",
1064 bmname(bitmap), 1088 bmname(bitmap),
1065 (unsigned long) i_size_read(file->f_mapping->host), 1089 (unsigned long) i_size_read(file->f_mapping->host),
1066 store->bytes); 1090 store->bytes);
1067 goto err; 1091 goto err;
1068 } 1092 }
1069 1093
@@ -1137,16 +1161,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1137 offset = 0; 1161 offset = 0;
1138 } 1162 }
1139 1163
1140 printk(KERN_INFO "%s: bitmap initialized from disk: " 1164 pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
1141 "read %lu pages, set %lu of %lu bits\n", 1165 bmname(bitmap), store->file_pages,
1142 bmname(bitmap), store->file_pages, 1166 bit_cnt, chunks);
1143 bit_cnt, chunks);
1144 1167
1145 return 0; 1168 return 0;
1146 1169
1147 err: 1170 err:
1148 printk(KERN_INFO "%s: bitmap initialisation failed: %d\n", 1171 pr_warn("%s: bitmap initialisation failed: %d\n",
1149 bmname(bitmap), ret); 1172 bmname(bitmap), ret);
1150 return ret; 1173 return ret;
1151} 1174}
1152 1175
@@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev)
1225 } 1248 }
1226 bitmap->allclean = 1; 1249 bitmap->allclean = 1;
1227 1250
1251 if (bitmap->mddev->queue)
1252 blk_add_trace_msg(bitmap->mddev->queue,
1253 "md bitmap_daemon_work");
1254
1228 /* Any file-page which is PENDING now needs to be written. 1255 /* Any file-page which is PENDING now needs to be written.
1229 * So set NEEDWRITE now, then after we make any last-minute changes 1256 * So set NEEDWRITE now, then after we make any last-minute changes
1230 * we will write it. 1257 * we will write it.
@@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1289 } 1316 }
1290 spin_unlock_irq(&counts->lock); 1317 spin_unlock_irq(&counts->lock);
1291 1318
1319 bitmap_wait_writes(bitmap);
1292 /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. 1320 /* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
1293 * DIRTY pages need to be written by bitmap_unplug so it can wait 1321 * DIRTY pages need to be written by bitmap_unplug so it can wait
1294 * for them. 1322 * for them.
@@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
1595 atomic_read(&bitmap->mddev->recovery_active) == 0); 1623 atomic_read(&bitmap->mddev->recovery_active) == 0);
1596 1624
1597 bitmap->mddev->curr_resync_completed = sector; 1625 bitmap->mddev->curr_resync_completed = sector;
1598 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1626 set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
1599 sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); 1627 sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
1600 s = 0; 1628 s = 0;
1601 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1629 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
@@ -1825,8 +1853,8 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
1825 if (err) 1853 if (err)
1826 goto error; 1854 goto error;
1827 1855
1828 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1856 pr_debug("created bitmap (%lu pages) for device %s\n",
1829 bitmap->counts.pages, bmname(bitmap)); 1857 bitmap->counts.pages, bmname(bitmap));
1830 1858
1831 err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; 1859 err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
1832 if (err) 1860 if (err)
@@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
2029 !bitmap->mddev->bitmap_info.external, 2057 !bitmap->mddev->bitmap_info.external,
2030 mddev_is_clustered(bitmap->mddev) 2058 mddev_is_clustered(bitmap->mddev)
2031 ? bitmap->cluster_slot : 0); 2059 ? bitmap->cluster_slot : 0);
2032 if (ret) 2060 if (ret) {
2061 bitmap_file_unmap(&store);
2033 goto err; 2062 goto err;
2063 }
2034 2064
2035 pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); 2065 pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
2036 2066
@@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
2089 bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + 2119 bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
2090 BITMAP_BLOCK_SHIFT); 2120 BITMAP_BLOCK_SHIFT);
2091 blocks = old_counts.chunks << old_counts.chunkshift; 2121 blocks = old_counts.chunks << old_counts.chunkshift;
2092 pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); 2122 pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
2093 break; 2123 break;
2094 } else 2124 } else
2095 bitmap->counts.bp[page].count += 1; 2125 bitmap->counts.bp[page].count += 1;
@@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
2266 /* Ensure new bitmap info is stored in 2296 /* Ensure new bitmap info is stored in
2267 * metadata promptly. 2297 * metadata promptly.
2268 */ 2298 */
2269 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2299 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2270 md_wakeup_thread(mddev->thread); 2300 md_wakeup_thread(mddev->thread);
2271 } 2301 }
2272 rv = 0; 2302 rv = 0;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6d53810963f7..953159d9a825 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
2011 sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); 2011 sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
2012 2012
2013 /* Force writing of superblocks to disk */ 2013 /* Force writing of superblocks to disk */
2014 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); 2014 set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);
2015 2015
2016 /* Any superblock is better than none, choose that if given */ 2016 /* Any superblock is better than none, choose that if given */
2017 return refdev ? 0 : 1; 2017 return refdev ? 0 : 1;
@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs)
3497 struct mddev *mddev = &rs->md; 3497 struct mddev *mddev = &rs->md;
3498 int ro = mddev->ro; 3498 int ro = mddev->ro;
3499 3499
3500 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3500 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3501 mddev->ro = 0; 3501 mddev->ro = 0;
3502 md_update_sb(mddev, 1); 3502 md_update_sb(mddev, 1);
3503 mddev->ro = ro; 3503 mddev->ro = ro;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 86f5d435901d..5975c9915684 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -21,6 +21,7 @@
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <trace/events/block.h>
24#include "md.h" 25#include "md.h"
25#include "linear.h" 26#include "linear.h"
26 27
@@ -101,8 +102,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
101 sector_t sectors; 102 sector_t sectors;
102 103
103 if (j < 0 || j >= raid_disks || disk->rdev) { 104 if (j < 0 || j >= raid_disks || disk->rdev) {
104 printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", 105 pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
105 mdname(mddev)); 106 mdname(mddev));
106 goto out; 107 goto out;
107 } 108 }
108 109
@@ -123,8 +124,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
123 discard_supported = true; 124 discard_supported = true;
124 } 125 }
125 if (cnt != raid_disks) { 126 if (cnt != raid_disks) {
126 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", 127 pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
127 mdname(mddev)); 128 mdname(mddev));
128 goto out; 129 goto out;
129 } 130 }
130 131
@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
227 } 228 }
228 229
229 do { 230 do {
230 tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); 231 sector_t bio_sector = bio->bi_iter.bi_sector;
232 tmp_dev = which_dev(mddev, bio_sector);
231 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; 233 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
232 end_sector = tmp_dev->end_sector; 234 end_sector = tmp_dev->end_sector;
233 data_offset = tmp_dev->rdev->data_offset; 235 data_offset = tmp_dev->rdev->data_offset;
234 bio->bi_bdev = tmp_dev->rdev->bdev; 236 bio->bi_bdev = tmp_dev->rdev->bdev;
235 237
236 if (unlikely(bio->bi_iter.bi_sector >= end_sector || 238 if (unlikely(bio_sector >= end_sector ||
237 bio->bi_iter.bi_sector < start_sector)) 239 bio_sector < start_sector))
238 goto out_of_bounds; 240 goto out_of_bounds;
239 241
240 if (unlikely(bio_end_sector(bio) > end_sector)) { 242 if (unlikely(bio_end_sector(bio) > end_sector)) {
241 /* This bio crosses a device boundary, so we have to 243 /* This bio crosses a device boundary, so we have to
242 * split it. 244 * split it.
243 */ 245 */
244 split = bio_split(bio, end_sector - 246 split = bio_split(bio, end_sector - bio_sector,
245 bio->bi_iter.bi_sector,
246 GFP_NOIO, fs_bio_set); 247 GFP_NOIO, fs_bio_set);
247 bio_chain(split, bio); 248 bio_chain(split, bio);
248 } else { 249 } else {
@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
256 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { 257 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
257 /* Just ignore it */ 258 /* Just ignore it */
258 bio_endio(split); 259 bio_endio(split);
259 } else 260 } else {
261 if (mddev->gendisk)
262 trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
263 split, disk_devt(mddev->gendisk),
264 bio_sector);
260 generic_make_request(split); 265 generic_make_request(split);
266 }
261 } while (split != bio); 267 } while (split != bio);
262 return; 268 return;
263 269
264out_of_bounds: 270out_of_bounds:
265 printk(KERN_ERR 271 pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
266 "md/linear:%s: make_request: Sector %llu out of bounds on "
267 "dev %s: %llu sectors, offset %llu\n",
268 mdname(mddev), 272 mdname(mddev),
269 (unsigned long long)bio->bi_iter.bi_sector, 273 (unsigned long long)bio->bi_iter.bi_sector,
270 bdevname(tmp_dev->rdev->bdev, b), 274 bdevname(tmp_dev->rdev->bdev, b),
@@ -275,7 +279,6 @@ out_of_bounds:
275 279
276static void linear_status (struct seq_file *seq, struct mddev *mddev) 280static void linear_status (struct seq_file *seq, struct mddev *mddev)
277{ 281{
278
279 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); 282 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
280} 283}
281 284
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f975cd08923d..82821ee0d57f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -30,6 +30,18 @@
30 You should have received a copy of the GNU General Public License 30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free 31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33
34 Errors, Warnings, etc.
35 Please use:
36 pr_crit() for error conditions that risk data loss
37 pr_err() for error conditions that are unexpected, like an IO error
38 or internal inconsistency
39 pr_warn() for error conditions that could have been predicated, like
40 adding a device to an array when it has incompatible metadata
41 pr_info() for every interesting, very rare events, like an array starting
42 or stopping, or resync starting or stopping
43 pr_debug() for everything else.
44
33*/ 45*/
34 46
35#include <linux/kthread.h> 47#include <linux/kthread.h>
@@ -52,6 +64,7 @@
52#include <linux/raid/md_p.h> 64#include <linux/raid/md_p.h>
53#include <linux/raid/md_u.h> 65#include <linux/raid/md_u.h>
54#include <linux/slab.h> 66#include <linux/slab.h>
67#include <trace/events/block.h>
55#include "md.h" 68#include "md.h"
56#include "bitmap.h" 69#include "bitmap.h"
57#include "md-cluster.h" 70#include "md-cluster.h"
@@ -684,11 +697,8 @@ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
684static int alloc_disk_sb(struct md_rdev *rdev) 697static int alloc_disk_sb(struct md_rdev *rdev)
685{ 698{
686 rdev->sb_page = alloc_page(GFP_KERNEL); 699 rdev->sb_page = alloc_page(GFP_KERNEL);
687 if (!rdev->sb_page) { 700 if (!rdev->sb_page)
688 printk(KERN_ALERT "md: out of memory.\n");
689 return -ENOMEM; 701 return -ENOMEM;
690 }
691
692 return 0; 702 return 0;
693} 703}
694 704
@@ -715,9 +725,15 @@ static void super_written(struct bio *bio)
715 struct mddev *mddev = rdev->mddev; 725 struct mddev *mddev = rdev->mddev;
716 726
717 if (bio->bi_error) { 727 if (bio->bi_error) {
718 printk("md: super_written gets error=%d\n", bio->bi_error); 728 pr_err("md: super_written gets error=%d\n", bio->bi_error);
719 md_error(mddev, rdev); 729 md_error(mddev, rdev);
720 } 730 if (!test_bit(Faulty, &rdev->flags)
731 && (bio->bi_opf & MD_FAILFAST)) {
732 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
733 set_bit(LastDev, &rdev->flags);
734 }
735 } else
736 clear_bit(LastDev, &rdev->flags);
721 737
722 if (atomic_dec_and_test(&mddev->pending_writes)) 738 if (atomic_dec_and_test(&mddev->pending_writes))
723 wake_up(&mddev->sb_wait); 739 wake_up(&mddev->sb_wait);
@@ -734,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
734 * if zero is reached. 750 * if zero is reached.
735 * If an error occurred, call md_error 751 * If an error occurred, call md_error
736 */ 752 */
737 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 753 struct bio *bio;
754 int ff = 0;
755
756 if (test_bit(Faulty, &rdev->flags))
757 return;
758
759 bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
738 760
739 atomic_inc(&rdev->nr_pending); 761 atomic_inc(&rdev->nr_pending);
740 762
@@ -743,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
743 bio_add_page(bio, page, size, 0); 765 bio_add_page(bio, page, size, 0);
744 bio->bi_private = rdev; 766 bio->bi_private = rdev;
745 bio->bi_end_io = super_written; 767 bio->bi_end_io = super_written;
746 bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; 768
769 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
770 test_bit(FailFast, &rdev->flags) &&
771 !test_bit(LastDev, &rdev->flags))
772 ff = MD_FAILFAST;
773 bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff;
747 774
748 atomic_inc(&mddev->pending_writes); 775 atomic_inc(&mddev->pending_writes);
749 submit_bio(bio); 776 submit_bio(bio);
750} 777}
751 778
752void md_super_wait(struct mddev *mddev) 779int md_super_wait(struct mddev *mddev)
753{ 780{
754 /* wait for all superblock writes that were scheduled to complete */ 781 /* wait for all superblock writes that were scheduled to complete */
755 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 782 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
783 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
784 return -EAGAIN;
785 return 0;
756} 786}
757 787
758int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 788int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
@@ -795,8 +825,8 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
795 return 0; 825 return 0;
796 826
797fail: 827fail:
798 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 828 pr_err("md: disabled device %s, could not read superblock.\n",
799 bdevname(rdev->bdev,b)); 829 bdevname(rdev->bdev,b));
800 return -EINVAL; 830 return -EINVAL;
801} 831}
802 832
@@ -818,7 +848,6 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
818 848
819 if (!tmp1 || !tmp2) { 849 if (!tmp1 || !tmp2) {
820 ret = 0; 850 ret = 0;
821 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
822 goto abort; 851 goto abort;
823 } 852 }
824 853
@@ -932,7 +961,7 @@ int md_check_no_bitmap(struct mddev *mddev)
932{ 961{
933 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 962 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
934 return 0; 963 return 0;
935 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 964 pr_warn("%s: bitmaps are not supported for %s\n",
936 mdname(mddev), mddev->pers->name); 965 mdname(mddev), mddev->pers->name);
937 return 1; 966 return 1;
938} 967}
@@ -956,7 +985,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
956 rdev->sb_start = calc_dev_sboffset(rdev); 985 rdev->sb_start = calc_dev_sboffset(rdev);
957 986
958 ret = read_disk_sb(rdev, MD_SB_BYTES); 987 ret = read_disk_sb(rdev, MD_SB_BYTES);
959 if (ret) return ret; 988 if (ret)
989 return ret;
960 990
961 ret = -EINVAL; 991 ret = -EINVAL;
962 992
@@ -964,17 +994,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
964 sb = page_address(rdev->sb_page); 994 sb = page_address(rdev->sb_page);
965 995
966 if (sb->md_magic != MD_SB_MAGIC) { 996 if (sb->md_magic != MD_SB_MAGIC) {
967 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 997 pr_warn("md: invalid raid superblock magic on %s\n", b);
968 b);
969 goto abort; 998 goto abort;
970 } 999 }
971 1000
972 if (sb->major_version != 0 || 1001 if (sb->major_version != 0 ||
973 sb->minor_version < 90 || 1002 sb->minor_version < 90 ||
974 sb->minor_version > 91) { 1003 sb->minor_version > 91) {
975 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 1004 pr_warn("Bad version number %d.%d on %s\n",
976 sb->major_version, sb->minor_version, 1005 sb->major_version, sb->minor_version, b);
977 b);
978 goto abort; 1006 goto abort;
979 } 1007 }
980 1008
@@ -982,8 +1010,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
982 goto abort; 1010 goto abort;
983 1011
984 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1012 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
985 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 1013 pr_warn("md: invalid superblock checksum on %s\n", b);
986 b);
987 goto abort; 1014 goto abort;
988 } 1015 }
989 1016
@@ -1004,14 +1031,13 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1004 __u64 ev1, ev2; 1031 __u64 ev1, ev2;
1005 mdp_super_t *refsb = page_address(refdev->sb_page); 1032 mdp_super_t *refsb = page_address(refdev->sb_page);
1006 if (!uuid_equal(refsb, sb)) { 1033 if (!uuid_equal(refsb, sb)) {
1007 printk(KERN_WARNING "md: %s has different UUID to %s\n", 1034 pr_warn("md: %s has different UUID to %s\n",
1008 b, bdevname(refdev->bdev,b2)); 1035 b, bdevname(refdev->bdev,b2));
1009 goto abort; 1036 goto abort;
1010 } 1037 }
1011 if (!sb_equal(refsb, sb)) { 1038 if (!sb_equal(refsb, sb)) {
1012 printk(KERN_WARNING "md: %s has same UUID" 1039 pr_warn("md: %s has same UUID but different superblock to %s\n",
1013 " but different superblock to %s\n", 1040 b, bdevname(refdev->bdev, b2));
1014 b, bdevname(refdev->bdev, b2));
1015 goto abort; 1041 goto abort;
1016 } 1042 }
1017 ev1 = md_event(sb); 1043 ev1 = md_event(sb);
@@ -1158,6 +1184,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1158 } 1184 }
1159 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1185 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1160 set_bit(WriteMostly, &rdev->flags); 1186 set_bit(WriteMostly, &rdev->flags);
1187 if (desc->state & (1<<MD_DISK_FAILFAST))
1188 set_bit(FailFast, &rdev->flags);
1161 } else /* MULTIPATH are always insync */ 1189 } else /* MULTIPATH are always insync */
1162 set_bit(In_sync, &rdev->flags); 1190 set_bit(In_sync, &rdev->flags);
1163 return 0; 1191 return 0;
@@ -1283,6 +1311,8 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1283 } 1311 }
1284 if (test_bit(WriteMostly, &rdev2->flags)) 1312 if (test_bit(WriteMostly, &rdev2->flags))
1285 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1313 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1314 if (test_bit(FailFast, &rdev2->flags))
1315 d->state |= (1<<MD_DISK_FAILFAST);
1286 } 1316 }
1287 /* now set the "removed" and "faulty" bits on any missing devices */ 1317 /* now set the "removed" and "faulty" bits on any missing devices */
1288 for (i=0 ; i < mddev->raid_disks ; i++) { 1318 for (i=0 ; i < mddev->raid_disks ; i++) {
@@ -1324,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1324 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && 1354 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
1325 rdev->mddev->level >= 1) 1355 rdev->mddev->level >= 1)
1326 num_sectors = (sector_t)(2ULL << 32) - 2; 1356 num_sectors = (sector_t)(2ULL << 32) - 2;
1327 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1357 do {
1358 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1328 rdev->sb_page); 1359 rdev->sb_page);
1329 md_super_wait(rdev->mddev); 1360 } while (md_super_wait(rdev->mddev) < 0);
1330 return num_sectors; 1361 return num_sectors;
1331} 1362}
1332 1363
@@ -1413,13 +1444,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1413 return -EINVAL; 1444 return -EINVAL;
1414 1445
1415 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1446 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1416 printk("md: invalid superblock checksum on %s\n", 1447 pr_warn("md: invalid superblock checksum on %s\n",
1417 bdevname(rdev->bdev,b)); 1448 bdevname(rdev->bdev,b));
1418 return -EINVAL; 1449 return -EINVAL;
1419 } 1450 }
1420 if (le64_to_cpu(sb->data_size) < 10) { 1451 if (le64_to_cpu(sb->data_size) < 10) {
1421 printk("md: data_size too small on %s\n", 1452 pr_warn("md: data_size too small on %s\n",
1422 bdevname(rdev->bdev,b)); 1453 bdevname(rdev->bdev,b));
1423 return -EINVAL; 1454 return -EINVAL;
1424 } 1455 }
1425 if (sb->pad0 || 1456 if (sb->pad0 ||
@@ -1503,8 +1534,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1503 sb->level != refsb->level || 1534 sb->level != refsb->level ||
1504 sb->layout != refsb->layout || 1535 sb->layout != refsb->layout ||
1505 sb->chunksize != refsb->chunksize) { 1536 sb->chunksize != refsb->chunksize) {
1506 printk(KERN_WARNING "md: %s has strangely different" 1537 pr_warn("md: %s has strangely different superblock to %s\n",
1507 " superblock to %s\n",
1508 bdevname(rdev->bdev,b), 1538 bdevname(rdev->bdev,b),
1509 bdevname(refdev->bdev,b2)); 1539 bdevname(refdev->bdev,b2));
1510 return -EINVAL; 1540 return -EINVAL;
@@ -1646,8 +1676,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1646 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1676 case MD_DISK_ROLE_JOURNAL: /* journal device */
1647 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1677 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1648 /* journal device without journal feature */ 1678 /* journal device without journal feature */
1649 printk(KERN_WARNING 1679 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1650 "md: journal device provided without journal feature, ignoring the device\n");
1651 return -EINVAL; 1680 return -EINVAL;
1652 } 1681 }
1653 set_bit(Journal, &rdev->flags); 1682 set_bit(Journal, &rdev->flags);
@@ -1669,6 +1698,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1669 } 1698 }
1670 if (sb->devflags & WriteMostly1) 1699 if (sb->devflags & WriteMostly1)
1671 set_bit(WriteMostly, &rdev->flags); 1700 set_bit(WriteMostly, &rdev->flags);
1701 if (sb->devflags & FailFast1)
1702 set_bit(FailFast, &rdev->flags);
1672 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1703 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1673 set_bit(Replacement, &rdev->flags); 1704 set_bit(Replacement, &rdev->flags);
1674 } else /* MULTIPATH are always insync */ 1705 } else /* MULTIPATH are always insync */
@@ -1707,6 +1738,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1707 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1738 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1708 sb->level = cpu_to_le32(mddev->level); 1739 sb->level = cpu_to_le32(mddev->level);
1709 sb->layout = cpu_to_le32(mddev->layout); 1740 sb->layout = cpu_to_le32(mddev->layout);
1741 if (test_bit(FailFast, &rdev->flags))
1742 sb->devflags |= FailFast1;
1743 else
1744 sb->devflags &= ~FailFast1;
1710 1745
1711 if (test_bit(WriteMostly, &rdev->flags)) 1746 if (test_bit(WriteMostly, &rdev->flags))
1712 sb->devflags |= WriteMostly1; 1747 sb->devflags |= WriteMostly1;
@@ -1863,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1863 sb->data_size = cpu_to_le64(num_sectors); 1898 sb->data_size = cpu_to_le64(num_sectors);
1864 sb->super_offset = rdev->sb_start; 1899 sb->super_offset = rdev->sb_start;
1865 sb->sb_csum = calc_sb_1_csum(sb); 1900 sb->sb_csum = calc_sb_1_csum(sb);
1866 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1901 do {
1867 rdev->sb_page); 1902 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1868 md_super_wait(rdev->mddev); 1903 rdev->sb_page);
1904 } while (md_super_wait(rdev->mddev) < 0);
1869 return num_sectors; 1905 return num_sectors;
1870 1906
1871} 1907}
@@ -2004,9 +2040,9 @@ int md_integrity_register(struct mddev *mddev)
2004 blk_integrity_register(mddev->gendisk, 2040 blk_integrity_register(mddev->gendisk,
2005 bdev_get_integrity(reference->bdev)); 2041 bdev_get_integrity(reference->bdev));
2006 2042
2007 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); 2043 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2008 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2044 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2009 printk(KERN_ERR "md: failed to create integrity pool for %s\n", 2045 pr_err("md: failed to create integrity pool for %s\n",
2010 mdname(mddev)); 2046 mdname(mddev));
2011 return -EINVAL; 2047 return -EINVAL;
2012 } 2048 }
@@ -2034,8 +2070,8 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2034 return 0; 2070 return 0;
2035 2071
2036 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2072 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2037 printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", 2073 pr_err("%s: incompatible integrity profile for %s\n",
2038 mdname(mddev), bdevname(rdev->bdev, name)); 2074 mdname(mddev), bdevname(rdev->bdev, name));
2039 return -ENXIO; 2075 return -ENXIO;
2040 } 2076 }
2041 2077
@@ -2089,15 +2125,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2089 rcu_read_unlock(); 2125 rcu_read_unlock();
2090 if (!test_bit(Journal, &rdev->flags) && 2126 if (!test_bit(Journal, &rdev->flags) &&
2091 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2127 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2092 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 2128 pr_warn("md: %s: array is limited to %d devices\n",
2093 mdname(mddev), mddev->max_disks); 2129 mdname(mddev), mddev->max_disks);
2094 return -EBUSY; 2130 return -EBUSY;
2095 } 2131 }
2096 bdevname(rdev->bdev,b); 2132 bdevname(rdev->bdev,b);
2097 strreplace(b, '/', '!'); 2133 strreplace(b, '/', '!');
2098 2134
2099 rdev->mddev = mddev; 2135 rdev->mddev = mddev;
2100 printk(KERN_INFO "md: bind<%s>\n", b); 2136 pr_debug("md: bind<%s>\n", b);
2101 2137
2102 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2138 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2103 goto fail; 2139 goto fail;
@@ -2116,8 +2152,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2116 return 0; 2152 return 0;
2117 2153
2118 fail: 2154 fail:
2119 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 2155 pr_warn("md: failed to register dev-%s for %s\n",
2120 b, mdname(mddev)); 2156 b, mdname(mddev));
2121 return err; 2157 return err;
2122} 2158}
2123 2159
@@ -2134,7 +2170,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
2134 2170
2135 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2171 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2136 list_del_rcu(&rdev->same_set); 2172 list_del_rcu(&rdev->same_set);
2137 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2173 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2138 rdev->mddev = NULL; 2174 rdev->mddev = NULL;
2139 sysfs_remove_link(&rdev->kobj, "block"); 2175 sysfs_remove_link(&rdev->kobj, "block");
2140 sysfs_put(rdev->sysfs_state); 2176 sysfs_put(rdev->sysfs_state);
@@ -2164,8 +2200,7 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2164 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2200 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2165 shared ? (struct md_rdev *)lock_rdev : rdev); 2201 shared ? (struct md_rdev *)lock_rdev : rdev);
2166 if (IS_ERR(bdev)) { 2202 if (IS_ERR(bdev)) {
2167 printk(KERN_ERR "md: could not open %s.\n", 2203 pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2168 __bdevname(dev, b));
2169 return PTR_ERR(bdev); 2204 return PTR_ERR(bdev);
2170 } 2205 }
2171 rdev->bdev = bdev; 2206 rdev->bdev = bdev;
@@ -2185,8 +2220,7 @@ static void export_rdev(struct md_rdev *rdev)
2185{ 2220{
2186 char b[BDEVNAME_SIZE]; 2221 char b[BDEVNAME_SIZE];
2187 2222
2188 printk(KERN_INFO "md: export_rdev(%s)\n", 2223 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2189 bdevname(rdev->bdev,b));
2190 md_rdev_clear(rdev); 2224 md_rdev_clear(rdev);
2191#ifndef MODULE 2225#ifndef MODULE
2192 if (test_bit(AutoDetected, &rdev->flags)) 2226 if (test_bit(AutoDetected, &rdev->flags))
@@ -2288,24 +2322,24 @@ void md_update_sb(struct mddev *mddev, int force_change)
2288 2322
2289 if (mddev->ro) { 2323 if (mddev->ro) {
2290 if (force_change) 2324 if (force_change)
2291 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2325 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2292 return; 2326 return;
2293 } 2327 }
2294 2328
2295repeat: 2329repeat:
2296 if (mddev_is_clustered(mddev)) { 2330 if (mddev_is_clustered(mddev)) {
2297 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2331 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2298 force_change = 1; 2332 force_change = 1;
2299 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2333 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2300 nospares = 1; 2334 nospares = 1;
2301 ret = md_cluster_ops->metadata_update_start(mddev); 2335 ret = md_cluster_ops->metadata_update_start(mddev);
2302 /* Has someone else has updated the sb */ 2336 /* Has someone else has updated the sb */
2303 if (!does_sb_need_changing(mddev)) { 2337 if (!does_sb_need_changing(mddev)) {
2304 if (ret == 0) 2338 if (ret == 0)
2305 md_cluster_ops->metadata_update_cancel(mddev); 2339 md_cluster_ops->metadata_update_cancel(mddev);
2306 bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2340 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2307 BIT(MD_CHANGE_DEVS) | 2341 BIT(MD_SB_CHANGE_DEVS) |
2308 BIT(MD_CHANGE_CLEAN)); 2342 BIT(MD_SB_CHANGE_CLEAN));
2309 return; 2343 return;
2310 } 2344 }
2311 } 2345 }
@@ -2321,10 +2355,10 @@ repeat:
2321 2355
2322 } 2356 }
2323 if (!mddev->persistent) { 2357 if (!mddev->persistent) {
2324 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2358 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2325 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2359 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2326 if (!mddev->external) { 2360 if (!mddev->external) {
2327 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2361 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2328 rdev_for_each(rdev, mddev) { 2362 rdev_for_each(rdev, mddev) {
2329 if (rdev->badblocks.changed) { 2363 if (rdev->badblocks.changed) {
2330 rdev->badblocks.changed = 0; 2364 rdev->badblocks.changed = 0;
@@ -2344,9 +2378,9 @@ repeat:
2344 2378
2345 mddev->utime = ktime_get_real_seconds(); 2379 mddev->utime = ktime_get_real_seconds();
2346 2380
2347 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2381 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2348 force_change = 1; 2382 force_change = 1;
2349 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2383 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2350 /* just a clean<-> dirty transition, possibly leave spares alone, 2384 /* just a clean<-> dirty transition, possibly leave spares alone,
2351 * though if events isn't the right even/odd, we will have to do 2385 * though if events isn't the right even/odd, we will have to do
2352 * spares after all 2386 * spares after all
@@ -2402,6 +2436,9 @@ repeat:
2402 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2436 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2403 mdname(mddev), mddev->in_sync); 2437 mdname(mddev), mddev->in_sync);
2404 2438
2439 if (mddev->queue)
2440 blk_add_trace_msg(mddev->queue, "md md_update_sb");
2441rewrite:
2405 bitmap_update_sb(mddev->bitmap); 2442 bitmap_update_sb(mddev->bitmap);
2406 rdev_for_each(rdev, mddev) { 2443 rdev_for_each(rdev, mddev) {
2407 char b[BDEVNAME_SIZE]; 2444 char b[BDEVNAME_SIZE];
@@ -2433,15 +2470,16 @@ repeat:
2433 /* only need to write one superblock... */ 2470 /* only need to write one superblock... */
2434 break; 2471 break;
2435 } 2472 }
2436 md_super_wait(mddev); 2473 if (md_super_wait(mddev) < 0)
2437 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2474 goto rewrite;
2475 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2438 2476
2439 if (mddev_is_clustered(mddev) && ret == 0) 2477 if (mddev_is_clustered(mddev) && ret == 0)
2440 md_cluster_ops->metadata_update_finish(mddev); 2478 md_cluster_ops->metadata_update_finish(mddev);
2441 2479
2442 if (mddev->in_sync != sync_req || 2480 if (mddev->in_sync != sync_req ||
2443 !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2481 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2444 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) 2482 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2445 /* have to write it out again */ 2483 /* have to write it out again */
2446 goto repeat; 2484 goto repeat;
2447 wake_up(&mddev->sb_wait); 2485 wake_up(&mddev->sb_wait);
@@ -2485,7 +2523,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
2485 } 2523 }
2486 sysfs_notify_dirent_safe(rdev->sysfs_state); 2524 sysfs_notify_dirent_safe(rdev->sysfs_state);
2487 2525
2488 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2526 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2489 if (mddev->degraded) 2527 if (mddev->degraded)
2490 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2528 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2491 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2529 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -2523,51 +2561,41 @@ struct rdev_sysfs_entry {
2523static ssize_t 2561static ssize_t
2524state_show(struct md_rdev *rdev, char *page) 2562state_show(struct md_rdev *rdev, char *page)
2525{ 2563{
2526 char *sep = ""; 2564 char *sep = ",";
2527 size_t len = 0; 2565 size_t len = 0;
2528 unsigned long flags = ACCESS_ONCE(rdev->flags); 2566 unsigned long flags = ACCESS_ONCE(rdev->flags);
2529 2567
2530 if (test_bit(Faulty, &flags) || 2568 if (test_bit(Faulty, &flags) ||
2531 rdev->badblocks.unacked_exist) { 2569 (!test_bit(ExternalBbl, &flags) &&
2532 len+= sprintf(page+len, "%sfaulty",sep); 2570 rdev->badblocks.unacked_exist))
2533 sep = ","; 2571 len += sprintf(page+len, "faulty%s", sep);
2534 } 2572 if (test_bit(In_sync, &flags))
2535 if (test_bit(In_sync, &flags)) { 2573 len += sprintf(page+len, "in_sync%s", sep);
2536 len += sprintf(page+len, "%sin_sync",sep); 2574 if (test_bit(Journal, &flags))
2537 sep = ","; 2575 len += sprintf(page+len, "journal%s", sep);
2538 } 2576 if (test_bit(WriteMostly, &flags))
2539 if (test_bit(Journal, &flags)) { 2577 len += sprintf(page+len, "write_mostly%s", sep);
2540 len += sprintf(page+len, "%sjournal",sep);
2541 sep = ",";
2542 }
2543 if (test_bit(WriteMostly, &flags)) {
2544 len += sprintf(page+len, "%swrite_mostly",sep);
2545 sep = ",";
2546 }
2547 if (test_bit(Blocked, &flags) || 2578 if (test_bit(Blocked, &flags) ||
2548 (rdev->badblocks.unacked_exist 2579 (rdev->badblocks.unacked_exist
2549 && !test_bit(Faulty, &flags))) { 2580 && !test_bit(Faulty, &flags)))
2550 len += sprintf(page+len, "%sblocked", sep); 2581 len += sprintf(page+len, "blocked%s", sep);
2551 sep = ",";
2552 }
2553 if (!test_bit(Faulty, &flags) && 2582 if (!test_bit(Faulty, &flags) &&
2554 !test_bit(Journal, &flags) && 2583 !test_bit(Journal, &flags) &&
2555 !test_bit(In_sync, &flags)) { 2584 !test_bit(In_sync, &flags))
2556 len += sprintf(page+len, "%sspare", sep); 2585 len += sprintf(page+len, "spare%s", sep);
2557 sep = ","; 2586 if (test_bit(WriteErrorSeen, &flags))
2558 } 2587 len += sprintf(page+len, "write_error%s", sep);
2559 if (test_bit(WriteErrorSeen, &flags)) { 2588 if (test_bit(WantReplacement, &flags))
2560 len += sprintf(page+len, "%swrite_error", sep); 2589 len += sprintf(page+len, "want_replacement%s", sep);
2561 sep = ","; 2590 if (test_bit(Replacement, &flags))
2562 } 2591 len += sprintf(page+len, "replacement%s", sep);
2563 if (test_bit(WantReplacement, &flags)) { 2592 if (test_bit(ExternalBbl, &flags))
2564 len += sprintf(page+len, "%swant_replacement", sep); 2593 len += sprintf(page+len, "external_bbl%s", sep);
2565 sep = ","; 2594 if (test_bit(FailFast, &flags))
2566 } 2595 len += sprintf(page+len, "failfast%s", sep);
2567 if (test_bit(Replacement, &flags)) { 2596
2568 len += sprintf(page+len, "%sreplacement", sep); 2597 if (len)
2569 sep = ","; 2598 len -= strlen(sep);
2570 }
2571 2599
2572 return len+sprintf(page+len, "\n"); 2600 return len+sprintf(page+len, "\n");
2573} 2601}
@@ -2587,6 +2615,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2587 * so that it gets rebuilt based on bitmap 2615 * so that it gets rebuilt based on bitmap
2588 * write_error - sets WriteErrorSeen 2616 * write_error - sets WriteErrorSeen
2589 * -write_error - clears WriteErrorSeen 2617 * -write_error - clears WriteErrorSeen
2618 * {,-}failfast - set/clear FailFast
2590 */ 2619 */
2591 int err = -EINVAL; 2620 int err = -EINVAL;
2592 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2621 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2610,8 +2639,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2610 2639
2611 if (err == 0) { 2640 if (err == 0) {
2612 md_kick_rdev_from_array(rdev); 2641 md_kick_rdev_from_array(rdev);
2613 if (mddev->pers) 2642 if (mddev->pers) {
2614 md_update_sb(mddev, 1); 2643 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2644 md_wakeup_thread(mddev->thread);
2645 }
2615 md_new_event(mddev); 2646 md_new_event(mddev);
2616 } 2647 }
2617 } 2648 }
@@ -2626,6 +2657,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2626 err = 0; 2657 err = 0;
2627 } else if (cmd_match(buf, "-blocked")) { 2658 } else if (cmd_match(buf, "-blocked")) {
2628 if (!test_bit(Faulty, &rdev->flags) && 2659 if (!test_bit(Faulty, &rdev->flags) &&
2660 !test_bit(ExternalBbl, &rdev->flags) &&
2629 rdev->badblocks.unacked_exist) { 2661 rdev->badblocks.unacked_exist) {
2630 /* metadata handler doesn't understand badblocks, 2662 /* metadata handler doesn't understand badblocks,
2631 * so we need to fail the device 2663 * so we need to fail the device
@@ -2642,6 +2674,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2642 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2674 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2643 set_bit(In_sync, &rdev->flags); 2675 set_bit(In_sync, &rdev->flags);
2644 err = 0; 2676 err = 0;
2677 } else if (cmd_match(buf, "failfast")) {
2678 set_bit(FailFast, &rdev->flags);
2679 err = 0;
2680 } else if (cmd_match(buf, "-failfast")) {
2681 clear_bit(FailFast, &rdev->flags);
2682 err = 0;
2645 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2683 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2646 !test_bit(Journal, &rdev->flags)) { 2684 !test_bit(Journal, &rdev->flags)) {
2647 if (rdev->mddev->pers == NULL) { 2685 if (rdev->mddev->pers == NULL) {
@@ -2708,6 +2746,13 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2708 } 2746 }
2709 } else 2747 } else
2710 err = -EBUSY; 2748 err = -EBUSY;
2749 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2750 set_bit(ExternalBbl, &rdev->flags);
2751 rdev->badblocks.shift = 0;
2752 err = 0;
2753 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2754 clear_bit(ExternalBbl, &rdev->flags);
2755 err = 0;
2711 } 2756 }
2712 if (!err) 2757 if (!err)
2713 sysfs_notify_dirent_safe(rdev->sysfs_state); 2758 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -3211,10 +3256,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
3211 sector_t size; 3256 sector_t size;
3212 3257
3213 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3258 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3214 if (!rdev) { 3259 if (!rdev)
3215 printk(KERN_ERR "md: could not alloc mem for new device!\n");
3216 return ERR_PTR(-ENOMEM); 3260 return ERR_PTR(-ENOMEM);
3217 }
3218 3261
3219 err = md_rdev_init(rdev); 3262 err = md_rdev_init(rdev);
3220 if (err) 3263 if (err)
@@ -3231,8 +3274,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
3231 3274
3232 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3275 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3233 if (!size) { 3276 if (!size) {
3234 printk(KERN_WARNING 3277 pr_warn("md: %s has zero or unknown size, marking faulty!\n",
3235 "md: %s has zero or unknown size, marking faulty!\n",
3236 bdevname(rdev->bdev,b)); 3278 bdevname(rdev->bdev,b));
3237 err = -EINVAL; 3279 err = -EINVAL;
3238 goto abort_free; 3280 goto abort_free;
@@ -3242,16 +3284,13 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
3242 err = super_types[super_format]. 3284 err = super_types[super_format].
3243 load_super(rdev, NULL, super_minor); 3285 load_super(rdev, NULL, super_minor);
3244 if (err == -EINVAL) { 3286 if (err == -EINVAL) {
3245 printk(KERN_WARNING 3287 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3246 "md: %s does not have a valid v%d.%d "
3247 "superblock, not importing!\n",
3248 bdevname(rdev->bdev,b), 3288 bdevname(rdev->bdev,b),
3249 super_format, super_minor); 3289 super_format, super_minor);
3250 goto abort_free; 3290 goto abort_free;
3251 } 3291 }
3252 if (err < 0) { 3292 if (err < 0) {
3253 printk(KERN_WARNING 3293 pr_warn("md: could not read %s's sb, not importing!\n",
3254 "md: could not read %s's sb, not importing!\n",
3255 bdevname(rdev->bdev,b)); 3294 bdevname(rdev->bdev,b));
3256 goto abort_free; 3295 goto abort_free;
3257 } 3296 }
@@ -3287,9 +3326,7 @@ static void analyze_sbs(struct mddev *mddev)
3287 case 0: 3326 case 0:
3288 break; 3327 break;
3289 default: 3328 default:
3290 printk( KERN_ERR \ 3329 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
3291 "md: fatal superblock inconsistency in %s"
3292 " -- removing from array\n",
3293 bdevname(rdev->bdev,b)); 3330 bdevname(rdev->bdev,b));
3294 md_kick_rdev_from_array(rdev); 3331 md_kick_rdev_from_array(rdev);
3295 } 3332 }
@@ -3302,18 +3339,16 @@ static void analyze_sbs(struct mddev *mddev)
3302 if (mddev->max_disks && 3339 if (mddev->max_disks &&
3303 (rdev->desc_nr >= mddev->max_disks || 3340 (rdev->desc_nr >= mddev->max_disks ||
3304 i > mddev->max_disks)) { 3341 i > mddev->max_disks)) {
3305 printk(KERN_WARNING 3342 pr_warn("md: %s: %s: only %d devices permitted\n",
3306 "md: %s: %s: only %d devices permitted\n", 3343 mdname(mddev), bdevname(rdev->bdev, b),
3307 mdname(mddev), bdevname(rdev->bdev, b), 3344 mddev->max_disks);
3308 mddev->max_disks);
3309 md_kick_rdev_from_array(rdev); 3345 md_kick_rdev_from_array(rdev);
3310 continue; 3346 continue;
3311 } 3347 }
3312 if (rdev != freshest) { 3348 if (rdev != freshest) {
3313 if (super_types[mddev->major_version]. 3349 if (super_types[mddev->major_version].
3314 validate_super(mddev, rdev)) { 3350 validate_super(mddev, rdev)) {
3315 printk(KERN_WARNING "md: kicking non-fresh %s" 3351 pr_warn("md: kicking non-fresh %s from array!\n",
3316 " from array!\n",
3317 bdevname(rdev->bdev,b)); 3352 bdevname(rdev->bdev,b));
3318 md_kick_rdev_from_array(rdev); 3353 md_kick_rdev_from_array(rdev);
3319 continue; 3354 continue;
@@ -3384,7 +3419,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3384 unsigned long msec; 3419 unsigned long msec;
3385 3420
3386 if (mddev_is_clustered(mddev)) { 3421 if (mddev_is_clustered(mddev)) {
3387 pr_info("md: Safemode is disabled for clustered mode\n"); 3422 pr_warn("md: Safemode is disabled for clustered mode\n");
3388 return -EINVAL; 3423 return -EINVAL;
3389 } 3424 }
3390 3425
@@ -3472,8 +3507,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3472 3507
3473 rv = -EINVAL; 3508 rv = -EINVAL;
3474 if (!mddev->pers->quiesce) { 3509 if (!mddev->pers->quiesce) {
3475 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3510 pr_warn("md: %s: %s does not support online personality change\n",
3476 mdname(mddev), mddev->pers->name); 3511 mdname(mddev), mddev->pers->name);
3477 goto out_unlock; 3512 goto out_unlock;
3478 } 3513 }
3479 3514
@@ -3491,7 +3526,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3491 pers = find_pers(level, clevel); 3526 pers = find_pers(level, clevel);
3492 if (!pers || !try_module_get(pers->owner)) { 3527 if (!pers || !try_module_get(pers->owner)) {
3493 spin_unlock(&pers_lock); 3528 spin_unlock(&pers_lock);
3494 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3529 pr_warn("md: personality %s not loaded\n", clevel);
3495 rv = -EINVAL; 3530 rv = -EINVAL;
3496 goto out_unlock; 3531 goto out_unlock;
3497 } 3532 }
@@ -3505,8 +3540,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3505 } 3540 }
3506 if (!pers->takeover) { 3541 if (!pers->takeover) {
3507 module_put(pers->owner); 3542 module_put(pers->owner);
3508 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3543 pr_warn("md: %s: %s does not support personality takeover\n",
3509 mdname(mddev), clevel); 3544 mdname(mddev), clevel);
3510 rv = -EINVAL; 3545 rv = -EINVAL;
3511 goto out_unlock; 3546 goto out_unlock;
3512 } 3547 }
@@ -3526,8 +3561,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3526 mddev->delta_disks = 0; 3561 mddev->delta_disks = 0;
3527 mddev->reshape_backwards = 0; 3562 mddev->reshape_backwards = 0;
3528 module_put(pers->owner); 3563 module_put(pers->owner);
3529 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3564 pr_warn("md: %s: %s would not accept array\n",
3530 mdname(mddev), clevel); 3565 mdname(mddev), clevel);
3531 rv = PTR_ERR(priv); 3566 rv = PTR_ERR(priv);
3532 goto out_unlock; 3567 goto out_unlock;
3533 } 3568 }
@@ -3570,9 +3605,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3570 pers->sync_request != NULL) { 3605 pers->sync_request != NULL) {
3571 /* need to add the md_redundancy_group */ 3606 /* need to add the md_redundancy_group */
3572 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3607 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3573 printk(KERN_WARNING 3608 pr_warn("md: cannot register extra attributes for %s\n",
3574 "md: cannot register extra attributes for %s\n", 3609 mdname(mddev));
3575 mdname(mddev));
3576 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3610 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3577 } 3611 }
3578 if (oldpers->sync_request != NULL && 3612 if (oldpers->sync_request != NULL &&
@@ -3603,9 +3637,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3603 clear_bit(In_sync, &rdev->flags); 3637 clear_bit(In_sync, &rdev->flags);
3604 else { 3638 else {
3605 if (sysfs_link_rdev(mddev, rdev)) 3639 if (sysfs_link_rdev(mddev, rdev))
3606 printk(KERN_WARNING "md: cannot register rd%d" 3640 pr_warn("md: cannot register rd%d for %s after level change\n",
3607 " for %s after level change\n", 3641 rdev->raid_disk, mdname(mddev));
3608 rdev->raid_disk, mdname(mddev));
3609 } 3642 }
3610 } 3643 }
3611 3644
@@ -3618,7 +3651,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3618 } 3651 }
3619 blk_set_stacking_limits(&mddev->queue->limits); 3652 blk_set_stacking_limits(&mddev->queue->limits);
3620 pers->run(mddev); 3653 pers->run(mddev);
3621 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3654 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3622 mddev_resume(mddev); 3655 mddev_resume(mddev);
3623 if (!mddev->thread) 3656 if (!mddev->thread)
3624 md_update_sb(mddev, 1); 3657 md_update_sb(mddev, 1);
@@ -3813,7 +3846,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3813 if (!err) { 3846 if (!err) {
3814 mddev->recovery_cp = n; 3847 mddev->recovery_cp = n;
3815 if (mddev->pers) 3848 if (mddev->pers)
3816 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3849 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
3817 } 3850 }
3818 mddev_unlock(mddev); 3851 mddev_unlock(mddev);
3819 return err ?: len; 3852 return err ?: len;
@@ -3887,7 +3920,7 @@ array_state_show(struct mddev *mddev, char *page)
3887 st = read_auto; 3920 st = read_auto;
3888 break; 3921 break;
3889 case 0: 3922 case 0:
3890 if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 3923 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
3891 st = write_pending; 3924 st = write_pending;
3892 else if (mddev->in_sync) 3925 else if (mddev->in_sync)
3893 st = clean; 3926 st = clean;
@@ -3925,7 +3958,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3925 spin_lock(&mddev->lock); 3958 spin_lock(&mddev->lock);
3926 if (st == active) { 3959 if (st == active) {
3927 restart_array(mddev); 3960 restart_array(mddev);
3928 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3961 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
3962 md_wakeup_thread(mddev->thread);
3929 wake_up(&mddev->sb_wait); 3963 wake_up(&mddev->sb_wait);
3930 err = 0; 3964 err = 0;
3931 } else /* st == clean */ { 3965 } else /* st == clean */ {
@@ -3935,7 +3969,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3935 mddev->in_sync = 1; 3969 mddev->in_sync = 1;
3936 if (mddev->safemode == 1) 3970 if (mddev->safemode == 1)
3937 mddev->safemode = 0; 3971 mddev->safemode = 0;
3938 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3972 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
3939 } 3973 }
3940 err = 0; 3974 err = 0;
3941 } else 3975 } else
@@ -4001,7 +4035,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
4001 mddev->in_sync = 1; 4035 mddev->in_sync = 1;
4002 if (mddev->safemode == 1) 4036 if (mddev->safemode == 1)
4003 mddev->safemode = 0; 4037 mddev->safemode = 0;
4004 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 4038 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4005 } 4039 }
4006 err = 0; 4040 err = 0;
4007 } else 4041 } else
@@ -4015,7 +4049,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
4015 err = restart_array(mddev); 4049 err = restart_array(mddev);
4016 if (err) 4050 if (err)
4017 break; 4051 break;
4018 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 4052 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4019 wake_up(&mddev->sb_wait); 4053 wake_up(&mddev->sb_wait);
4020 err = 0; 4054 err = 0;
4021 } else { 4055 } else {
@@ -5071,13 +5105,13 @@ static int md_alloc(dev_t dev, char *name)
5071 /* This isn't possible, but as kobject_init_and_add is marked 5105 /* This isn't possible, but as kobject_init_and_add is marked
5072 * __must_check, we must do something with the result 5106 * __must_check, we must do something with the result
5073 */ 5107 */
5074 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 5108 pr_debug("md: cannot register %s/md - name in use\n",
5075 disk->disk_name); 5109 disk->disk_name);
5076 error = 0; 5110 error = 0;
5077 } 5111 }
5078 if (mddev->kobj.sd && 5112 if (mddev->kobj.sd &&
5079 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5113 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5080 printk(KERN_DEBUG "pointless warning\n"); 5114 pr_debug("pointless warning\n");
5081 mutex_unlock(&mddev->open_mutex); 5115 mutex_unlock(&mddev->open_mutex);
5082 abort: 5116 abort:
5083 mutex_unlock(&disks_mutex); 5117 mutex_unlock(&disks_mutex);
@@ -5179,15 +5213,15 @@ int md_run(struct mddev *mddev)
5179 if (mddev->dev_sectors && 5213 if (mddev->dev_sectors &&
5180 rdev->data_offset + mddev->dev_sectors 5214 rdev->data_offset + mddev->dev_sectors
5181 > rdev->sb_start) { 5215 > rdev->sb_start) {
5182 printk("md: %s: data overlaps metadata\n", 5216 pr_warn("md: %s: data overlaps metadata\n",
5183 mdname(mddev)); 5217 mdname(mddev));
5184 return -EINVAL; 5218 return -EINVAL;
5185 } 5219 }
5186 } else { 5220 } else {
5187 if (rdev->sb_start + rdev->sb_size/512 5221 if (rdev->sb_start + rdev->sb_size/512
5188 > rdev->data_offset) { 5222 > rdev->data_offset) {
5189 printk("md: %s: metadata overlaps data\n", 5223 pr_warn("md: %s: metadata overlaps data\n",
5190 mdname(mddev)); 5224 mdname(mddev));
5191 return -EINVAL; 5225 return -EINVAL;
5192 } 5226 }
5193 } 5227 }
@@ -5202,11 +5236,11 @@ int md_run(struct mddev *mddev)
5202 if (!pers || !try_module_get(pers->owner)) { 5236 if (!pers || !try_module_get(pers->owner)) {
5203 spin_unlock(&pers_lock); 5237 spin_unlock(&pers_lock);
5204 if (mddev->level != LEVEL_NONE) 5238 if (mddev->level != LEVEL_NONE)
5205 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 5239 pr_warn("md: personality for level %d is not loaded!\n",
5206 mddev->level); 5240 mddev->level);
5207 else 5241 else
5208 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 5242 pr_warn("md: personality for level %s is not loaded!\n",
5209 mddev->clevel); 5243 mddev->clevel);
5210 return -EINVAL; 5244 return -EINVAL;
5211 } 5245 }
5212 spin_unlock(&pers_lock); 5246 spin_unlock(&pers_lock);
@@ -5236,21 +5270,16 @@ int md_run(struct mddev *mddev)
5236 if (rdev < rdev2 && 5270 if (rdev < rdev2 &&
5237 rdev->bdev->bd_contains == 5271 rdev->bdev->bd_contains ==
5238 rdev2->bdev->bd_contains) { 5272 rdev2->bdev->bd_contains) {
5239 printk(KERN_WARNING 5273 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5240 "%s: WARNING: %s appears to be" 5274 mdname(mddev),
5241 " on the same physical disk as" 5275 bdevname(rdev->bdev,b),
5242 " %s.\n", 5276 bdevname(rdev2->bdev,b2));
5243 mdname(mddev),
5244 bdevname(rdev->bdev,b),
5245 bdevname(rdev2->bdev,b2));
5246 warned = 1; 5277 warned = 1;
5247 } 5278 }
5248 } 5279 }
5249 5280
5250 if (warned) 5281 if (warned)
5251 printk(KERN_WARNING 5282 pr_warn("True protection against single-disk failure might be compromised.\n");
5252 "True protection against single-disk"
5253 " failure might be compromised.\n");
5254 } 5283 }
5255 5284
5256 mddev->recovery = 0; 5285 mddev->recovery = 0;
@@ -5264,14 +5293,14 @@ int md_run(struct mddev *mddev)
5264 5293
5265 err = pers->run(mddev); 5294 err = pers->run(mddev);
5266 if (err) 5295 if (err)
5267 printk(KERN_ERR "md: pers->run() failed ...\n"); 5296 pr_warn("md: pers->run() failed ...\n");
5268 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5297 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5269 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5298 WARN_ONCE(!mddev->external_size,
5270 " but 'external_size' not in effect?\n", __func__); 5299 "%s: default size too small, but 'external_size' not in effect?\n",
5271 printk(KERN_ERR 5300 __func__);
5272 "md: invalid array_size %llu > default size %llu\n", 5301 pr_warn("md: invalid array_size %llu > default size %llu\n",
5273 (unsigned long long)mddev->array_sectors / 2, 5302 (unsigned long long)mddev->array_sectors / 2,
5274 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5303 (unsigned long long)pers->size(mddev, 0, 0) / 2);
5275 err = -EINVAL; 5304 err = -EINVAL;
5276 } 5305 }
5277 if (err == 0 && pers->sync_request && 5306 if (err == 0 && pers->sync_request &&
@@ -5281,8 +5310,8 @@ int md_run(struct mddev *mddev)
5281 bitmap = bitmap_create(mddev, -1); 5310 bitmap = bitmap_create(mddev, -1);
5282 if (IS_ERR(bitmap)) { 5311 if (IS_ERR(bitmap)) {
5283 err = PTR_ERR(bitmap); 5312 err = PTR_ERR(bitmap);
5284 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5313 pr_warn("%s: failed to create bitmap (%d)\n",
5285 mdname(mddev), err); 5314 mdname(mddev), err);
5286 } else 5315 } else
5287 mddev->bitmap = bitmap; 5316 mddev->bitmap = bitmap;
5288 5317
@@ -5318,9 +5347,8 @@ int md_run(struct mddev *mddev)
5318 if (pers->sync_request) { 5347 if (pers->sync_request) {
5319 if (mddev->kobj.sd && 5348 if (mddev->kobj.sd &&
5320 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5349 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5321 printk(KERN_WARNING 5350 pr_warn("md: cannot register extra attributes for %s\n",
5322 "md: cannot register extra attributes for %s\n", 5351 mdname(mddev));
5323 mdname(mddev));
5324 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5352 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5325 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5353 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
5326 mddev->ro = 0; 5354 mddev->ro = 0;
@@ -5350,7 +5378,7 @@ int md_run(struct mddev *mddev)
5350 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5378 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5351 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5379 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5352 5380
5353 if (mddev->flags & MD_UPDATE_SB_FLAGS) 5381 if (mddev->sb_flags)
5354 md_update_sb(mddev, 0); 5382 md_update_sb(mddev, 0);
5355 5383
5356 md_new_event(mddev); 5384 md_new_event(mddev);
@@ -5421,8 +5449,7 @@ static int restart_array(struct mddev *mddev)
5421 mddev->safemode = 0; 5449 mddev->safemode = 0;
5422 mddev->ro = 0; 5450 mddev->ro = 0;
5423 set_disk_ro(disk, 0); 5451 set_disk_ro(disk, 0);
5424 printk(KERN_INFO "md: %s switched to read-write mode.\n", 5452 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
5425 mdname(mddev));
5426 /* Kick recovery or resync if necessary */ 5453 /* Kick recovery or resync if necessary */
5427 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5454 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5428 md_wakeup_thread(mddev->thread); 5455 md_wakeup_thread(mddev->thread);
@@ -5446,6 +5473,7 @@ static void md_clean(struct mddev *mddev)
5446 mddev->level = LEVEL_NONE; 5473 mddev->level = LEVEL_NONE;
5447 mddev->clevel[0] = 0; 5474 mddev->clevel[0] = 0;
5448 mddev->flags = 0; 5475 mddev->flags = 0;
5476 mddev->sb_flags = 0;
5449 mddev->ro = 0; 5477 mddev->ro = 0;
5450 mddev->metadata_type[0] = 0; 5478 mddev->metadata_type[0] = 0;
5451 mddev->chunk_sectors = 0; 5479 mddev->chunk_sectors = 0;
@@ -5490,12 +5518,15 @@ static void __md_stop_writes(struct mddev *mddev)
5490 5518
5491 del_timer_sync(&mddev->safemode_timer); 5519 del_timer_sync(&mddev->safemode_timer);
5492 5520
5521 if (mddev->pers && mddev->pers->quiesce) {
5522 mddev->pers->quiesce(mddev, 1);
5523 mddev->pers->quiesce(mddev, 0);
5524 }
5493 bitmap_flush(mddev); 5525 bitmap_flush(mddev);
5494 md_super_wait(mddev);
5495 5526
5496 if (mddev->ro == 0 && 5527 if (mddev->ro == 0 &&
5497 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 5528 ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5498 (mddev->flags & MD_UPDATE_SB_FLAGS))) { 5529 mddev->sb_flags)) {
5499 /* mark array as shutdown cleanly */ 5530 /* mark array as shutdown cleanly */
5500 if (!mddev_is_clustered(mddev)) 5531 if (!mddev_is_clustered(mddev))
5501 mddev->in_sync = 1; 5532 mddev->in_sync = 1;
@@ -5516,8 +5547,8 @@ static void mddev_detach(struct mddev *mddev)
5516 struct bitmap *bitmap = mddev->bitmap; 5547 struct bitmap *bitmap = mddev->bitmap;
5517 /* wait for behind writes to complete */ 5548 /* wait for behind writes to complete */
5518 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 5549 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
5519 printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", 5550 pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
5520 mdname(mddev)); 5551 mdname(mddev));
5521 /* need to kick something here to make sure I/O goes? */ 5552 /* need to kick something here to make sure I/O goes? */
5522 wait_event(bitmap->behind_wait, 5553 wait_event(bitmap->behind_wait,
5523 atomic_read(&bitmap->behind_writes) == 0); 5554 atomic_read(&bitmap->behind_writes) == 0);
@@ -5578,20 +5609,20 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5578 * which will now never happen */ 5609 * which will now never happen */
5579 wake_up_process(mddev->sync_thread->tsk); 5610 wake_up_process(mddev->sync_thread->tsk);
5580 5611
5581 if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) 5612 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
5582 return -EBUSY; 5613 return -EBUSY;
5583 mddev_unlock(mddev); 5614 mddev_unlock(mddev);
5584 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5615 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
5585 &mddev->recovery)); 5616 &mddev->recovery));
5586 wait_event(mddev->sb_wait, 5617 wait_event(mddev->sb_wait,
5587 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5618 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
5588 mddev_lock_nointr(mddev); 5619 mddev_lock_nointr(mddev);
5589 5620
5590 mutex_lock(&mddev->open_mutex); 5621 mutex_lock(&mddev->open_mutex);
5591 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5622 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5592 mddev->sync_thread || 5623 mddev->sync_thread ||
5593 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5624 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5594 printk("md: %s still in use.\n",mdname(mddev)); 5625 pr_warn("md: %s still in use.\n",mdname(mddev));
5595 if (did_freeze) { 5626 if (did_freeze) {
5596 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5627 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5597 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5628 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5653,7 +5684,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
5653 mddev->sysfs_active || 5684 mddev->sysfs_active ||
5654 mddev->sync_thread || 5685 mddev->sync_thread ||
5655 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5686 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5656 printk("md: %s still in use.\n",mdname(mddev)); 5687 pr_warn("md: %s still in use.\n",mdname(mddev));
5657 mutex_unlock(&mddev->open_mutex); 5688 mutex_unlock(&mddev->open_mutex);
5658 if (did_freeze) { 5689 if (did_freeze) {
5659 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5690 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -5690,7 +5721,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
5690 * Free resources if final stop 5721 * Free resources if final stop
5691 */ 5722 */
5692 if (mode == 0) { 5723 if (mode == 0) {
5693 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 5724 pr_info("md: %s stopped.\n", mdname(mddev));
5694 5725
5695 bitmap_destroy(mddev); 5726 bitmap_destroy(mddev);
5696 if (mddev->bitmap_info.file) { 5727 if (mddev->bitmap_info.file) {
@@ -5722,17 +5753,17 @@ static void autorun_array(struct mddev *mddev)
5722 if (list_empty(&mddev->disks)) 5753 if (list_empty(&mddev->disks))
5723 return; 5754 return;
5724 5755
5725 printk(KERN_INFO "md: running: "); 5756 pr_info("md: running: ");
5726 5757
5727 rdev_for_each(rdev, mddev) { 5758 rdev_for_each(rdev, mddev) {
5728 char b[BDEVNAME_SIZE]; 5759 char b[BDEVNAME_SIZE];
5729 printk("<%s>", bdevname(rdev->bdev,b)); 5760 pr_cont("<%s>", bdevname(rdev->bdev,b));
5730 } 5761 }
5731 printk("\n"); 5762 pr_cont("\n");
5732 5763
5733 err = do_md_run(mddev); 5764 err = do_md_run(mddev);
5734 if (err) { 5765 if (err) {
5735 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5766 pr_warn("md: do_md_run() returned %d\n", err);
5736 do_md_stop(mddev, 0, NULL); 5767 do_md_stop(mddev, 0, NULL);
5737 } 5768 }
5738} 5769}
@@ -5755,7 +5786,7 @@ static void autorun_devices(int part)
5755 struct mddev *mddev; 5786 struct mddev *mddev;
5756 char b[BDEVNAME_SIZE]; 5787 char b[BDEVNAME_SIZE];
5757 5788
5758 printk(KERN_INFO "md: autorun ...\n"); 5789 pr_info("md: autorun ...\n");
5759 while (!list_empty(&pending_raid_disks)) { 5790 while (!list_empty(&pending_raid_disks)) {
5760 int unit; 5791 int unit;
5761 dev_t dev; 5792 dev_t dev;
@@ -5763,13 +5794,12 @@ static void autorun_devices(int part)
5763 rdev0 = list_entry(pending_raid_disks.next, 5794 rdev0 = list_entry(pending_raid_disks.next,
5764 struct md_rdev, same_set); 5795 struct md_rdev, same_set);
5765 5796
5766 printk(KERN_INFO "md: considering %s ...\n", 5797 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
5767 bdevname(rdev0->bdev,b));
5768 INIT_LIST_HEAD(&candidates); 5798 INIT_LIST_HEAD(&candidates);
5769 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5799 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5770 if (super_90_load(rdev, rdev0, 0) >= 0) { 5800 if (super_90_load(rdev, rdev0, 0) >= 0) {
5771 printk(KERN_INFO "md: adding %s ...\n", 5801 pr_debug("md: adding %s ...\n",
5772 bdevname(rdev->bdev,b)); 5802 bdevname(rdev->bdev,b));
5773 list_move(&rdev->same_set, &candidates); 5803 list_move(&rdev->same_set, &candidates);
5774 } 5804 }
5775 /* 5805 /*
@@ -5786,8 +5816,8 @@ static void autorun_devices(int part)
5786 unit = MINOR(dev); 5816 unit = MINOR(dev);
5787 } 5817 }
5788 if (rdev0->preferred_minor != unit) { 5818 if (rdev0->preferred_minor != unit) {
5789 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 5819 pr_warn("md: unit number in %s is bad: %d\n",
5790 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5820 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5791 break; 5821 break;
5792 } 5822 }
5793 5823
@@ -5796,21 +5826,17 @@ static void autorun_devices(int part)
5796 if (!mddev || !mddev->gendisk) { 5826 if (!mddev || !mddev->gendisk) {
5797 if (mddev) 5827 if (mddev)
5798 mddev_put(mddev); 5828 mddev_put(mddev);
5799 printk(KERN_ERR
5800 "md: cannot allocate memory for md drive.\n");
5801 break; 5829 break;
5802 } 5830 }
5803 if (mddev_lock(mddev)) 5831 if (mddev_lock(mddev))
5804 printk(KERN_WARNING "md: %s locked, cannot run\n", 5832 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
5805 mdname(mddev));
5806 else if (mddev->raid_disks || mddev->major_version 5833 else if (mddev->raid_disks || mddev->major_version
5807 || !list_empty(&mddev->disks)) { 5834 || !list_empty(&mddev->disks)) {
5808 printk(KERN_WARNING 5835 pr_warn("md: %s already running, cannot run %s\n",
5809 "md: %s already running, cannot run %s\n",
5810 mdname(mddev), bdevname(rdev0->bdev,b)); 5836 mdname(mddev), bdevname(rdev0->bdev,b));
5811 mddev_unlock(mddev); 5837 mddev_unlock(mddev);
5812 } else { 5838 } else {
5813 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 5839 pr_debug("md: created %s\n", mdname(mddev));
5814 mddev->persistent = 1; 5840 mddev->persistent = 1;
5815 rdev_for_each_list(rdev, tmp, &candidates) { 5841 rdev_for_each_list(rdev, tmp, &candidates) {
5816 list_del_init(&rdev->same_set); 5842 list_del_init(&rdev->same_set);
@@ -5829,7 +5855,7 @@ static void autorun_devices(int part)
5829 } 5855 }
5830 mddev_put(mddev); 5856 mddev_put(mddev);
5831 } 5857 }
5832 printk(KERN_INFO "md: ... autorun DONE.\n"); 5858 pr_info("md: ... autorun DONE.\n");
5833} 5859}
5834#endif /* !MODULE */ 5860#endif /* !MODULE */
5835 5861
@@ -5964,6 +5990,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
5964 info.state |= (1<<MD_DISK_JOURNAL); 5990 info.state |= (1<<MD_DISK_JOURNAL);
5965 if (test_bit(WriteMostly, &rdev->flags)) 5991 if (test_bit(WriteMostly, &rdev->flags))
5966 info.state |= (1<<MD_DISK_WRITEMOSTLY); 5992 info.state |= (1<<MD_DISK_WRITEMOSTLY);
5993 if (test_bit(FailFast, &rdev->flags))
5994 info.state |= (1<<MD_DISK_FAILFAST);
5967 } else { 5995 } else {
5968 info.major = info.minor = 0; 5996 info.major = info.minor = 0;
5969 info.raid_disk = -1; 5997 info.raid_disk = -1;
@@ -5985,8 +6013,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
5985 6013
5986 if (mddev_is_clustered(mddev) && 6014 if (mddev_is_clustered(mddev) &&
5987 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6015 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
5988 pr_err("%s: Cannot add to clustered mddev.\n", 6016 pr_warn("%s: Cannot add to clustered mddev.\n",
5989 mdname(mddev)); 6017 mdname(mddev));
5990 return -EINVAL; 6018 return -EINVAL;
5991 } 6019 }
5992 6020
@@ -5998,8 +6026,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
5998 /* expecting a device which has a superblock */ 6026 /* expecting a device which has a superblock */
5999 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6027 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6000 if (IS_ERR(rdev)) { 6028 if (IS_ERR(rdev)) {
6001 printk(KERN_WARNING 6029 pr_warn("md: md_import_device returned %ld\n",
6002 "md: md_import_device returned %ld\n",
6003 PTR_ERR(rdev)); 6030 PTR_ERR(rdev));
6004 return PTR_ERR(rdev); 6031 return PTR_ERR(rdev);
6005 } 6032 }
@@ -6010,8 +6037,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6010 err = super_types[mddev->major_version] 6037 err = super_types[mddev->major_version]
6011 .load_super(rdev, rdev0, mddev->minor_version); 6038 .load_super(rdev, rdev0, mddev->minor_version);
6012 if (err < 0) { 6039 if (err < 0) {
6013 printk(KERN_WARNING 6040 pr_warn("md: %s has different UUID to %s\n",
6014 "md: %s has different UUID to %s\n",
6015 bdevname(rdev->bdev,b), 6041 bdevname(rdev->bdev,b),
6016 bdevname(rdev0->bdev,b2)); 6042 bdevname(rdev0->bdev,b2));
6017 export_rdev(rdev); 6043 export_rdev(rdev);
@@ -6032,9 +6058,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6032 if (mddev->pers) { 6058 if (mddev->pers) {
6033 int err; 6059 int err;
6034 if (!mddev->pers->hot_add_disk) { 6060 if (!mddev->pers->hot_add_disk) {
6035 printk(KERN_WARNING 6061 pr_warn("%s: personality does not support diskops!\n",
6036 "%s: personality does not support diskops!\n", 6062 mdname(mddev));
6037 mdname(mddev));
6038 return -EINVAL; 6063 return -EINVAL;
6039 } 6064 }
6040 if (mddev->persistent) 6065 if (mddev->persistent)
@@ -6043,8 +6068,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6043 else 6068 else
6044 rdev = md_import_device(dev, -1, -1); 6069 rdev = md_import_device(dev, -1, -1);
6045 if (IS_ERR(rdev)) { 6070 if (IS_ERR(rdev)) {
6046 printk(KERN_WARNING 6071 pr_warn("md: md_import_device returned %ld\n",
6047 "md: md_import_device returned %ld\n",
6048 PTR_ERR(rdev)); 6072 PTR_ERR(rdev));
6049 return PTR_ERR(rdev); 6073 return PTR_ERR(rdev);
6050 } 6074 }
@@ -6075,6 +6099,10 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6075 set_bit(WriteMostly, &rdev->flags); 6099 set_bit(WriteMostly, &rdev->flags);
6076 else 6100 else
6077 clear_bit(WriteMostly, &rdev->flags); 6101 clear_bit(WriteMostly, &rdev->flags);
6102 if (info->state & (1<<MD_DISK_FAILFAST))
6103 set_bit(FailFast, &rdev->flags);
6104 else
6105 clear_bit(FailFast, &rdev->flags);
6078 6106
6079 if (info->state & (1<<MD_DISK_JOURNAL)) { 6107 if (info->state & (1<<MD_DISK_JOURNAL)) {
6080 struct md_rdev *rdev2; 6108 struct md_rdev *rdev2;
@@ -6140,8 +6168,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6140 * for major_version==0 superblocks 6168 * for major_version==0 superblocks
6141 */ 6169 */
6142 if (mddev->major_version != 0) { 6170 if (mddev->major_version != 0) {
6143 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 6171 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6144 mdname(mddev));
6145 return -EINVAL; 6172 return -EINVAL;
6146 } 6173 }
6147 6174
@@ -6149,8 +6176,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6149 int err; 6176 int err;
6150 rdev = md_import_device(dev, -1, 0); 6177 rdev = md_import_device(dev, -1, 0);
6151 if (IS_ERR(rdev)) { 6178 if (IS_ERR(rdev)) {
6152 printk(KERN_WARNING 6179 pr_warn("md: error, md_import_device() returned %ld\n",
6153 "md: error, md_import_device() returned %ld\n",
6154 PTR_ERR(rdev)); 6180 PTR_ERR(rdev));
6155 return PTR_ERR(rdev); 6181 return PTR_ERR(rdev);
6156 } 6182 }
@@ -6166,9 +6192,11 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6166 6192
6167 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6193 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6168 set_bit(WriteMostly, &rdev->flags); 6194 set_bit(WriteMostly, &rdev->flags);
6195 if (info->state & (1<<MD_DISK_FAILFAST))
6196 set_bit(FailFast, &rdev->flags);
6169 6197
6170 if (!mddev->persistent) { 6198 if (!mddev->persistent) {
6171 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 6199 pr_debug("md: nonpersistent superblock ...\n");
6172 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6200 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6173 } else 6201 } else
6174 rdev->sb_start = calc_dev_sboffset(rdev); 6202 rdev->sb_start = calc_dev_sboffset(rdev);
@@ -6207,13 +6235,17 @@ kick_rdev:
6207 md_cluster_ops->remove_disk(mddev, rdev); 6235 md_cluster_ops->remove_disk(mddev, rdev);
6208 6236
6209 md_kick_rdev_from_array(rdev); 6237 md_kick_rdev_from_array(rdev);
6210 md_update_sb(mddev, 1); 6238 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6239 if (mddev->thread)
6240 md_wakeup_thread(mddev->thread);
6241 else
6242 md_update_sb(mddev, 1);
6211 md_new_event(mddev); 6243 md_new_event(mddev);
6212 6244
6213 return 0; 6245 return 0;
6214busy: 6246busy:
6215 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 6247 pr_debug("md: cannot remove active disk %s from %s ...\n",
6216 bdevname(rdev->bdev,b), mdname(mddev)); 6248 bdevname(rdev->bdev,b), mdname(mddev));
6217 return -EBUSY; 6249 return -EBUSY;
6218} 6250}
6219 6251
@@ -6227,22 +6259,19 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
6227 return -ENODEV; 6259 return -ENODEV;
6228 6260
6229 if (mddev->major_version != 0) { 6261 if (mddev->major_version != 0) {
6230 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 6262 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6231 " version-0 superblocks.\n",
6232 mdname(mddev)); 6263 mdname(mddev));
6233 return -EINVAL; 6264 return -EINVAL;
6234 } 6265 }
6235 if (!mddev->pers->hot_add_disk) { 6266 if (!mddev->pers->hot_add_disk) {
6236 printk(KERN_WARNING 6267 pr_warn("%s: personality does not support diskops!\n",
6237 "%s: personality does not support diskops!\n",
6238 mdname(mddev)); 6268 mdname(mddev));
6239 return -EINVAL; 6269 return -EINVAL;
6240 } 6270 }
6241 6271
6242 rdev = md_import_device(dev, -1, 0); 6272 rdev = md_import_device(dev, -1, 0);
6243 if (IS_ERR(rdev)) { 6273 if (IS_ERR(rdev)) {
6244 printk(KERN_WARNING 6274 pr_warn("md: error, md_import_device() returned %ld\n",
6245 "md: error, md_import_device() returned %ld\n",
6246 PTR_ERR(rdev)); 6275 PTR_ERR(rdev));
6247 return -EINVAL; 6276 return -EINVAL;
6248 } 6277 }
@@ -6255,8 +6284,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
6255 rdev->sectors = rdev->sb_start; 6284 rdev->sectors = rdev->sb_start;
6256 6285
6257 if (test_bit(Faulty, &rdev->flags)) { 6286 if (test_bit(Faulty, &rdev->flags)) {
6258 printk(KERN_WARNING 6287 pr_warn("md: can not hot-add faulty %s disk to %s!\n",
6259 "md: can not hot-add faulty %s disk to %s!\n",
6260 bdevname(rdev->bdev,b), mdname(mddev)); 6288 bdevname(rdev->bdev,b), mdname(mddev));
6261 err = -EINVAL; 6289 err = -EINVAL;
6262 goto abort_export; 6290 goto abort_export;
@@ -6276,7 +6304,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
6276 6304
6277 rdev->raid_disk = -1; 6305 rdev->raid_disk = -1;
6278 6306
6279 md_update_sb(mddev, 1); 6307 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6308 if (!mddev->thread)
6309 md_update_sb(mddev, 1);
6280 /* 6310 /*
6281 * Kick recovery, maybe this spare has to be added to the 6311 * Kick recovery, maybe this spare has to be added to the
6282 * array immediately. 6312 * array immediately.
@@ -6312,23 +6342,23 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
6312 f = fget(fd); 6342 f = fget(fd);
6313 6343
6314 if (f == NULL) { 6344 if (f == NULL) {
6315 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 6345 pr_warn("%s: error: failed to get bitmap file\n",
6316 mdname(mddev)); 6346 mdname(mddev));
6317 return -EBADF; 6347 return -EBADF;
6318 } 6348 }
6319 6349
6320 inode = f->f_mapping->host; 6350 inode = f->f_mapping->host;
6321 if (!S_ISREG(inode->i_mode)) { 6351 if (!S_ISREG(inode->i_mode)) {
6322 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", 6352 pr_warn("%s: error: bitmap file must be a regular file\n",
6323 mdname(mddev)); 6353 mdname(mddev));
6324 err = -EBADF; 6354 err = -EBADF;
6325 } else if (!(f->f_mode & FMODE_WRITE)) { 6355 } else if (!(f->f_mode & FMODE_WRITE)) {
6326 printk(KERN_ERR "%s: error: bitmap file must open for write\n", 6356 pr_warn("%s: error: bitmap file must open for write\n",
6327 mdname(mddev)); 6357 mdname(mddev));
6328 err = -EBADF; 6358 err = -EBADF;
6329 } else if (atomic_read(&inode->i_writecount) != 1) { 6359 } else if (atomic_read(&inode->i_writecount) != 1) {
6330 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 6360 pr_warn("%s: error: bitmap file is already in use\n",
6331 mdname(mddev)); 6361 mdname(mddev));
6332 err = -EBUSY; 6362 err = -EBUSY;
6333 } 6363 }
6334 if (err) { 6364 if (err) {
@@ -6393,8 +6423,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6393 info->major_version >= ARRAY_SIZE(super_types) || 6423 info->major_version >= ARRAY_SIZE(super_types) ||
6394 super_types[info->major_version].name == NULL) { 6424 super_types[info->major_version].name == NULL) {
6395 /* maybe try to auto-load a module? */ 6425 /* maybe try to auto-load a module? */
6396 printk(KERN_INFO 6426 pr_warn("md: superblock version %d not known\n",
6397 "md: superblock version %d not known\n",
6398 info->major_version); 6427 info->major_version);
6399 return -EINVAL; 6428 return -EINVAL;
6400 } 6429 }
@@ -6432,9 +6461,11 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6432 6461
6433 mddev->max_disks = MD_SB_DISKS; 6462 mddev->max_disks = MD_SB_DISKS;
6434 6463
6435 if (mddev->persistent) 6464 if (mddev->persistent) {
6436 mddev->flags = 0; 6465 mddev->flags = 0;
6437 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6466 mddev->sb_flags = 0;
6467 }
6468 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6438 6469
6439 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6470 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6440 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6471 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
@@ -6660,8 +6691,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6660 if (mddev->bitmap_info.nodes) { 6691 if (mddev->bitmap_info.nodes) {
6661 /* hold PW on all the bitmap lock */ 6692 /* hold PW on all the bitmap lock */
6662 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 6693 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
6663 printk("md: can't change bitmap to none since the" 6694 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
6664 " array is in use by more than one node\n");
6665 rv = -EPERM; 6695 rv = -EPERM;
6666 md_cluster_ops->unlock_all_bitmaps(mddev); 6696 md_cluster_ops->unlock_all_bitmaps(mddev);
6667 goto err; 6697 goto err;
@@ -6829,7 +6859,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6829 /* need to ensure recovery thread has run */ 6859 /* need to ensure recovery thread has run */
6830 wait_event_interruptible_timeout(mddev->sb_wait, 6860 wait_event_interruptible_timeout(mddev->sb_wait,
6831 !test_bit(MD_RECOVERY_NEEDED, 6861 !test_bit(MD_RECOVERY_NEEDED,
6832 &mddev->flags), 6862 &mddev->recovery),
6833 msecs_to_jiffies(5000)); 6863 msecs_to_jiffies(5000));
6834 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 6864 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
6835 /* Need to flush page cache, and ensure no-one else opens 6865 /* Need to flush page cache, and ensure no-one else opens
@@ -6847,9 +6877,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6847 } 6877 }
6848 err = mddev_lock(mddev); 6878 err = mddev_lock(mddev);
6849 if (err) { 6879 if (err) {
6850 printk(KERN_INFO 6880 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
6851 "md: ioctl lock interrupted, reason %d, cmd %d\n", 6881 err, cmd);
6852 err, cmd);
6853 goto out; 6882 goto out;
6854 } 6883 }
6855 6884
@@ -6864,30 +6893,24 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6864 if (mddev->pers) { 6893 if (mddev->pers) {
6865 err = update_array_info(mddev, &info); 6894 err = update_array_info(mddev, &info);
6866 if (err) { 6895 if (err) {
6867 printk(KERN_WARNING "md: couldn't update" 6896 pr_warn("md: couldn't update array info. %d\n", err);
6868 " array info. %d\n", err);
6869 goto unlock; 6897 goto unlock;
6870 } 6898 }
6871 goto unlock; 6899 goto unlock;
6872 } 6900 }
6873 if (!list_empty(&mddev->disks)) { 6901 if (!list_empty(&mddev->disks)) {
6874 printk(KERN_WARNING 6902 pr_warn("md: array %s already has disks!\n", mdname(mddev));
6875 "md: array %s already has disks!\n",
6876 mdname(mddev));
6877 err = -EBUSY; 6903 err = -EBUSY;
6878 goto unlock; 6904 goto unlock;
6879 } 6905 }
6880 if (mddev->raid_disks) { 6906 if (mddev->raid_disks) {
6881 printk(KERN_WARNING 6907 pr_warn("md: array %s already initialised!\n", mdname(mddev));
6882 "md: array %s already initialised!\n",
6883 mdname(mddev));
6884 err = -EBUSY; 6908 err = -EBUSY;
6885 goto unlock; 6909 goto unlock;
6886 } 6910 }
6887 err = set_array_info(mddev, &info); 6911 err = set_array_info(mddev, &info);
6888 if (err) { 6912 if (err) {
6889 printk(KERN_WARNING "md: couldn't set" 6913 pr_warn("md: couldn't set array info. %d\n", err);
6890 " array info. %d\n", err);
6891 goto unlock; 6914 goto unlock;
6892 } 6915 }
6893 goto unlock; 6916 goto unlock;
@@ -6987,11 +7010,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6987 /* If a device failed while we were read-only, we 7010 /* If a device failed while we were read-only, we
6988 * need to make sure the metadata is updated now. 7011 * need to make sure the metadata is updated now.
6989 */ 7012 */
6990 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 7013 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
6991 mddev_unlock(mddev); 7014 mddev_unlock(mddev);
6992 wait_event(mddev->sb_wait, 7015 wait_event(mddev->sb_wait,
6993 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 7016 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
6994 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7017 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6995 mddev_lock_nointr(mddev); 7018 mddev_lock_nointr(mddev);
6996 } 7019 }
6997 } else { 7020 } else {
@@ -7092,7 +7115,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
7092 7115
7093 if (test_bit(MD_CLOSING, &mddev->flags)) { 7116 if (test_bit(MD_CLOSING, &mddev->flags)) {
7094 mutex_unlock(&mddev->open_mutex); 7117 mutex_unlock(&mddev->open_mutex);
7095 return -ENODEV; 7118 err = -ENODEV;
7119 goto out;
7096 } 7120 }
7097 7121
7098 err = 0; 7122 err = 0;
@@ -7101,6 +7125,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
7101 7125
7102 check_disk_change(bdev); 7126 check_disk_change(bdev);
7103 out: 7127 out:
7128 if (err)
7129 mddev_put(mddev);
7104 return err; 7130 return err;
7105} 7131}
7106 7132
@@ -7171,10 +7197,12 @@ static int md_thread(void *arg)
7171 wait_event_interruptible_timeout 7197 wait_event_interruptible_timeout
7172 (thread->wqueue, 7198 (thread->wqueue,
7173 test_bit(THREAD_WAKEUP, &thread->flags) 7199 test_bit(THREAD_WAKEUP, &thread->flags)
7174 || kthread_should_stop(), 7200 || kthread_should_stop() || kthread_should_park(),
7175 thread->timeout); 7201 thread->timeout);
7176 7202
7177 clear_bit(THREAD_WAKEUP, &thread->flags); 7203 clear_bit(THREAD_WAKEUP, &thread->flags);
7204 if (kthread_should_park())
7205 kthread_parkme();
7178 if (!kthread_should_stop()) 7206 if (!kthread_should_stop())
7179 thread->run(thread); 7207 thread->run(thread);
7180 } 7208 }
@@ -7588,8 +7616,8 @@ static const struct file_operations md_seq_fops = {
7588 7616
7589int register_md_personality(struct md_personality *p) 7617int register_md_personality(struct md_personality *p)
7590{ 7618{
7591 printk(KERN_INFO "md: %s personality registered for level %d\n", 7619 pr_debug("md: %s personality registered for level %d\n",
7592 p->name, p->level); 7620 p->name, p->level);
7593 spin_lock(&pers_lock); 7621 spin_lock(&pers_lock);
7594 list_add_tail(&p->list, &pers_list); 7622 list_add_tail(&p->list, &pers_list);
7595 spin_unlock(&pers_lock); 7623 spin_unlock(&pers_lock);
@@ -7599,7 +7627,7 @@ EXPORT_SYMBOL(register_md_personality);
7599 7627
7600int unregister_md_personality(struct md_personality *p) 7628int unregister_md_personality(struct md_personality *p)
7601{ 7629{
7602 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 7630 pr_debug("md: %s personality unregistered\n", p->name);
7603 spin_lock(&pers_lock); 7631 spin_lock(&pers_lock);
7604 list_del_init(&p->list); 7632 list_del_init(&p->list);
7605 spin_unlock(&pers_lock); 7633 spin_unlock(&pers_lock);
@@ -7639,7 +7667,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
7639 spin_lock(&pers_lock); 7667 spin_lock(&pers_lock);
7640 /* ensure module won't be unloaded */ 7668 /* ensure module won't be unloaded */
7641 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 7669 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
7642 pr_err("can't find md-cluster module or get it's reference.\n"); 7670 pr_warn("can't find md-cluster module or get it's reference.\n");
7643 spin_unlock(&pers_lock); 7671 spin_unlock(&pers_lock);
7644 return -ENOENT; 7672 return -ENOENT;
7645 } 7673 }
@@ -7741,8 +7769,8 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
7741 spin_lock(&mddev->lock); 7769 spin_lock(&mddev->lock);
7742 if (mddev->in_sync) { 7770 if (mddev->in_sync) {
7743 mddev->in_sync = 0; 7771 mddev->in_sync = 0;
7744 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7772 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
7745 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7773 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
7746 md_wakeup_thread(mddev->thread); 7774 md_wakeup_thread(mddev->thread);
7747 did_change = 1; 7775 did_change = 1;
7748 } 7776 }
@@ -7751,7 +7779,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
7751 if (did_change) 7779 if (did_change)
7752 sysfs_notify_dirent_safe(mddev->sysfs_state); 7780 sysfs_notify_dirent_safe(mddev->sysfs_state);
7753 wait_event(mddev->sb_wait, 7781 wait_event(mddev->sb_wait,
7754 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7782 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7755} 7783}
7756EXPORT_SYMBOL(md_write_start); 7784EXPORT_SYMBOL(md_write_start);
7757 7785
@@ -7772,7 +7800,7 @@ EXPORT_SYMBOL(md_write_end);
7772 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7800 * attempting a GFP_KERNEL allocation while holding the mddev lock.
7773 * Must be called with mddev_lock held. 7801 * Must be called with mddev_lock held.
7774 * 7802 *
7775 * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock 7803 * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
7776 * is dropped, so return -EAGAIN after notifying userspace. 7804 * is dropped, so return -EAGAIN after notifying userspace.
7777 */ 7805 */
7778int md_allow_write(struct mddev *mddev) 7806int md_allow_write(struct mddev *mddev)
@@ -7787,8 +7815,8 @@ int md_allow_write(struct mddev *mddev)
7787 spin_lock(&mddev->lock); 7815 spin_lock(&mddev->lock);
7788 if (mddev->in_sync) { 7816 if (mddev->in_sync) {
7789 mddev->in_sync = 0; 7817 mddev->in_sync = 0;
7790 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7818 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
7791 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7819 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
7792 if (mddev->safemode_delay && 7820 if (mddev->safemode_delay &&
7793 mddev->safemode == 0) 7821 mddev->safemode == 0)
7794 mddev->safemode = 1; 7822 mddev->safemode = 1;
@@ -7798,7 +7826,7 @@ int md_allow_write(struct mddev *mddev)
7798 } else 7826 } else
7799 spin_unlock(&mddev->lock); 7827 spin_unlock(&mddev->lock);
7800 7828
7801 if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 7829 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
7802 return -EAGAIN; 7830 return -EAGAIN;
7803 else 7831 else
7804 return 0; 7832 return 0;
@@ -7914,11 +7942,9 @@ void md_do_sync(struct md_thread *thread)
7914 mddev2->curr_resync >= mddev->curr_resync) { 7942 mddev2->curr_resync >= mddev->curr_resync) {
7915 if (mddev2_minor != mddev2->md_minor) { 7943 if (mddev2_minor != mddev2->md_minor) {
7916 mddev2_minor = mddev2->md_minor; 7944 mddev2_minor = mddev2->md_minor;
7917 printk(KERN_INFO "md: delaying %s of %s" 7945 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
7918 " until %s has finished (they" 7946 desc, mdname(mddev),
7919 " share one or more physical units)\n", 7947 mdname(mddev2));
7920 desc, mdname(mddev),
7921 mdname(mddev2));
7922 } 7948 }
7923 mddev_put(mddev2); 7949 mddev_put(mddev2);
7924 if (signal_pending(current)) 7950 if (signal_pending(current))
@@ -7975,12 +8001,10 @@ void md_do_sync(struct md_thread *thread)
7975 } 8001 }
7976 } 8002 }
7977 8003
7978 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 8004 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
7979 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 8005 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
7980 " %d KB/sec/disk.\n", speed_min(mddev)); 8006 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
7981 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 8007 speed_max(mddev), desc);
7982 "(but not more than %d KB/sec) for %s.\n",
7983 speed_max(mddev), desc);
7984 8008
7985 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8009 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7986 8010
@@ -7997,16 +8021,15 @@ void md_do_sync(struct md_thread *thread)
7997 * Tune reconstruction: 8021 * Tune reconstruction:
7998 */ 8022 */
7999 window = 32*(PAGE_SIZE/512); 8023 window = 32*(PAGE_SIZE/512);
8000 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", 8024 pr_debug("md: using %dk window, over a total of %lluk.\n",
8001 window/2, (unsigned long long)max_sectors/2); 8025 window/2, (unsigned long long)max_sectors/2);
8002 8026
8003 atomic_set(&mddev->recovery_active, 0); 8027 atomic_set(&mddev->recovery_active, 0);
8004 last_check = 0; 8028 last_check = 0;
8005 8029
8006 if (j>2) { 8030 if (j>2) {
8007 printk(KERN_INFO 8031 pr_debug("md: resuming %s of %s from checkpoint.\n",
8008 "md: resuming %s of %s from checkpoint.\n", 8032 desc, mdname(mddev));
8009 desc, mdname(mddev));
8010 mddev->curr_resync = j; 8033 mddev->curr_resync = j;
8011 } else 8034 } else
8012 mddev->curr_resync = 3; /* no longer delayed */ 8035 mddev->curr_resync = 3; /* no longer delayed */
@@ -8038,7 +8061,7 @@ void md_do_sync(struct md_thread *thread)
8038 j > mddev->recovery_cp) 8061 j > mddev->recovery_cp)
8039 mddev->recovery_cp = j; 8062 mddev->recovery_cp = j;
8040 update_time = jiffies; 8063 update_time = jiffies;
8041 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 8064 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8042 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8065 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8043 } 8066 }
8044 8067
@@ -8133,9 +8156,9 @@ void md_do_sync(struct md_thread *thread)
8133 } 8156 }
8134 } 8157 }
8135 } 8158 }
8136 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, 8159 pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
8137 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 8160 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
8138 ? "interrupted" : "done"); 8161 ? "interrupted" : "done");
8139 /* 8162 /*
8140 * this also signals 'finished resyncing' to md_stop 8163 * this also signals 'finished resyncing' to md_stop
8141 */ 8164 */
@@ -8155,9 +8178,8 @@ void md_do_sync(struct md_thread *thread)
8155 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8178 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8156 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8179 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8157 if (mddev->curr_resync >= mddev->recovery_cp) { 8180 if (mddev->curr_resync >= mddev->recovery_cp) {
8158 printk(KERN_INFO 8181 pr_debug("md: checkpointing %s of %s.\n",
8159 "md: checkpointing %s of %s.\n", 8182 desc, mdname(mddev));
8160 desc, mdname(mddev));
8161 if (test_bit(MD_RECOVERY_ERROR, 8183 if (test_bit(MD_RECOVERY_ERROR,
8162 &mddev->recovery)) 8184 &mddev->recovery))
8163 mddev->recovery_cp = 8185 mddev->recovery_cp =
@@ -8187,8 +8209,8 @@ void md_do_sync(struct md_thread *thread)
8187 /* set CHANGE_PENDING here since maybe another update is needed, 8209 /* set CHANGE_PENDING here since maybe another update is needed,
8188 * so other nodes are informed. It should be harmless for normal 8210 * so other nodes are informed. It should be harmless for normal
8189 * raid */ 8211 * raid */
8190 set_mask_bits(&mddev->flags, 0, 8212 set_mask_bits(&mddev->sb_flags, 0,
8191 BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); 8213 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
8192 8214
8193 spin_lock(&mddev->lock); 8215 spin_lock(&mddev->lock);
8194 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8216 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8288,12 +8310,12 @@ static int remove_and_add_spares(struct mddev *mddev,
8288 if (!test_bit(Journal, &rdev->flags)) 8310 if (!test_bit(Journal, &rdev->flags))
8289 spares++; 8311 spares++;
8290 md_new_event(mddev); 8312 md_new_event(mddev);
8291 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8313 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8292 } 8314 }
8293 } 8315 }
8294no_add: 8316no_add:
8295 if (removed) 8317 if (removed)
8296 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8318 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8297 return spares; 8319 return spares;
8298} 8320}
8299 8321
@@ -8305,8 +8327,8 @@ static void md_start_sync(struct work_struct *ws)
8305 mddev, 8327 mddev,
8306 "resync"); 8328 "resync");
8307 if (!mddev->sync_thread) { 8329 if (!mddev->sync_thread) {
8308 printk(KERN_ERR "%s: could not start resync thread...\n", 8330 pr_warn("%s: could not start resync thread...\n",
8309 mdname(mddev)); 8331 mdname(mddev));
8310 /* leave the spares where they are, it shouldn't hurt */ 8332 /* leave the spares where they are, it shouldn't hurt */
8311 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8333 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8312 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8334 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -8356,8 +8378,8 @@ void md_check_recovery(struct mddev *mddev)
8356 8378
8357 if (signal_pending(current)) { 8379 if (signal_pending(current)) {
8358 if (mddev->pers->sync_request && !mddev->external) { 8380 if (mddev->pers->sync_request && !mddev->external) {
8359 printk(KERN_INFO "md: %s in immediate safe mode\n", 8381 pr_debug("md: %s in immediate safe mode\n",
8360 mdname(mddev)); 8382 mdname(mddev));
8361 mddev->safemode = 2; 8383 mddev->safemode = 2;
8362 } 8384 }
8363 flush_signals(current); 8385 flush_signals(current);
@@ -8366,7 +8388,7 @@ void md_check_recovery(struct mddev *mddev)
8366 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 8388 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8367 return; 8389 return;
8368 if ( ! ( 8390 if ( ! (
8369 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || 8391 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
8370 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8392 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8371 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8393 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8372 test_bit(MD_RELOAD_SB, &mddev->flags) || 8394 test_bit(MD_RELOAD_SB, &mddev->flags) ||
@@ -8404,7 +8426,7 @@ void md_check_recovery(struct mddev *mddev)
8404 md_reap_sync_thread(mddev); 8426 md_reap_sync_thread(mddev);
8405 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8427 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8406 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8428 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8407 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 8429 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8408 goto unlock; 8430 goto unlock;
8409 } 8431 }
8410 8432
@@ -8432,7 +8454,7 @@ void md_check_recovery(struct mddev *mddev)
8432 mddev->recovery_cp == MaxSector) { 8454 mddev->recovery_cp == MaxSector) {
8433 mddev->in_sync = 1; 8455 mddev->in_sync = 1;
8434 did_change = 1; 8456 did_change = 1;
8435 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 8457 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8436 } 8458 }
8437 if (mddev->safemode == 1) 8459 if (mddev->safemode == 1)
8438 mddev->safemode = 0; 8460 mddev->safemode = 0;
@@ -8441,7 +8463,7 @@ void md_check_recovery(struct mddev *mddev)
8441 sysfs_notify_dirent_safe(mddev->sysfs_state); 8463 sysfs_notify_dirent_safe(mddev->sysfs_state);
8442 } 8464 }
8443 8465
8444 if (mddev->flags & MD_UPDATE_SB_FLAGS) 8466 if (mddev->sb_flags)
8445 md_update_sb(mddev, 0); 8467 md_update_sb(mddev, 0);
8446 8468
8447 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 8469 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -8537,7 +8559,7 @@ void md_reap_sync_thread(struct mddev *mddev)
8537 if (mddev->pers->spare_active(mddev)) { 8559 if (mddev->pers->spare_active(mddev)) {
8538 sysfs_notify(&mddev->kobj, NULL, 8560 sysfs_notify(&mddev->kobj, NULL,
8539 "degraded"); 8561 "degraded");
8540 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8562 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8541 } 8563 }
8542 } 8564 }
8543 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8565 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -8552,7 +8574,7 @@ void md_reap_sync_thread(struct mddev *mddev)
8552 rdev->saved_raid_disk = -1; 8574 rdev->saved_raid_disk = -1;
8553 8575
8554 md_update_sb(mddev, 1); 8576 md_update_sb(mddev, 1);
8555 /* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can 8577 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
8556 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 8578 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
8557 * clustered raid */ 8579 * clustered raid */
8558 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 8580 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
@@ -8614,9 +8636,12 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8614 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 8636 rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
8615 if (rv == 0) { 8637 if (rv == 0) {
8616 /* Make sure they get written out promptly */ 8638 /* Make sure they get written out promptly */
8639 if (test_bit(ExternalBbl, &rdev->flags))
8640 sysfs_notify(&rdev->kobj, NULL,
8641 "unacknowledged_bad_blocks");
8617 sysfs_notify_dirent_safe(rdev->sysfs_state); 8642 sysfs_notify_dirent_safe(rdev->sysfs_state);
8618 set_mask_bits(&mddev->flags, 0, 8643 set_mask_bits(&mddev->sb_flags, 0,
8619 BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); 8644 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
8620 md_wakeup_thread(rdev->mddev->thread); 8645 md_wakeup_thread(rdev->mddev->thread);
8621 return 1; 8646 return 1;
8622 } else 8647 } else
@@ -8627,12 +8652,15 @@ EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8627int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8652int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8628 int is_new) 8653 int is_new)
8629{ 8654{
8655 int rv;
8630 if (is_new) 8656 if (is_new)
8631 s += rdev->new_data_offset; 8657 s += rdev->new_data_offset;
8632 else 8658 else
8633 s += rdev->data_offset; 8659 s += rdev->data_offset;
8634 return badblocks_clear(&rdev->badblocks, 8660 rv = badblocks_clear(&rdev->badblocks, s, sectors);
8635 s, sectors); 8661 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
8662 sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
8663 return rv;
8636} 8664}
8637EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8665EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8638 8666
@@ -8749,7 +8777,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
8749 rdev2->saved_raid_disk = role; 8777 rdev2->saved_raid_disk = role;
8750 ret = remove_and_add_spares(mddev, rdev2); 8778 ret = remove_and_add_spares(mddev, rdev2);
8751 pr_info("Activated spare: %s\n", 8779 pr_info("Activated spare: %s\n",
8752 bdevname(rdev2->bdev,b)); 8780 bdevname(rdev2->bdev,b));
8753 /* wakeup mddev->thread here, so array could 8781 /* wakeup mddev->thread here, so array could
8754 * perform resync with the new activated disk */ 8782 * perform resync with the new activated disk */
8755 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8783 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -8785,15 +8813,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
8785 * variable in case we err in the future 8813 * variable in case we err in the future
8786 */ 8814 */
8787 rdev->sb_page = NULL; 8815 rdev->sb_page = NULL;
8788 alloc_disk_sb(rdev); 8816 err = alloc_disk_sb(rdev);
8789 ClearPageUptodate(rdev->sb_page); 8817 if (err == 0) {
8790 rdev->sb_loaded = 0; 8818 ClearPageUptodate(rdev->sb_page);
8791 err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); 8819 rdev->sb_loaded = 0;
8792 8820 err = super_types[mddev->major_version].
8821 load_super(rdev, NULL, mddev->minor_version);
8822 }
8793 if (err < 0) { 8823 if (err < 0) {
8794 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 8824 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
8795 __func__, __LINE__, rdev->desc_nr, err); 8825 __func__, __LINE__, rdev->desc_nr, err);
8796 put_page(rdev->sb_page); 8826 if (rdev->sb_page)
8827 put_page(rdev->sb_page);
8797 rdev->sb_page = swapout; 8828 rdev->sb_page = swapout;
8798 rdev->sb_loaded = 1; 8829 rdev->sb_loaded = 1;
8799 return err; 8830 return err;
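The reworked read_rdev() error path only touches the superblock page once alloc_disk_sb() has succeeded, and restores the swapped-out page if the reload fails. A minimal userspace sketch of that check-then-restore flow, with illustrative names standing in for the md ones:

/* Sketch only: malloc()/free() stand in for alloc_disk_sb()/put_page(). */
#include <stdio.h>
#include <stdlib.h>

struct dev_model {
        void *sb_page;
};

static int reload_sb(struct dev_model *dev)
{
        void *swapout = dev->sb_page;   /* keep the old page around */
        int err = 0;

        dev->sb_page = malloc(4096);
        if (!dev->sb_page)
                err = -12;              /* stand-in for -ENOMEM */
        if (err == 0) {
                /* the real code reloads the superblock here */
        }
        if (err < 0) {
                free(dev->sb_page);     /* free(NULL) is a no-op */
                dev->sb_page = swapout; /* restore old values */
                return err;
        }
        free(swapout);
        return 0;
}

int main(void)
{
        struct dev_model dev = { .sb_page = NULL };

        printf("reload_sb: %d\n", reload_sb(&dev));
        free(dev.sb_page);
        return 0;
}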
@@ -8871,9 +8902,6 @@ void md_autodetect_dev(dev_t dev)
8871 mutex_lock(&detected_devices_mutex); 8902 mutex_lock(&detected_devices_mutex);
8872 list_add_tail(&node_detected_dev->list, &all_detected_devices); 8903 list_add_tail(&node_detected_dev->list, &all_detected_devices);
8873 mutex_unlock(&detected_devices_mutex); 8904 mutex_unlock(&detected_devices_mutex);
8874 } else {
8875 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8876 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8877 } 8905 }
8878} 8906}
8879 8907
@@ -8887,7 +8915,7 @@ static void autostart_arrays(int part)
8887 i_scanned = 0; 8915 i_scanned = 0;
8888 i_passed = 0; 8916 i_passed = 0;
8889 8917
8890 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 8918 pr_info("md: Autodetecting RAID arrays.\n");
8891 8919
8892 mutex_lock(&detected_devices_mutex); 8920 mutex_lock(&detected_devices_mutex);
8893 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 8921 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
@@ -8912,8 +8940,7 @@ static void autostart_arrays(int part)
8912 } 8940 }
8913 mutex_unlock(&detected_devices_mutex); 8941 mutex_unlock(&detected_devices_mutex);
8914 8942
8915 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 8943 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
8916 i_scanned, i_passed);
8917 8944
8918 autorun_devices(part); 8945 autorun_devices(part);
8919} 8946}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2b2041773e79..e38936d05df1 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -30,6 +30,16 @@
30#define MaxSector (~(sector_t)0) 30#define MaxSector (~(sector_t)0)
31 31
32/* 32/*
33 * These flags should really be called "NO_RETRY" rather than
34 * "FAILFAST" because they don't make any promise about time lapse,
35 * only about the number of retries, which will be zero.
36 * REQ_FAILFAST_DRIVER is not included because
37 * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
38 * seems to suggest that the errors it avoids retrying should usually
39 * be retried.
40 */
41#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
42/*
33 * MD's 'extended' device 43 * MD's 'extended' device
34 */ 44 */
35struct md_rdev { 45struct md_rdev {
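A standalone sketch of how a composite fail-fast mask like MD_FAILFAST is ORed into per-request op flags; the bit values below are placeholders, not the block layer's REQ_* values:

/* Placeholder bits; only shows the masking pattern, not real kernel flags. */
#include <stdio.h>

#define RQ_FAILFAST_DEV        (1U << 0)
#define RQ_FAILFAST_TRANSPORT  (1U << 1)
#define RQ_FAILFAST_DRIVER     (1U << 2)   /* deliberately left out of the mask */

#define MODEL_FAILFAST (RQ_FAILFAST_DEV | RQ_FAILFAST_TRANSPORT)

int main(void)
{
        unsigned int bi_opf = 0;

        /* a submission path opts in per request */
        bi_opf |= MODEL_FAILFAST;

        printf("failfast dev: %d, transport: %d, driver: %d\n",
               !!(bi_opf & RQ_FAILFAST_DEV),
               !!(bi_opf & RQ_FAILFAST_TRANSPORT),
               !!(bi_opf & RQ_FAILFAST_DRIVER));
        return 0;
}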
@@ -168,6 +178,19 @@ enum flag_bits {
168 * so it is safe to remove without 178 * so it is safe to remove without
169 * another synchronize_rcu() call. 179 * another synchronize_rcu() call.
170 */ 180 */
181 ExternalBbl, /* External metadata provides bad
182 * block management for a disk
183 */
184 FailFast, /* Minimal retries should be attempted on
185 * this device, so use REQ_FAILFAST_DEV.
186 * Also don't try to repair failed reads.
 187 * It is expected that no bad block log
188 * is present.
189 */
190 LastDev, /* Seems to be the last working dev as
191 * it didn't fail, so don't use FailFast
192 * any more for metadata
193 */
171}; 194};
172 195
173static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, 196static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
189 int is_new); 212 int is_new);
190struct md_cluster_info; 213struct md_cluster_info;
191 214
215enum mddev_flags {
216 MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */
217 MD_CLOSING, /* If set, we are closing the array, do not open
218 * it then */
219 MD_JOURNAL_CLEAN, /* A raid with journal is already clean */
220 MD_HAS_JOURNAL, /* The raid array has journal feature set */
221 MD_RELOAD_SB, /* Reload the superblock because another node
222 * updated it.
223 */
224 MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
225 * already took resync lock, need to
226 * release the lock */
227 MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is
228 * supported as calls to md_error() will
229 * never cause the array to become failed.
230 */
231};
232
233enum mddev_sb_flags {
234 MD_SB_CHANGE_DEVS, /* Some device status has changed */
235 MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */
236 MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */
237 MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
238};
239
192struct mddev { 240struct mddev {
193 void *private; 241 void *private;
194 struct md_personality *pers; 242 struct md_personality *pers;
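Splitting superblock-update state out of mddev->flags into mddev->sb_flags means "does the superblock need writing?" becomes a plain non-zero test, which is what the md_check_recovery() hunk earlier relies on. A small model of that split, with ordinary bit operations standing in for the kernel's atomic set_bit()/test_bit():

/* Model only: plain bit ops, not the kernel's atomic bitops. */
#include <stdio.h>

enum model_sb_flags { SB_CHANGE_DEVS, SB_CHANGE_CLEAN,
                      SB_CHANGE_PENDING, SB_NEED_REWRITE };

struct model_mddev {
        unsigned long flags;     /* array-wide state (closing, journal, ...) */
        unsigned long sb_flags;  /* superblock-update state only */
};

static void set_flag(unsigned long *word, int nr)  { *word |= 1UL << nr; }
static int  test_flag(unsigned long word, int nr)  { return !!(word & (1UL << nr)); }

int main(void)
{
        struct model_mddev mddev = { 0, 0 };

        set_flag(&mddev.sb_flags, SB_CHANGE_DEVS);

        if (mddev.sb_flags)      /* was: flags & MD_UPDATE_SB_FLAGS */
                printf("would call md_update_sb()\n");

        printf("SB_CHANGE_PENDING=%d\n",
               test_flag(mddev.sb_flags, SB_CHANGE_PENDING));
        return 0;
}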
@@ -196,21 +244,7 @@ struct mddev {
196 int md_minor; 244 int md_minor;
197 struct list_head disks; 245 struct list_head disks;
198 unsigned long flags; 246 unsigned long flags;
199#define MD_CHANGE_DEVS 0 /* Some device status has changed */ 247 unsigned long sb_flags;
200#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
201#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
202#define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */
203#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */
204#define MD_CLOSING 4 /* If set, we are closing the array, do not open
205 * it then */
206#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
207#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
208#define MD_RELOAD_SB 7 /* Reload the superblock because another node
209 * updated it.
210 */
211#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
212 * already took resync lock, need to
213 * release the lock */
214 248
215 int suspended; 249 int suspended;
216 atomic_t active_io; 250 atomic_t active_io;
@@ -304,31 +338,6 @@ struct mddev {
304 int parallel_resync; 338 int parallel_resync;
305 339
306 int ok_start_degraded; 340 int ok_start_degraded;
307 /* recovery/resync flags
308 * NEEDED: we might need to start a resync/recover
309 * RUNNING: a thread is running, or about to be started
310 * SYNC: actually doing a resync, not a recovery
311 * RECOVER: doing recovery, or need to try it.
312 * INTR: resync needs to be aborted for some reason
313 * DONE: thread is done and is waiting to be reaped
314 * REQUEST: user-space has requested a sync (used with SYNC)
315 * CHECK: user-space request for check-only, no repair
316 * RESHAPE: A reshape is happening
317 * ERROR: sync-action interrupted because io-error
318 *
319 * If neither SYNC or RESHAPE are set, then it is a recovery.
320 */
321#define MD_RECOVERY_RUNNING 0
322#define MD_RECOVERY_SYNC 1
323#define MD_RECOVERY_RECOVER 2
324#define MD_RECOVERY_INTR 3
325#define MD_RECOVERY_DONE 4
326#define MD_RECOVERY_NEEDED 5
327#define MD_RECOVERY_REQUESTED 6
328#define MD_RECOVERY_CHECK 7
329#define MD_RECOVERY_RESHAPE 8
330#define MD_RECOVERY_FROZEN 9
331#define MD_RECOVERY_ERROR 10
332 341
333 unsigned long recovery; 342 unsigned long recovery;
334 /* If a RAID personality determines that recovery (of a particular 343 /* If a RAID personality determines that recovery (of a particular
@@ -442,6 +451,23 @@ struct mddev {
442 unsigned int good_device_nr; /* good device num within cluster raid */ 451 unsigned int good_device_nr; /* good device num within cluster raid */
443}; 452};
444 453
454enum recovery_flags {
455 /*
 456 * If neither SYNC nor RESHAPE is set, then it is a recovery.
457 */
458 MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */
459 MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */
460 MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */
461 MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */
462 MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */
463 MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */
464 MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */
465 MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */
466 MD_RECOVERY_RESHAPE, /* A reshape is happening */
467 MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
468 MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */
469};
470
445static inline int __must_check mddev_lock(struct mddev *mddev) 471static inline int __must_check mddev_lock(struct mddev *mddev)
446{ 472{
447 return mutex_lock_interruptible(&mddev->reconfig_mutex); 473 return mutex_lock_interruptible(&mddev->reconfig_mutex);
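A loose sketch of the rule spelled out in the recovery_flags comment above, that an action with neither SYNC nor RESHAPE set is a recovery; the name mapping below is illustrative, not md's actual sysfs reporting code:

/* Illustrative interpretation of the recovery bits; simplified on purpose. */
#include <stdio.h>

enum model_recovery { R_RUNNING, R_SYNC, R_RECOVER, R_INTR, R_DONE,
                      R_NEEDED, R_REQUESTED, R_CHECK, R_RESHAPE,
                      R_FROZEN, R_ERROR };

static int test_bit_model(int nr, unsigned long word)
{
        return !!(word & (1UL << nr));
}

static const char *action_name(unsigned long recovery)
{
        if (test_bit_model(R_RESHAPE, recovery))
                return "reshape";
        if (test_bit_model(R_SYNC, recovery))
                return test_bit_model(R_CHECK, recovery) ? "check" : "resync";
        return "recovery";      /* neither SYNC nor RESHAPE set */
}

int main(void)
{
        unsigned long recovery = (1UL << R_RUNNING) | (1UL << R_SYNC);

        printf("action: %s\n", action_name(recovery));
        return 0;
}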
@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
623extern void md_flush_request(struct mddev *mddev, struct bio *bio); 649extern void md_flush_request(struct mddev *mddev, struct bio *bio);
624extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 650extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
625 sector_t sector, int size, struct page *page); 651 sector_t sector, int size, struct page *page);
626extern void md_super_wait(struct mddev *mddev); 652extern int md_super_wait(struct mddev *mddev);
627extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 653extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
628 struct page *page, int op, int op_flags, 654 struct page *page, int op, int op_flags,
629 bool metadata_op); 655 bool metadata_op);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4da06d813b8f..aa8c4e5c1ee2 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf)
52 } 52 }
53 rcu_read_unlock(); 53 rcu_read_unlock();
54 54
55 printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); 55 pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
56 return (-1); 56 return (-1);
57} 57}
58 58
@@ -97,9 +97,9 @@ static void multipath_end_request(struct bio *bio)
97 */ 97 */
98 char b[BDEVNAME_SIZE]; 98 char b[BDEVNAME_SIZE];
99 md_error (mp_bh->mddev, rdev); 99 md_error (mp_bh->mddev, rdev);
100 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 100 pr_info("multipath: %s: rescheduling sector %llu\n",
101 bdevname(rdev->bdev,b), 101 bdevname(rdev->bdev,b),
102 (unsigned long long)bio->bi_iter.bi_sector); 102 (unsigned long long)bio->bi_iter.bi_sector);
103 multipath_reschedule_retry(mp_bh); 103 multipath_reschedule_retry(mp_bh);
104 } else 104 } else
105 multipath_end_bh_io(mp_bh, bio->bi_error); 105 multipath_end_bh_io(mp_bh, bio->bi_error);
@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
194 * first check if this is a queued request for a device 194 * first check if this is a queued request for a device
195 * which has just failed. 195 * which has just failed.
196 */ 196 */
197 printk(KERN_ALERT 197 pr_warn("multipath: only one IO path left and IO error.\n");
198 "multipath: only one IO path left and IO error.\n");
199 /* leave it active... it's all we have */ 198 /* leave it active... it's all we have */
200 return; 199 return;
201 } 200 }
@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
209 spin_unlock_irqrestore(&conf->device_lock, flags); 208 spin_unlock_irqrestore(&conf->device_lock, flags);
210 } 209 }
211 set_bit(Faulty, &rdev->flags); 210 set_bit(Faulty, &rdev->flags);
212 set_bit(MD_CHANGE_DEVS, &mddev->flags); 211 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
213 printk(KERN_ALERT "multipath: IO failure on %s," 212 pr_err("multipath: IO failure on %s, disabling IO path.\n"
214 " disabling IO path.\n" 213 "multipath: Operation continuing on %d IO paths.\n",
215 "multipath: Operation continuing"
216 " on %d IO paths.\n",
217 bdevname(rdev->bdev, b), 214 bdevname(rdev->bdev, b),
218 conf->raid_disks - mddev->degraded); 215 conf->raid_disks - mddev->degraded);
219} 216}
@@ -223,21 +220,21 @@ static void print_multipath_conf (struct mpconf *conf)
223 int i; 220 int i;
224 struct multipath_info *tmp; 221 struct multipath_info *tmp;
225 222
226 printk("MULTIPATH conf printout:\n"); 223 pr_debug("MULTIPATH conf printout:\n");
227 if (!conf) { 224 if (!conf) {
228 printk("(conf==NULL)\n"); 225 pr_debug("(conf==NULL)\n");
229 return; 226 return;
230 } 227 }
231 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 228 pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
232 conf->raid_disks); 229 conf->raid_disks);
233 230
234 for (i = 0; i < conf->raid_disks; i++) { 231 for (i = 0; i < conf->raid_disks; i++) {
235 char b[BDEVNAME_SIZE]; 232 char b[BDEVNAME_SIZE];
236 tmp = conf->multipaths + i; 233 tmp = conf->multipaths + i;
237 if (tmp->rdev) 234 if (tmp->rdev)
238 printk(" disk%d, o:%d, dev:%s\n", 235 pr_debug(" disk%d, o:%d, dev:%s\n",
239 i,!test_bit(Faulty, &tmp->rdev->flags), 236 i,!test_bit(Faulty, &tmp->rdev->flags),
240 bdevname(tmp->rdev->bdev,b)); 237 bdevname(tmp->rdev->bdev,b));
241 } 238 }
242} 239}
243 240
@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
292 if (rdev == p->rdev) { 289 if (rdev == p->rdev) {
293 if (test_bit(In_sync, &rdev->flags) || 290 if (test_bit(In_sync, &rdev->flags) ||
294 atomic_read(&rdev->nr_pending)) { 291 atomic_read(&rdev->nr_pending)) {
295 printk(KERN_ERR "hot-remove-disk, slot %d is identified" 292 pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
296 " but is still operational!\n", number);
297 err = -EBUSY; 293 err = -EBUSY;
298 goto abort; 294 goto abort;
299 } 295 }
@@ -346,16 +342,14 @@ static void multipathd(struct md_thread *thread)
346 bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; 342 bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
347 343
348 if ((mp_bh->path = multipath_map (conf))<0) { 344 if ((mp_bh->path = multipath_map (conf))<0) {
349 printk(KERN_ALERT "multipath: %s: unrecoverable IO read" 345 pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
350 " error for block %llu\n", 346 bdevname(bio->bi_bdev,b),
351 bdevname(bio->bi_bdev,b), 347 (unsigned long long)bio->bi_iter.bi_sector);
352 (unsigned long long)bio->bi_iter.bi_sector);
353 multipath_end_bh_io(mp_bh, -EIO); 348 multipath_end_bh_io(mp_bh, -EIO);
354 } else { 349 } else {
355 printk(KERN_ERR "multipath: %s: redirecting sector %llu" 350 pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
356 " to another IO path\n", 351 bdevname(bio->bi_bdev,b),
357 bdevname(bio->bi_bdev,b), 352 (unsigned long long)bio->bi_iter.bi_sector);
358 (unsigned long long)bio->bi_iter.bi_sector);
359 *bio = *(mp_bh->master_bio); 353 *bio = *(mp_bh->master_bio);
360 bio->bi_iter.bi_sector += 354 bio->bi_iter.bi_sector +=
361 conf->multipaths[mp_bh->path].rdev->data_offset; 355 conf->multipaths[mp_bh->path].rdev->data_offset;
@@ -389,8 +383,8 @@ static int multipath_run (struct mddev *mddev)
389 return -EINVAL; 383 return -EINVAL;
390 384
391 if (mddev->level != LEVEL_MULTIPATH) { 385 if (mddev->level != LEVEL_MULTIPATH) {
392 printk("multipath: %s: raid level not set to multipath IO (%d)\n", 386 pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
393 mdname(mddev), mddev->level); 387 mdname(mddev), mddev->level);
394 goto out; 388 goto out;
395 } 389 }
396 /* 390 /*
@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev)
401 395
402 conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); 396 conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
403 mddev->private = conf; 397 mddev->private = conf;
404 if (!conf) { 398 if (!conf)
405 printk(KERN_ERR
406 "multipath: couldn't allocate memory for %s\n",
407 mdname(mddev));
408 goto out; 399 goto out;
409 }
410 400
411 conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, 401 conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
412 GFP_KERNEL); 402 GFP_KERNEL);
413 if (!conf->multipaths) { 403 if (!conf->multipaths)
414 printk(KERN_ERR
415 "multipath: couldn't allocate memory for %s\n",
416 mdname(mddev));
417 goto out_free_conf; 404 goto out_free_conf;
418 }
419 405
420 working_disks = 0; 406 working_disks = 0;
421 rdev_for_each(rdev, mddev) { 407 rdev_for_each(rdev, mddev) {
@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev)
439 INIT_LIST_HEAD(&conf->retry_list); 425 INIT_LIST_HEAD(&conf->retry_list);
440 426
441 if (!working_disks) { 427 if (!working_disks) {
442 printk(KERN_ERR "multipath: no operational IO paths for %s\n", 428 pr_warn("multipath: no operational IO paths for %s\n",
443 mdname(mddev)); 429 mdname(mddev));
444 goto out_free_conf; 430 goto out_free_conf;
445 } 431 }
@@ -447,27 +433,17 @@ static int multipath_run (struct mddev *mddev)
447 433
448 conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, 434 conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
449 sizeof(struct multipath_bh)); 435 sizeof(struct multipath_bh));
450 if (conf->pool == NULL) { 436 if (conf->pool == NULL)
451 printk(KERN_ERR
452 "multipath: couldn't allocate memory for %s\n",
453 mdname(mddev));
454 goto out_free_conf; 437 goto out_free_conf;
455 }
456 438
457 { 439 mddev->thread = md_register_thread(multipathd, mddev,
458 mddev->thread = md_register_thread(multipathd, mddev, 440 "multipath");
459 "multipath"); 441 if (!mddev->thread)
460 if (!mddev->thread) { 442 goto out_free_conf;
461 printk(KERN_ERR "multipath: couldn't allocate thread"
462 " for %s\n", mdname(mddev));
463 goto out_free_conf;
464 }
465 }
466 443
467 printk(KERN_INFO 444 pr_info("multipath: array %s active with %d out of %d IO paths\n",
468 "multipath: array %s active with %d out of %d IO paths\n",
469 mdname(mddev), conf->raid_disks - mddev->degraded, 445 mdname(mddev), conf->raid_disks - mddev->degraded,
470 mddev->raid_disks); 446 mddev->raid_disks);
471 /* 447 /*
472 * Ok, everything is just fine now 448 * Ok, everything is just fine now
473 */ 449 */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 258986a2699d..a162fedeb51a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -21,6 +21,7 @@
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <trace/events/block.h>
24#include "md.h" 25#include "md.h"
25#include "raid0.h" 26#include "raid0.h"
26#include "raid5.h" 27#include "raid5.h"
@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev)
51 char b[BDEVNAME_SIZE]; 52 char b[BDEVNAME_SIZE];
52 struct r0conf *conf = mddev->private; 53 struct r0conf *conf = mddev->private;
53 int raid_disks = conf->strip_zone[0].nb_dev; 54 int raid_disks = conf->strip_zone[0].nb_dev;
54 printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", 55 pr_debug("md: RAID0 configuration for %s - %d zone%s\n",
55 mdname(mddev), 56 mdname(mddev),
56 conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); 57 conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
57 for (j = 0; j < conf->nr_strip_zones; j++) { 58 for (j = 0; j < conf->nr_strip_zones; j++) {
58 printk(KERN_INFO "md: zone%d=[", j); 59 char line[200];
60 int len = 0;
61
59 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 62 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
60 printk(KERN_CONT "%s%s", k?"/":"", 63 len += snprintf(line+len, 200-len, "%s%s", k?"/":"",
61 bdevname(conf->devlist[j*raid_disks 64 bdevname(conf->devlist[j*raid_disks
62 + k]->bdev, b)); 65 + k]->bdev, b));
63 printk(KERN_CONT "]\n"); 66 pr_debug("md: zone%d=[%s]\n", j, line);
64 67
65 zone_size = conf->strip_zone[j].zone_end - zone_start; 68 zone_size = conf->strip_zone[j].zone_end - zone_start;
66 printk(KERN_INFO " zone-offset=%10lluKB, " 69 pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
67 "device-offset=%10lluKB, size=%10lluKB\n",
68 (unsigned long long)zone_start>>1, 70 (unsigned long long)zone_start>>1,
69 (unsigned long long)conf->strip_zone[j].dev_start>>1, 71 (unsigned long long)conf->strip_zone[j].dev_start>>1,
70 (unsigned long long)zone_size>>1); 72 (unsigned long long)zone_size>>1);
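dump_zones() now builds each zone's device list into a local buffer with snprintf() and emits it as one pr_debug() line instead of a chain of KERN_CONT fragments. A standalone sketch of that line-building pattern:

/* Sketch of accumulating short names into one buffer, then logging once.
 * The example names comfortably fit the buffer. */
#include <stdio.h>

int main(void)
{
        const char *devs[] = { "sda", "sdb", "sdc" };
        char line[200];
        int len = 0;
        unsigned int k;

        for (k = 0; k < sizeof(devs) / sizeof(devs[0]); k++)
                len += snprintf(line + len, sizeof(line) - len, "%s%s",
                                k ? "/" : "", devs[k]);
        printf("md: zone0=[%s]\n", line);
        return 0;
}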
@@ -142,9 +144,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
142 * chunk size is a multiple of that sector size 144 * chunk size is a multiple of that sector size
143 */ 145 */
144 if ((mddev->chunk_sectors << 9) % blksize) { 146 if ((mddev->chunk_sectors << 9) % blksize) {
145 printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", 147 pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
146 mdname(mddev), 148 mdname(mddev),
147 mddev->chunk_sectors << 9, blksize); 149 mddev->chunk_sectors << 9, blksize);
148 err = -EINVAL; 150 err = -EINVAL;
149 goto abort; 151 goto abort;
150 } 152 }
@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
186 } 188 }
187 189
188 if (j < 0) { 190 if (j < 0) {
189 printk(KERN_ERR 191 pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n",
190 "md/raid0:%s: remove inactive devices before converting to RAID0\n", 192 mdname(mddev));
191 mdname(mddev));
192 goto abort; 193 goto abort;
193 } 194 }
194 if (j >= mddev->raid_disks) { 195 if (j >= mddev->raid_disks) {
195 printk(KERN_ERR "md/raid0:%s: bad disk number %d - " 196 pr_warn("md/raid0:%s: bad disk number %d - aborting!\n",
196 "aborting!\n", mdname(mddev), j); 197 mdname(mddev), j);
197 goto abort; 198 goto abort;
198 } 199 }
199 if (dev[j]) { 200 if (dev[j]) {
200 printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " 201 pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n",
201 "aborting!\n", mdname(mddev), j); 202 mdname(mddev), j);
202 goto abort; 203 goto abort;
203 } 204 }
204 dev[j] = rdev1; 205 dev[j] = rdev1;
@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
208 cnt++; 209 cnt++;
209 } 210 }
210 if (cnt != mddev->raid_disks) { 211 if (cnt != mddev->raid_disks) {
211 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " 212 pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n",
212 "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); 213 mdname(mddev), cnt, mddev->raid_disks);
213 goto abort; 214 goto abort;
214 } 215 }
215 zone->nb_dev = cnt; 216 zone->nb_dev = cnt;
@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev)
357 int ret; 358 int ret;
358 359
359 if (mddev->chunk_sectors == 0) { 360 if (mddev->chunk_sectors == 0) {
360 printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", 361 pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev));
361 mdname(mddev));
362 return -EINVAL; 362 return -EINVAL;
363 } 363 }
364 if (md_check_no_bitmap(mddev)) 364 if (md_check_no_bitmap(mddev))
@@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev)
399 /* calculate array device size */ 399 /* calculate array device size */
400 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); 400 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
401 401
402 printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", 402 pr_debug("md/raid0:%s: md_size is %llu sectors.\n",
403 mdname(mddev), 403 mdname(mddev),
404 (unsigned long long)mddev->array_sectors); 404 (unsigned long long)mddev->array_sectors);
405 405
406 if (mddev->queue) { 406 if (mddev->queue) {
407 /* calculate the max read-ahead size. 407 /* calculate the max read-ahead size.
@@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
464 } 464 }
465 465
466 do { 466 do {
467 sector_t sector = bio->bi_iter.bi_sector; 467 sector_t bio_sector = bio->bi_iter.bi_sector;
468 sector_t sector = bio_sector;
468 unsigned chunk_sects = mddev->chunk_sectors; 469 unsigned chunk_sects = mddev->chunk_sectors;
469 470
470 unsigned sectors = chunk_sects - 471 unsigned sectors = chunk_sects -
@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
473 : sector_div(sector, chunk_sects)); 474 : sector_div(sector, chunk_sects));
474 475
475 /* Restore due to sector_div */ 476 /* Restore due to sector_div */
476 sector = bio->bi_iter.bi_sector; 477 sector = bio_sector;
477 478
478 if (sectors < bio_sectors(bio)) { 479 if (sectors < bio_sectors(bio)) {
479 split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); 480 split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
492 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { 493 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
493 /* Just ignore it */ 494 /* Just ignore it */
494 bio_endio(split); 495 bio_endio(split);
495 } else 496 } else {
497 if (mddev->gendisk)
498 trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
499 split, disk_devt(mddev->gendisk),
500 bio_sector);
496 generic_make_request(split); 501 generic_make_request(split);
502 }
497 } while (split != bio); 503 } while (split != bio);
498} 504}
499 505
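The raid0_make_request() hunks snapshot the original sector before sector_div(), because that helper divides its argument in place and returns only the remainder; the saved value is then restored and handed to the remap tracepoint. A small model (sector_div() is imitated here, not the kernel macro):

/* sector_div_model() mimics the divide-in-place, return-remainder shape. */
#include <stdio.h>

typedef unsigned long long sector_t;

static unsigned int sector_div_model(sector_t *sector, unsigned int div)
{
        unsigned int rem = (unsigned int)(*sector % div);

        *sector /= div;         /* modifies the caller's value */
        return rem;
}

int main(void)
{
        sector_t bio_sector = 1000003;          /* original request sector */
        sector_t sector = bio_sector;           /* scratch copy for the division */
        unsigned int chunk_sects = 128;
        unsigned int sectors;

        sectors = chunk_sects - sector_div_model(&sector, chunk_sects);
        sector = bio_sector;                    /* restore due to sector_div */

        printf("sectors to end of chunk: %u, sector restored to %llu\n",
               sectors, sector);
        return 0;
}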
@@ -509,17 +515,17 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
509 struct r0conf *priv_conf; 515 struct r0conf *priv_conf;
510 516
511 if (mddev->degraded != 1) { 517 if (mddev->degraded != 1) {
512 printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", 518 pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
513 mdname(mddev), 519 mdname(mddev),
514 mddev->degraded); 520 mddev->degraded);
515 return ERR_PTR(-EINVAL); 521 return ERR_PTR(-EINVAL);
516 } 522 }
517 523
518 rdev_for_each(rdev, mddev) { 524 rdev_for_each(rdev, mddev) {
519 /* check slot number for a disk */ 525 /* check slot number for a disk */
520 if (rdev->raid_disk == mddev->raid_disks-1) { 526 if (rdev->raid_disk == mddev->raid_disks-1) {
521 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 527 pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n",
522 mdname(mddev)); 528 mdname(mddev));
523 return ERR_PTR(-EINVAL); 529 return ERR_PTR(-EINVAL);
524 } 530 }
525 rdev->sectors = mddev->dev_sectors; 531 rdev->sectors = mddev->dev_sectors;
@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
533 mddev->delta_disks = -1; 539 mddev->delta_disks = -1;
534 /* make sure it will be not marked as dirty */ 540 /* make sure it will be not marked as dirty */
535 mddev->recovery_cp = MaxSector; 541 mddev->recovery_cp = MaxSector;
542 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
543 clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
536 544
537 create_strip_zones(mddev, &priv_conf); 545 create_strip_zones(mddev, &priv_conf);
546
538 return priv_conf; 547 return priv_conf;
539} 548}
540 549
@@ -549,19 +558,19 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
549 * - all mirrors must be already degraded 558 * - all mirrors must be already degraded
550 */ 559 */
551 if (mddev->layout != ((1 << 8) + 2)) { 560 if (mddev->layout != ((1 << 8) + 2)) {
552 printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", 561 pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
553 mdname(mddev), 562 mdname(mddev),
554 mddev->layout); 563 mddev->layout);
555 return ERR_PTR(-EINVAL); 564 return ERR_PTR(-EINVAL);
556 } 565 }
557 if (mddev->raid_disks & 1) { 566 if (mddev->raid_disks & 1) {
558 printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", 567 pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
559 mdname(mddev)); 568 mdname(mddev));
560 return ERR_PTR(-EINVAL); 569 return ERR_PTR(-EINVAL);
561 } 570 }
562 if (mddev->degraded != (mddev->raid_disks>>1)) { 571 if (mddev->degraded != (mddev->raid_disks>>1)) {
563 printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", 572 pr_warn("md/raid0:%s: All mirrors must be already degraded!\n",
564 mdname(mddev)); 573 mdname(mddev));
565 return ERR_PTR(-EINVAL); 574 return ERR_PTR(-EINVAL);
566 } 575 }
567 576
@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
574 mddev->degraded = 0; 583 mddev->degraded = 0;
575 /* make sure it will be not marked as dirty */ 584 /* make sure it will be not marked as dirty */
576 mddev->recovery_cp = MaxSector; 585 mddev->recovery_cp = MaxSector;
586 clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
577 587
578 create_strip_zones(mddev, &priv_conf); 588 create_strip_zones(mddev, &priv_conf);
579 return priv_conf; 589 return priv_conf;
@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
588 * - (N - 1) mirror drives must be already faulty 598 * - (N - 1) mirror drives must be already faulty
589 */ 599 */
590 if ((mddev->raid_disks - 1) != mddev->degraded) { 600 if ((mddev->raid_disks - 1) != mddev->degraded) {
591 printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", 601 pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
592 mdname(mddev)); 602 mdname(mddev));
593 return ERR_PTR(-EINVAL); 603 return ERR_PTR(-EINVAL);
594 } 604 }
@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
616 mddev->raid_disks = 1; 626 mddev->raid_disks = 1;
617 /* make sure it will be not marked as dirty */ 627 /* make sure it will be not marked as dirty */
618 mddev->recovery_cp = MaxSector; 628 mddev->recovery_cp = MaxSector;
629 clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
619 630
620 create_strip_zones(mddev, &priv_conf); 631 create_strip_zones(mddev, &priv_conf);
621 return priv_conf; 632 return priv_conf;
@@ -631,8 +642,8 @@ static void *raid0_takeover(struct mddev *mddev)
631 */ 642 */
632 643
633 if (mddev->bitmap) { 644 if (mddev->bitmap) {
634 printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n", 645 pr_warn("md/raid0: %s: cannot takeover array with bitmap\n",
635 mdname(mddev)); 646 mdname(mddev));
636 return ERR_PTR(-EBUSY); 647 return ERR_PTR(-EBUSY);
637 } 648 }
638 if (mddev->level == 4) 649 if (mddev->level == 4)
@@ -642,8 +653,8 @@ static void *raid0_takeover(struct mddev *mddev)
642 if (mddev->layout == ALGORITHM_PARITY_N) 653 if (mddev->layout == ALGORITHM_PARITY_N)
643 return raid0_takeover_raid45(mddev); 654 return raid0_takeover_raid45(mddev);
644 655
645 printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", 656 pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
646 mdname(mddev), ALGORITHM_PARITY_N); 657 mdname(mddev), ALGORITHM_PARITY_N);
647 } 658 }
648 659
649 if (mddev->level == 10) 660 if (mddev->level == 10)
@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev)
652 if (mddev->level == 1) 663 if (mddev->level == 1)
653 return raid0_takeover_raid1(mddev); 664 return raid0_takeover_raid1(mddev);
654 665
655 printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", 666 pr_warn("Takeover from raid%i to raid0 not supported\n",
656 mddev->level); 667 mddev->level);
657 668
658 return ERR_PTR(-EINVAL); 669 return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 29e2df5cd77b..a1f3fbed9100 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,6 +37,7 @@
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <trace/events/block.h>
40#include "md.h" 41#include "md.h"
41#include "raid1.h" 42#include "raid1.h"
42#include "bitmap.h" 43#include "bitmap.h"
@@ -70,6 +71,9 @@ static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
70 sector_t bi_sector); 71 sector_t bi_sector);
71static void lower_barrier(struct r1conf *conf); 72static void lower_barrier(struct r1conf *conf);
72 73
74#define raid1_log(md, fmt, args...) \
75 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
76
73static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 77static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
74{ 78{
75 struct pool_info *pi = data; 79 struct pool_info *pi = data;
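raid1_log() above is a guard-and-forward macro: it only emits a trace message when the request queue exists. A standalone sketch of the same do { } while (0) shape, with printf() standing in for blk_add_trace_msg() (GNU named-varargs syntax, as in the kernel macro):

/* Sketch of a guarded, printf-style logging macro. */
#include <stdio.h>

struct model_md {
        void *queue;            /* NULL means "no trace queue, stay silent" */
};

#define model_log(md, fmt, args...) \
        do { if ((md)->queue) printf("raid1 " fmt "\n", ##args); } while (0)

int main(void)
{
        struct model_md quiet = { NULL };
        struct model_md noisy = { (void *)1 };

        model_log(&quiet, "wait barrier");          /* suppressed */
        model_log(&noisy, "wait rdev %d blocked", 3);
        return 0;
}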
@@ -325,6 +329,11 @@ static void raid1_end_read_request(struct bio *bio)
325 329
326 if (uptodate) 330 if (uptodate)
327 set_bit(R1BIO_Uptodate, &r1_bio->state); 331 set_bit(R1BIO_Uptodate, &r1_bio->state);
332 else if (test_bit(FailFast, &rdev->flags) &&
333 test_bit(R1BIO_FailFast, &r1_bio->state))
334 /* This was a fail-fast read so we definitely
335 * want to retry */
336 ;
328 else { 337 else {
329 /* If all other devices have failed, we want to return 338 /* If all other devices have failed, we want to return
330 * the error upwards rather than fail the last device. 339 * the error upwards rather than fail the last device.
@@ -347,13 +356,10 @@ static void raid1_end_read_request(struct bio *bio)
347 * oops, read error: 356 * oops, read error:
348 */ 357 */
349 char b[BDEVNAME_SIZE]; 358 char b[BDEVNAME_SIZE];
350 printk_ratelimited( 359 pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n",
351 KERN_ERR "md/raid1:%s: %s: " 360 mdname(conf->mddev),
352 "rescheduling sector %llu\n", 361 bdevname(rdev->bdev, b),
353 mdname(conf->mddev), 362 (unsigned long long)r1_bio->sector);
354 bdevname(rdev->bdev,
355 b),
356 (unsigned long long)r1_bio->sector);
357 set_bit(R1BIO_ReadError, &r1_bio->state); 363 set_bit(R1BIO_ReadError, &r1_bio->state);
358 reschedule_retry(r1_bio); 364 reschedule_retry(r1_bio);
359 /* don't drop the reference on read_disk yet */ 365 /* don't drop the reference on read_disk yet */
@@ -416,7 +422,24 @@ static void raid1_end_write_request(struct bio *bio)
416 set_bit(MD_RECOVERY_NEEDED, & 422 set_bit(MD_RECOVERY_NEEDED, &
417 conf->mddev->recovery); 423 conf->mddev->recovery);
418 424
419 set_bit(R1BIO_WriteError, &r1_bio->state); 425 if (test_bit(FailFast, &rdev->flags) &&
426 (bio->bi_opf & MD_FAILFAST) &&
427 /* We never try FailFast to WriteMostly devices */
428 !test_bit(WriteMostly, &rdev->flags)) {
429 md_error(r1_bio->mddev, rdev);
430 if (!test_bit(Faulty, &rdev->flags))
431 /* This is the only remaining device,
432 * We need to retry the write without
433 * FailFast
434 */
435 set_bit(R1BIO_WriteError, &r1_bio->state);
436 else {
437 /* Finished with this branch */
438 r1_bio->bios[mirror] = NULL;
439 to_put = bio;
440 }
441 } else
442 set_bit(R1BIO_WriteError, &r1_bio->state);
420 } else { 443 } else {
421 /* 444 /*
422 * Set R1BIO_Uptodate in our master bio, so that we 445 * Set R1BIO_Uptodate in our master bio, so that we
@@ -534,6 +557,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
534 best_good_sectors = 0; 557 best_good_sectors = 0;
535 has_nonrot_disk = 0; 558 has_nonrot_disk = 0;
536 choose_next_idle = 0; 559 choose_next_idle = 0;
560 clear_bit(R1BIO_FailFast, &r1_bio->state);
537 561
538 if ((conf->mddev->recovery_cp < this_sector + sectors) || 562 if ((conf->mddev->recovery_cp < this_sector + sectors) ||
539 (mddev_is_clustered(conf->mddev) && 563 (mddev_is_clustered(conf->mddev) &&
@@ -607,6 +631,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
607 } else 631 } else
608 best_good_sectors = sectors; 632 best_good_sectors = sectors;
609 633
634 if (best_disk >= 0)
635 /* At least two disks to choose from so failfast is OK */
636 set_bit(R1BIO_FailFast, &r1_bio->state);
637
610 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); 638 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
611 has_nonrot_disk |= nonrot; 639 has_nonrot_disk |= nonrot;
612 pending = atomic_read(&rdev->nr_pending); 640 pending = atomic_read(&rdev->nr_pending);
@@ -645,11 +673,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
645 } 673 }
646 break; 674 break;
647 } 675 }
648 /* If device is idle, use it */
649 if (pending == 0) {
650 best_disk = disk;
651 break;
652 }
653 676
654 if (choose_next_idle) 677 if (choose_next_idle)
655 continue; 678 continue;
@@ -672,7 +695,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 672 * mixed rotational/non-rotational disks depending on workload. 695
673 */ 696 */
674 if (best_disk == -1) { 697 if (best_disk == -1) {
675 if (has_nonrot_disk) 698 if (has_nonrot_disk || min_pending == 0)
676 best_disk = best_pending_disk; 699 best_disk = best_pending_disk;
677 else 700 else
678 best_disk = best_dist_disk; 701 best_disk = best_dist_disk;
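With the early "idle disk" break removed, an idle disk (min_pending == 0) now falls through to the same least-pending fallback used for non-rotational disks. A much-simplified model of that selection fallback, not the full read_balance() loop:

/* Simplified stand-in for the read_balance() fallback decision. */
#include <stdio.h>

struct model_disk { int pending; long long head_dist; };

static int pick_disk(const struct model_disk *d, int n, int has_nonrot_disk)
{
        int best_pending_disk = -1, best_dist_disk = -1;
        int min_pending = -1;
        long long best_dist = -1;
        int i;

        for (i = 0; i < n; i++) {
                if (min_pending < 0 || d[i].pending < min_pending) {
                        min_pending = d[i].pending;
                        best_pending_disk = i;
                }
                if (best_dist < 0 || d[i].head_dist < best_dist) {
                        best_dist = d[i].head_dist;
                        best_dist_disk = i;
                }
        }
        /* was: has_nonrot_disk only */
        return (has_nonrot_disk || min_pending == 0) ? best_pending_disk
                                                     : best_dist_disk;
}

int main(void)
{
        struct model_disk disks[] = { { 0, 900 }, { 4, 10 } };

        printf("chosen disk: %d\n", pick_disk(disks, 2, 0));
        return 0;
}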
@@ -745,9 +768,14 @@ static void flush_pending_writes(struct r1conf *conf)
745 768
746 while (bio) { /* submit pending writes */ 769 while (bio) { /* submit pending writes */
747 struct bio *next = bio->bi_next; 770 struct bio *next = bio->bi_next;
771 struct md_rdev *rdev = (void*)bio->bi_bdev;
748 bio->bi_next = NULL; 772 bio->bi_next = NULL;
749 if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 773 bio->bi_bdev = rdev->bdev;
750 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 774 if (test_bit(Faulty, &rdev->flags)) {
775 bio->bi_error = -EIO;
776 bio_endio(bio);
777 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
778 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
751 /* Just ignore it */ 779 /* Just ignore it */
752 bio_endio(bio); 780 bio_endio(bio);
753 else 781 else
@@ -832,7 +860,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
832 else if (conf->barrier && bio_data_dir(bio) == WRITE) { 860 else if (conf->barrier && bio_data_dir(bio) == WRITE) {
833 if ((conf->mddev->curr_resync_completed 861 if ((conf->mddev->curr_resync_completed
834 >= bio_end_sector(bio)) || 862 >= bio_end_sector(bio)) ||
835 (conf->next_resync + NEXT_NORMALIO_DISTANCE 863 (conf->start_next_window + NEXT_NORMALIO_DISTANCE
836 <= bio->bi_iter.bi_sector)) 864 <= bio->bi_iter.bi_sector))
837 wait = false; 865 wait = false;
838 else 866 else
@@ -858,6 +886,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
858 * that queue to allow conf->start_next_window 886 * that queue to allow conf->start_next_window
859 * to increase. 887 * to increase.
860 */ 888 */
889 raid1_log(conf->mddev, "wait barrier");
861 wait_event_lock_irq(conf->wait_barrier, 890 wait_event_lock_irq(conf->wait_barrier,
862 !conf->array_frozen && 891 !conf->array_frozen &&
863 (!conf->barrier || 892 (!conf->barrier ||
@@ -937,6 +966,7 @@ static void freeze_array(struct r1conf *conf, int extra)
937 */ 966 */
938 spin_lock_irq(&conf->resync_lock); 967 spin_lock_irq(&conf->resync_lock);
939 conf->array_frozen = 1; 968 conf->array_frozen = 1;
969 raid1_log(conf->mddev, "wait freeze");
940 wait_event_lock_irq_cmd(conf->wait_barrier, 970 wait_event_lock_irq_cmd(conf->wait_barrier,
941 conf->nr_pending == conf->nr_queued+extra, 971 conf->nr_pending == conf->nr_queued+extra,
942 conf->resync_lock, 972 conf->resync_lock,
@@ -1019,9 +1049,14 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
1019 1049
1020 while (bio) { /* submit pending writes */ 1050 while (bio) { /* submit pending writes */
1021 struct bio *next = bio->bi_next; 1051 struct bio *next = bio->bi_next;
1052 struct md_rdev *rdev = (void*)bio->bi_bdev;
1022 bio->bi_next = NULL; 1053 bio->bi_next = NULL;
1023 if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 1054 bio->bi_bdev = rdev->bdev;
1024 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 1055 if (test_bit(Faulty, &rdev->flags)) {
1056 bio->bi_error = -EIO;
1057 bio_endio(bio);
1058 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1059 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1025 /* Just ignore it */ 1060 /* Just ignore it */
1026 bio_endio(bio); 1061 bio_endio(bio);
1027 else 1062 else
@@ -1136,6 +1171,7 @@ read_again:
1136 * take care not to over-take any writes 1171 * take care not to over-take any writes
1137 * that are 'behind' 1172 * that are 'behind'
1138 */ 1173 */
1174 raid1_log(mddev, "wait behind writes");
1139 wait_event(bitmap->behind_wait, 1175 wait_event(bitmap->behind_wait,
1140 atomic_read(&bitmap->behind_writes) == 0); 1176 atomic_read(&bitmap->behind_writes) == 0);
1141 } 1177 }
@@ -1153,8 +1189,16 @@ read_again:
1153 read_bio->bi_bdev = mirror->rdev->bdev; 1189 read_bio->bi_bdev = mirror->rdev->bdev;
1154 read_bio->bi_end_io = raid1_end_read_request; 1190 read_bio->bi_end_io = raid1_end_read_request;
1155 bio_set_op_attrs(read_bio, op, do_sync); 1191 bio_set_op_attrs(read_bio, op, do_sync);
1192 if (test_bit(FailFast, &mirror->rdev->flags) &&
1193 test_bit(R1BIO_FailFast, &r1_bio->state))
1194 read_bio->bi_opf |= MD_FAILFAST;
1156 read_bio->bi_private = r1_bio; 1195 read_bio->bi_private = r1_bio;
1157 1196
1197 if (mddev->gendisk)
1198 trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
1199 read_bio, disk_devt(mddev->gendisk),
1200 r1_bio->sector);
1201
1158 if (max_sectors < r1_bio->sectors) { 1202 if (max_sectors < r1_bio->sectors) {
1159 /* could not read all from this device, so we will 1203 /* could not read all from this device, so we will
1160 * need another r1_bio. 1204 * need another r1_bio.
@@ -1195,6 +1239,7 @@ read_again:
1195 */ 1239 */
1196 if (conf->pending_count >= max_queued_requests) { 1240 if (conf->pending_count >= max_queued_requests) {
1197 md_wakeup_thread(mddev->thread); 1241 md_wakeup_thread(mddev->thread);
1242 raid1_log(mddev, "wait queued");
1198 wait_event(conf->wait_barrier, 1243 wait_event(conf->wait_barrier,
1199 conf->pending_count < max_queued_requests); 1244 conf->pending_count < max_queued_requests);
1200 } 1245 }
@@ -1286,6 +1331,7 @@ read_again:
1286 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1331 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1287 r1_bio->state = 0; 1332 r1_bio->state = 0;
1288 allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); 1333 allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
1334 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1289 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1335 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1290 start_next_window = wait_barrier(conf, bio); 1336 start_next_window = wait_barrier(conf, bio);
1291 /* 1337 /*
@@ -1363,10 +1409,21 @@ read_again:
1363 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1409 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1364 mbio->bi_end_io = raid1_end_write_request; 1410 mbio->bi_end_io = raid1_end_write_request;
1365 bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); 1411 bio_set_op_attrs(mbio, op, do_flush_fua | do_sync);
1412 if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
1413 !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
1414 conf->raid_disks - mddev->degraded > 1)
1415 mbio->bi_opf |= MD_FAILFAST;
1366 mbio->bi_private = r1_bio; 1416 mbio->bi_private = r1_bio;
1367 1417
1368 atomic_inc(&r1_bio->remaining); 1418 atomic_inc(&r1_bio->remaining);
1369 1419
1420 if (mddev->gendisk)
1421 trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
1422 mbio, disk_devt(mddev->gendisk),
1423 r1_bio->sector);
1424 /* flush_pending_writes() needs access to the rdev so...*/
1425 mbio->bi_bdev = (void*)conf->mirrors[i].rdev;
1426
1370 cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); 1427 cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
1371 if (cb) 1428 if (cb)
1372 plug = container_of(cb, struct raid1_plug_cb, cb); 1429 plug = container_of(cb, struct raid1_plug_cb, cb);
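The hunk above stashes the rdev pointer in bio->bi_bdev when queueing the write, so flush_pending_writes() and raid1_unplug() (earlier hunks) can recover it, fail the bio if the device has meanwhile gone Faulty, and put the real block device back before submission. A simplified stand-in for that pointer-stashing round trip:

/* Simplified structures; only the stash/recover/restore pattern is shown. */
#include <stdio.h>

struct model_bdev { const char *name; };
struct model_rdev { struct model_bdev *bdev; int faulty; };
struct model_bio  { void *bi_bdev; int bi_error; };

static void queue_write(struct model_bio *bio, struct model_rdev *rdev)
{
        bio->bi_bdev = rdev;                    /* stash the rdev, not the bdev */
}

static void flush_write(struct model_bio *bio)
{
        struct model_rdev *rdev = bio->bi_bdev; /* recover the stashed pointer */

        bio->bi_bdev = rdev->bdev;              /* restore the real device */
        if (rdev->faulty) {
                bio->bi_error = -5;             /* fail instead of submitting */
                return;
        }
        printf("submit to %s\n", ((struct model_bdev *)bio->bi_bdev)->name);
}

int main(void)
{
        struct model_bdev bdev = { "sda1" };
        struct model_rdev rdev = { &bdev, 0 };
        struct model_bio bio = { NULL, 0 };

        queue_write(&bio, &rdev);
        flush_write(&bio);
        return 0;
}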
@@ -1436,6 +1493,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
1436 * next level up know. 1493 * next level up know.
1437 * else mark the drive as failed 1494 * else mark the drive as failed
1438 */ 1495 */
1496 spin_lock_irqsave(&conf->device_lock, flags);
1439 if (test_bit(In_sync, &rdev->flags) 1497 if (test_bit(In_sync, &rdev->flags)
1440 && (conf->raid_disks - mddev->degraded) == 1) { 1498 && (conf->raid_disks - mddev->degraded) == 1) {
1441 /* 1499 /*
@@ -1445,10 +1503,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
1445 * it is very likely to fail. 1503 * it is very likely to fail.
1446 */ 1504 */
1447 conf->recovery_disabled = mddev->recovery_disabled; 1505 conf->recovery_disabled = mddev->recovery_disabled;
1506 spin_unlock_irqrestore(&conf->device_lock, flags);
1448 return; 1507 return;
1449 } 1508 }
1450 set_bit(Blocked, &rdev->flags); 1509 set_bit(Blocked, &rdev->flags);
1451 spin_lock_irqsave(&conf->device_lock, flags);
1452 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1510 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1453 mddev->degraded++; 1511 mddev->degraded++;
1454 set_bit(Faulty, &rdev->flags); 1512 set_bit(Faulty, &rdev->flags);
@@ -1459,36 +1517,35 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
1459 * if recovery is running, make sure it aborts. 1517 * if recovery is running, make sure it aborts.
1460 */ 1518 */
1461 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1519 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1462 set_mask_bits(&mddev->flags, 0, 1520 set_mask_bits(&mddev->sb_flags, 0,
1463 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1521 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1464 printk(KERN_ALERT 1522 pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n"
1465 "md/raid1:%s: Disk failure on %s, disabling device.\n" 1523 "md/raid1:%s: Operation continuing on %d devices.\n",
1466 "md/raid1:%s: Operation continuing on %d devices.\n", 1524 mdname(mddev), bdevname(rdev->bdev, b),
1467 mdname(mddev), bdevname(rdev->bdev, b), 1525 mdname(mddev), conf->raid_disks - mddev->degraded);
1468 mdname(mddev), conf->raid_disks - mddev->degraded);
1469} 1526}
1470 1527
1471static void print_conf(struct r1conf *conf) 1528static void print_conf(struct r1conf *conf)
1472{ 1529{
1473 int i; 1530 int i;
1474 1531
1475 printk(KERN_DEBUG "RAID1 conf printout:\n"); 1532 pr_debug("RAID1 conf printout:\n");
1476 if (!conf) { 1533 if (!conf) {
1477 printk(KERN_DEBUG "(!conf)\n"); 1534 pr_debug("(!conf)\n");
1478 return; 1535 return;
1479 } 1536 }
1480 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1537 pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1481 conf->raid_disks); 1538 conf->raid_disks);
1482 1539
1483 rcu_read_lock(); 1540 rcu_read_lock();
1484 for (i = 0; i < conf->raid_disks; i++) { 1541 for (i = 0; i < conf->raid_disks; i++) {
1485 char b[BDEVNAME_SIZE]; 1542 char b[BDEVNAME_SIZE];
1486 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1543 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1487 if (rdev) 1544 if (rdev)
1488 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1545 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1489 i, !test_bit(In_sync, &rdev->flags), 1546 i, !test_bit(In_sync, &rdev->flags),
1490 !test_bit(Faulty, &rdev->flags), 1547 !test_bit(Faulty, &rdev->flags),
1491 bdevname(rdev->bdev,b)); 1548 bdevname(rdev->bdev,b));
1492 } 1549 }
1493 rcu_read_unlock(); 1550 rcu_read_unlock();
1494} 1551}
@@ -1788,12 +1845,24 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1788 sector_t sect = r1_bio->sector; 1845 sector_t sect = r1_bio->sector;
1789 int sectors = r1_bio->sectors; 1846 int sectors = r1_bio->sectors;
1790 int idx = 0; 1847 int idx = 0;
1848 struct md_rdev *rdev;
1849
1850 rdev = conf->mirrors[r1_bio->read_disk].rdev;
1851 if (test_bit(FailFast, &rdev->flags)) {
1852 /* Don't try recovering from here - just fail it
1853 * ... unless it is the last working device of course */
1854 md_error(mddev, rdev);
1855 if (test_bit(Faulty, &rdev->flags))
1856 /* Don't try to read from here, but make sure
 1857 * put_buf does its thing
1858 */
1859 bio->bi_end_io = end_sync_write;
1860 }
1791 1861
1792 while(sectors) { 1862 while(sectors) {
1793 int s = sectors; 1863 int s = sectors;
1794 int d = r1_bio->read_disk; 1864 int d = r1_bio->read_disk;
1795 int success = 0; 1865 int success = 0;
1796 struct md_rdev *rdev;
1797 int start; 1866 int start;
1798 1867
1799 if (s > (PAGE_SIZE>>9)) 1868 if (s > (PAGE_SIZE>>9))
@@ -1825,11 +1894,10 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
1825 * work just disable and interrupt the recovery. 1894 * work just disable and interrupt the recovery.
1826 * Don't fail devices as that won't really help. 1895 * Don't fail devices as that won't really help.
1827 */ 1896 */
1828 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" 1897 pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
1829 " for block %llu\n", 1898 mdname(mddev),
1830 mdname(mddev), 1899 bdevname(bio->bi_bdev, b),
1831 bdevname(bio->bi_bdev, b), 1900 (unsigned long long)r1_bio->sector);
1832 (unsigned long long)r1_bio->sector);
1833 for (d = 0; d < conf->raid_disks * 2; d++) { 1901 for (d = 0; d < conf->raid_disks * 2; d++) {
1834 rdev = conf->mirrors[d].rdev; 1902 rdev = conf->mirrors[d].rdev;
1835 if (!rdev || test_bit(Faulty, &rdev->flags)) 1903 if (!rdev || test_bit(Faulty, &rdev->flags))
@@ -2013,6 +2081,9 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
2013 continue; 2081 continue;
2014 2082
2015 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 2083 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2084 if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
2085 wbio->bi_opf |= MD_FAILFAST;
2086
2016 wbio->bi_end_io = end_sync_write; 2087 wbio->bi_end_io = end_sync_write;
2017 atomic_inc(&r1_bio->remaining); 2088 atomic_inc(&r1_bio->remaining);
2018 md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); 2089 md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
@@ -2122,13 +2193,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2122 if (r1_sync_page_io(rdev, sect, s, 2193 if (r1_sync_page_io(rdev, sect, s,
2123 conf->tmppage, READ)) { 2194 conf->tmppage, READ)) {
2124 atomic_add(s, &rdev->corrected_errors); 2195 atomic_add(s, &rdev->corrected_errors);
2125 printk(KERN_INFO 2196 pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
2126 "md/raid1:%s: read error corrected " 2197 mdname(mddev), s,
2127 "(%d sectors at %llu on %s)\n", 2198 (unsigned long long)(sect +
2128 mdname(mddev), s, 2199 rdev->data_offset),
2129 (unsigned long long)(sect + 2200 bdevname(rdev->bdev, b));
2130 rdev->data_offset),
2131 bdevname(rdev->bdev, b));
2132 } 2201 }
2133 rdev_dec_pending(rdev, mddev); 2202 rdev_dec_pending(rdev, mddev);
2134 } else 2203 } else
@@ -2287,6 +2356,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
2287 struct bio *bio; 2356 struct bio *bio;
2288 char b[BDEVNAME_SIZE]; 2357 char b[BDEVNAME_SIZE];
2289 struct md_rdev *rdev; 2358 struct md_rdev *rdev;
2359 dev_t bio_dev;
2360 sector_t bio_sector;
2290 2361
2291 clear_bit(R1BIO_ReadError, &r1_bio->state); 2362 clear_bit(R1BIO_ReadError, &r1_bio->state);
2292 /* we got a read error. Maybe the drive is bad. Maybe just 2363 /* we got a read error. Maybe the drive is bad. Maybe just
@@ -2300,10 +2371,14 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
2300 2371
2301 bio = r1_bio->bios[r1_bio->read_disk]; 2372 bio = r1_bio->bios[r1_bio->read_disk];
2302 bdevname(bio->bi_bdev, b); 2373 bdevname(bio->bi_bdev, b);
2374 bio_dev = bio->bi_bdev->bd_dev;
2375 bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
2303 bio_put(bio); 2376 bio_put(bio);
2304 r1_bio->bios[r1_bio->read_disk] = NULL; 2377 r1_bio->bios[r1_bio->read_disk] = NULL;
2305 2378
2306 if (mddev->ro == 0) { 2379 rdev = conf->mirrors[r1_bio->read_disk].rdev;
2380 if (mddev->ro == 0
2381 && !test_bit(FailFast, &rdev->flags)) {
2307 freeze_array(conf, 1); 2382 freeze_array(conf, 1);
2308 fix_read_error(conf, r1_bio->read_disk, 2383 fix_read_error(conf, r1_bio->read_disk,
2309 r1_bio->sector, r1_bio->sectors); 2384 r1_bio->sector, r1_bio->sectors);
@@ -2312,14 +2387,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
2312 r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; 2387 r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
2313 } 2388 }
2314 2389
2315 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 2390 rdev_dec_pending(rdev, conf->mddev);
2316 2391
2317read_more: 2392read_more:
2318 disk = read_balance(conf, r1_bio, &max_sectors); 2393 disk = read_balance(conf, r1_bio, &max_sectors);
2319 if (disk == -1) { 2394 if (disk == -1) {
2320 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" 2395 pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
2321 " read error for block %llu\n", 2396 mdname(mddev), b, (unsigned long long)r1_bio->sector);
2322 mdname(mddev), b, (unsigned long long)r1_bio->sector);
2323 raid_end_bio_io(r1_bio); 2397 raid_end_bio_io(r1_bio);
2324 } else { 2398 } else {
2325 const unsigned long do_sync 2399 const unsigned long do_sync
@@ -2330,16 +2404,17 @@ read_more:
2330 max_sectors); 2404 max_sectors);
2331 r1_bio->bios[r1_bio->read_disk] = bio; 2405 r1_bio->bios[r1_bio->read_disk] = bio;
2332 rdev = conf->mirrors[disk].rdev; 2406 rdev = conf->mirrors[disk].rdev;
2333 printk_ratelimited(KERN_ERR 2407 pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
2334 "md/raid1:%s: redirecting sector %llu" 2408 mdname(mddev),
2335 " to other mirror: %s\n", 2409 (unsigned long long)r1_bio->sector,
2336 mdname(mddev), 2410 bdevname(rdev->bdev, b));
2337 (unsigned long long)r1_bio->sector,
2338 bdevname(rdev->bdev, b));
2339 bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; 2411 bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
2340 bio->bi_bdev = rdev->bdev; 2412 bio->bi_bdev = rdev->bdev;
2341 bio->bi_end_io = raid1_end_read_request; 2413 bio->bi_end_io = raid1_end_read_request;
2342 bio_set_op_attrs(bio, REQ_OP_READ, do_sync); 2414 bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
2415 if (test_bit(FailFast, &rdev->flags) &&
2416 test_bit(R1BIO_FailFast, &r1_bio->state))
2417 bio->bi_opf |= MD_FAILFAST;
2343 bio->bi_private = r1_bio; 2418 bio->bi_private = r1_bio;
2344 if (max_sectors < r1_bio->sectors) { 2419 if (max_sectors < r1_bio->sectors) {
2345 /* Drat - have to split this up more */ 2420 /* Drat - have to split this up more */
@@ -2353,6 +2428,8 @@ read_more:
2353 else 2428 else
2354 mbio->bi_phys_segments++; 2429 mbio->bi_phys_segments++;
2355 spin_unlock_irq(&conf->device_lock); 2430 spin_unlock_irq(&conf->device_lock);
2431 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
2432 bio, bio_dev, bio_sector);
2356 generic_make_request(bio); 2433 generic_make_request(bio);
2357 bio = NULL; 2434 bio = NULL;
2358 2435
@@ -2367,8 +2444,11 @@ read_more:
2367 sectors_handled; 2444 sectors_handled;
2368 2445
2369 goto read_more; 2446 goto read_more;
2370 } else 2447 } else {
2448 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
2449 bio, bio_dev, bio_sector);
2371 generic_make_request(bio); 2450 generic_make_request(bio);
2451 }
2372 } 2452 }
2373} 2453}
2374 2454
@@ -2384,10 +2464,10 @@ static void raid1d(struct md_thread *thread)
2384 md_check_recovery(mddev); 2464 md_check_recovery(mddev);
2385 2465
2386 if (!list_empty_careful(&conf->bio_end_io_list) && 2466 if (!list_empty_careful(&conf->bio_end_io_list) &&
2387 !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 2467 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2388 LIST_HEAD(tmp); 2468 LIST_HEAD(tmp);
2389 spin_lock_irqsave(&conf->device_lock, flags); 2469 spin_lock_irqsave(&conf->device_lock, flags);
2390 if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 2470 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2391 while (!list_empty(&conf->bio_end_io_list)) { 2471 while (!list_empty(&conf->bio_end_io_list)) {
2392 list_move(conf->bio_end_io_list.prev, &tmp); 2472 list_move(conf->bio_end_io_list.prev, &tmp);
2393 conf->nr_queued--; 2473 conf->nr_queued--;
@@ -2441,7 +2521,7 @@ static void raid1d(struct md_thread *thread)
2441 generic_make_request(r1_bio->bios[r1_bio->read_disk]); 2521 generic_make_request(r1_bio->bios[r1_bio->read_disk]);
2442 2522
2443 cond_resched(); 2523 cond_resched();
2444 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 2524 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2445 md_check_recovery(mddev); 2525 md_check_recovery(mddev);
2446 } 2526 }
2447 blk_finish_plug(&plug); 2527 blk_finish_plug(&plug);
@@ -2623,6 +2703,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2623 bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; 2703 bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
2624 bio->bi_bdev = rdev->bdev; 2704 bio->bi_bdev = rdev->bdev;
2625 bio->bi_private = r1_bio; 2705 bio->bi_private = r1_bio;
2706 if (test_bit(FailFast, &rdev->flags))
2707 bio->bi_opf |= MD_FAILFAST;
2626 } 2708 }
2627 } 2709 }
2628 rcu_read_unlock(); 2710 rcu_read_unlock();
@@ -2642,7 +2724,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2642 min_bad, 0 2724 min_bad, 0
2643 ) && ok; 2725 ) && ok;
2644 } 2726 }
2645 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2727 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2646 *skipped = 1; 2728 *skipped = 1;
2647 put_buf(r1_bio); 2729 put_buf(r1_bio);
2648 2730
@@ -2753,6 +2835,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2753 if (bio->bi_end_io == end_sync_read) { 2835 if (bio->bi_end_io == end_sync_read) {
2754 read_targets--; 2836 read_targets--;
2755 md_sync_acct(bio->bi_bdev, nr_sectors); 2837 md_sync_acct(bio->bi_bdev, nr_sectors);
2838 if (read_targets == 1)
2839 bio->bi_opf &= ~MD_FAILFAST;
2756 generic_make_request(bio); 2840 generic_make_request(bio);
2757 } 2841 }
2758 } 2842 }
@@ -2760,6 +2844,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2760 atomic_set(&r1_bio->remaining, 1); 2844 atomic_set(&r1_bio->remaining, 1);
2761 bio = r1_bio->bios[r1_bio->read_disk]; 2845 bio = r1_bio->bios[r1_bio->read_disk];
2762 md_sync_acct(bio->bi_bdev, nr_sectors); 2846 md_sync_acct(bio->bi_bdev, nr_sectors);
2847 if (read_targets == 1)
2848 bio->bi_opf &= ~MD_FAILFAST;
2763 generic_make_request(bio); 2849 generic_make_request(bio);
2764 2850
2765 } 2851 }
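The two resync hunks above clear MD_FAILFAST when exactly one read source remains: failing fast only helps while another copy can service a retry. A minimal userspace sketch of that rule, using hypothetical names rather than the kernel API:

	#include <stdbool.h>
	#include <stdio.h>

	#define SKETCH_FAILFAST 0x1u   /* stand-in for MD_FAILFAST */

	/* Keep failfast only while at least two read sources remain. */
	static unsigned int sketch_read_flags(unsigned int opf, int read_targets)
	{
		if (read_targets == 1)
			opf &= ~SKETCH_FAILFAST;
		return opf;
	}

	int main(void)
	{
		printf("%u\n", sketch_read_flags(SKETCH_FAILFAST, 2)); /* keeps failfast */
		printf("%u\n", sketch_read_flags(SKETCH_FAILFAST, 1)); /* drops it */
		return 0;
	}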
@@ -2875,12 +2961,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2875 2961
2876 err = -ENOMEM; 2962 err = -ENOMEM;
2877 conf->thread = md_register_thread(raid1d, mddev, "raid1"); 2963 conf->thread = md_register_thread(raid1d, mddev, "raid1");
2878 if (!conf->thread) { 2964 if (!conf->thread)
2879 printk(KERN_ERR
2880 "md/raid1:%s: couldn't allocate thread\n",
2881 mdname(mddev));
2882 goto abort; 2965 goto abort;
2883 }
2884 2966
2885 return conf; 2967 return conf;
2886 2968
@@ -2905,13 +2987,13 @@ static int raid1_run(struct mddev *mddev)
2905 bool discard_supported = false; 2987 bool discard_supported = false;
2906 2988
2907 if (mddev->level != 1) { 2989 if (mddev->level != 1) {
2908 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", 2990 pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
2909 mdname(mddev), mddev->level); 2991 mdname(mddev), mddev->level);
2910 return -EIO; 2992 return -EIO;
2911 } 2993 }
2912 if (mddev->reshape_position != MaxSector) { 2994 if (mddev->reshape_position != MaxSector) {
2913 printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", 2995 pr_warn("md/raid1:%s: reshape_position set but not supported\n",
2914 mdname(mddev)); 2996 mdname(mddev));
2915 return -EIO; 2997 return -EIO;
2916 } 2998 }
2917 /* 2999 /*
@@ -2950,11 +3032,9 @@ static int raid1_run(struct mddev *mddev)
2950 mddev->recovery_cp = MaxSector; 3032 mddev->recovery_cp = MaxSector;
2951 3033
2952 if (mddev->recovery_cp != MaxSector) 3034 if (mddev->recovery_cp != MaxSector)
2953 printk(KERN_NOTICE "md/raid1:%s: not clean" 3035 pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
2954 " -- starting background reconstruction\n", 3036 mdname(mddev));
2955 mdname(mddev)); 3037 pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
2956 printk(KERN_INFO
2957 "md/raid1:%s: active with %d out of %d mirrors\n",
2958 mdname(mddev), mddev->raid_disks - mddev->degraded, 3038 mdname(mddev), mddev->raid_disks - mddev->degraded,
2959 mddev->raid_disks); 3039 mddev->raid_disks);
2960 3040
@@ -2964,6 +3044,7 @@ static int raid1_run(struct mddev *mddev)
2964 mddev->thread = conf->thread; 3044 mddev->thread = conf->thread;
2965 conf->thread = NULL; 3045 conf->thread = NULL;
2966 mddev->private = conf; 3046 mddev->private = conf;
3047 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
2967 3048
2968 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 3049 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2969 3050
@@ -3107,9 +3188,8 @@ static int raid1_reshape(struct mddev *mddev)
3107 rdev->raid_disk = d2; 3188 rdev->raid_disk = d2;
3108 sysfs_unlink_rdev(mddev, rdev); 3189 sysfs_unlink_rdev(mddev, rdev);
3109 if (sysfs_link_rdev(mddev, rdev)) 3190 if (sysfs_link_rdev(mddev, rdev))
3110 printk(KERN_WARNING 3191 pr_warn("md/raid1:%s: cannot register rd%d\n",
3111 "md/raid1:%s: cannot register rd%d\n", 3192 mdname(mddev), rdev->raid_disk);
3112 mdname(mddev), rdev->raid_disk);
3113 } 3193 }
3114 if (rdev) 3194 if (rdev)
3115 newmirrors[d2++].rdev = rdev; 3195 newmirrors[d2++].rdev = rdev;
@@ -3163,9 +3243,12 @@ static void *raid1_takeover(struct mddev *mddev)
3163 mddev->new_layout = 0; 3243 mddev->new_layout = 0;
3164 mddev->new_chunk_sectors = 0; 3244 mddev->new_chunk_sectors = 0;
3165 conf = setup_conf(mddev); 3245 conf = setup_conf(mddev);
3166 if (!IS_ERR(conf)) 3246 if (!IS_ERR(conf)) {
3167 /* Array must appear to be quiesced */ 3247 /* Array must appear to be quiesced */
3168 conf->array_frozen = 1; 3248 conf->array_frozen = 1;
3249 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
3250 clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
3251 }
3169 return conf; 3252 return conf;
3170 } 3253 }
3171 return ERR_PTR(-EINVAL); 3254 return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 61c39b390cd8..c52ef424a24b 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -161,14 +161,15 @@ struct r1bio {
161}; 161};
162 162
163/* bits for r1bio.state */ 163/* bits for r1bio.state */
164#define R1BIO_Uptodate 0 164enum r1bio_state {
165#define R1BIO_IsSync 1 165 R1BIO_Uptodate,
166#define R1BIO_Degraded 2 166 R1BIO_IsSync,
167#define R1BIO_BehindIO 3 167 R1BIO_Degraded,
168 R1BIO_BehindIO,
168/* Set ReadError on bios that experience a readerror so that 169/* Set ReadError on bios that experience a readerror so that
169 * raid1d knows what to do with them. 170 * raid1d knows what to do with them.
170 */ 171 */
171#define R1BIO_ReadError 4 172 R1BIO_ReadError,
172/* For write-behind requests, we call bi_end_io when 173/* For write-behind requests, we call bi_end_io when
173 * the last non-write-behind device completes, providing 174 * the last non-write-behind device completes, providing
174 * any write was successful. Otherwise we call when 175 * any write was successful. Otherwise we call when
@@ -176,10 +177,12 @@ struct r1bio {
176 * with failure when last write completes (and all failed). 177 * with failure when last write completes (and all failed).
177 * Record that bi_end_io was called with this flag... 178 * Record that bi_end_io was called with this flag...
178 */ 179 */
179#define R1BIO_Returned 6 180 R1BIO_Returned,
180/* If a write for this request means we can clear some 181/* If a write for this request means we can clear some
181 * known-bad-block records, we set this flag 182 * known-bad-block records, we set this flag
182 */ 183 */
183#define R1BIO_MadeGood 7 184 R1BIO_MadeGood,
184#define R1BIO_WriteError 8 185 R1BIO_WriteError,
186 R1BIO_FailFast,
187};
185#endif 188#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 39fddda2fef2..ab5e86209322 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -25,6 +25,7 @@
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <trace/events/block.h>
28#include "md.h" 29#include "md.h"
29#include "raid10.h" 30#include "raid10.h"
30#include "raid0.h" 31#include "raid0.h"
@@ -99,12 +100,16 @@ static int max_queued_requests = 1024;
99static void allow_barrier(struct r10conf *conf); 100static void allow_barrier(struct r10conf *conf);
100static void lower_barrier(struct r10conf *conf); 101static void lower_barrier(struct r10conf *conf);
101static int _enough(struct r10conf *conf, int previous, int ignore); 102static int _enough(struct r10conf *conf, int previous, int ignore);
103static int enough(struct r10conf *conf, int ignore);
102static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 104static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
103 int *skipped); 105 int *skipped);
104static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); 106static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
105static void end_reshape_write(struct bio *bio); 107static void end_reshape_write(struct bio *bio);
106static void end_reshape(struct r10conf *conf); 108static void end_reshape(struct r10conf *conf);
107 109
110#define raid10_log(md, fmt, args...) \
111 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
112
108static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 113static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
109{ 114{
110 struct r10conf *conf = data; 115 struct r10conf *conf = data;
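The raid10_log() macro added above is a thin guard around blk_add_trace_msg(): it prefixes every message with "raid10 " and becomes a no-op when the mddev has no request queue, so the wait annotations later in this patch cost nothing outside blktrace. A standalone sketch of the same guarded, prefixed-logging shape; the sketch_* names and the GNU-style variadic macro are illustrative only, not the kernel API:

	#include <stdarg.h>
	#include <stdio.h>

	struct sketch_dev { FILE *queue; };   /* stand-in for mddev->queue */

	static void sketch_trace_msg(FILE *q, const char *fmt, ...)
	{
		va_list ap;

		va_start(ap, fmt);
		vfprintf(q, fmt, ap);
		va_end(ap);
		fputc('\n', q);
	}

	/* Same shape as raid10_log(): skip silently when no queue is attached. */
	#define sketch_log(dev, fmt, ...) \
		do { \
			if ((dev)->queue) \
				sketch_trace_msg((dev)->queue, "raid10 " fmt, ##__VA_ARGS__); \
		} while (0)

	int main(void)
	{
		struct sketch_dev with = { .queue = stdout };
		struct sketch_dev without = { .queue = NULL };

		sketch_log(&with, "wait barrier");
		sketch_log(&without, "wait barrier");   /* no output, no crash */
		sketch_log(&with, "wait rdev %d blocked", 3);
		return 0;
	}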
@@ -404,8 +409,7 @@ static void raid10_end_read_request(struct bio *bio)
404 * oops, read error - keep the refcount on the rdev 409 * oops, read error - keep the refcount on the rdev
405 */ 410 */
406 char b[BDEVNAME_SIZE]; 411 char b[BDEVNAME_SIZE];
407 printk_ratelimited(KERN_ERR 412 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
408 "md/raid10:%s: %s: rescheduling sector %llu\n",
409 mdname(conf->mddev), 413 mdname(conf->mddev),
410 bdevname(rdev->bdev, b), 414 bdevname(rdev->bdev, b),
411 (unsigned long long)r10_bio->sector); 415 (unsigned long long)r10_bio->sector);
@@ -447,6 +451,7 @@ static void raid10_end_write_request(struct bio *bio)
447 struct r10conf *conf = r10_bio->mddev->private; 451 struct r10conf *conf = r10_bio->mddev->private;
448 int slot, repl; 452 int slot, repl;
449 struct md_rdev *rdev = NULL; 453 struct md_rdev *rdev = NULL;
454 struct bio *to_put = NULL;
450 bool discard_error; 455 bool discard_error;
451 456
452 discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; 457 discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
@@ -474,8 +479,24 @@ static void raid10_end_write_request(struct bio *bio)
474 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 479 if (!test_and_set_bit(WantReplacement, &rdev->flags))
475 set_bit(MD_RECOVERY_NEEDED, 480 set_bit(MD_RECOVERY_NEEDED,
476 &rdev->mddev->recovery); 481 &rdev->mddev->recovery);
477 set_bit(R10BIO_WriteError, &r10_bio->state); 482
478 dec_rdev = 0; 483 dec_rdev = 0;
484 if (test_bit(FailFast, &rdev->flags) &&
485 (bio->bi_opf & MD_FAILFAST)) {
486 md_error(rdev->mddev, rdev);
487 if (!test_bit(Faulty, &rdev->flags))
488 /* This is the only remaining device,
489 * We need to retry the write without
490 * FailFast
491 */
492 set_bit(R10BIO_WriteError, &r10_bio->state);
493 else {
494 r10_bio->devs[slot].bio = NULL;
495 to_put = bio;
496 dec_rdev = 1;
497 }
498 } else
499 set_bit(R10BIO_WriteError, &r10_bio->state);
479 } 500 }
480 } else { 501 } else {
481 /* 502 /*
@@ -525,6 +546,8 @@ static void raid10_end_write_request(struct bio *bio)
525 one_write_done(r10_bio); 546 one_write_done(r10_bio);
526 if (dec_rdev) 547 if (dec_rdev)
527 rdev_dec_pending(rdev, conf->mddev); 548 rdev_dec_pending(rdev, conf->mddev);
549 if (to_put)
550 bio_put(to_put);
528} 551}
529 552
530/* 553/*
@@ -716,6 +739,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
716 best_dist = MaxSector; 739 best_dist = MaxSector;
717 best_good_sectors = 0; 740 best_good_sectors = 0;
718 do_balance = 1; 741 do_balance = 1;
742 clear_bit(R10BIO_FailFast, &r10_bio->state);
719 /* 743 /*
720 * Check if we can balance. We can balance on the whole 744 * Check if we can balance. We can balance on the whole
721 * device if no resync is going on (recovery is ok), or below 745 * device if no resync is going on (recovery is ok), or below
@@ -780,15 +804,18 @@ static struct md_rdev *read_balance(struct r10conf *conf,
780 if (!do_balance) 804 if (!do_balance)
781 break; 805 break;
782 806
807 if (best_slot >= 0)
808 /* At least 2 disks to choose from so failfast is OK */
809 set_bit(R10BIO_FailFast, &r10_bio->state);
783 /* This optimisation is debatable, and completely destroys 810 /* This optimisation is debatable, and completely destroys
784 * sequential read speed for 'far copies' arrays. So only 811 * sequential read speed for 'far copies' arrays. So only
785 * keep it for 'near' arrays, and review those later. 812 * keep it for 'near' arrays, and review those later.
786 */ 813 */
787 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 814 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
788 break; 815 new_distance = 0;
789 816
790 /* for far > 1 always use the lowest address */ 817 /* for far > 1 always use the lowest address */
791 if (geo->far_copies > 1) 818 else if (geo->far_copies > 1)
792 new_distance = r10_bio->devs[slot].addr; 819 new_distance = r10_bio->devs[slot].addr;
793 else 820 else
794 new_distance = abs(r10_bio->devs[slot].addr - 821 new_distance = abs(r10_bio->devs[slot].addr -
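The read_balance() change above keeps the usual smallest-distance selection but treats an idle device on a 'near' layout as distance zero instead of breaking out of the loop, so the FailFast bookkeeping above it still applies. A compilable userspace model of that selection rule; the sketch_* types and example numbers are illustrative only:

	#include <stdint.h>
	#include <stdio.h>

	struct sketch_slot {
		uint64_t addr;          /* candidate start sector on this copy */
		uint64_t head_position; /* where that disk's head last was */
		int      nr_pending;    /* outstanding I/O on that disk */
	};

	/* Pick the copy with the smallest seek distance; an idle disk on a
	 * "near" layout counts as distance 0, and "far" layouts always use
	 * the lowest address, mirroring the hunk above. */
	static int sketch_read_balance(const struct sketch_slot *s, int copies,
				       int near_copies, int far_copies)
	{
		uint64_t best = UINT64_MAX;
		int best_slot = -1, i;

		for (i = 0; i < copies; i++) {
			uint64_t dist;

			if (near_copies > 1 && s[i].nr_pending == 0)
				dist = 0;
			else if (far_copies > 1)
				dist = s[i].addr;
			else
				dist = s[i].addr > s[i].head_position ?
				       s[i].addr - s[i].head_position :
				       s[i].head_position - s[i].addr;
			if (dist < best) {
				best = dist;
				best_slot = i;
			}
		}
		return best_slot;
	}

	int main(void)
	{
		struct sketch_slot slots[2] = {
			{ .addr = 1000, .head_position = 10,  .nr_pending = 3 },
			{ .addr = 1000, .head_position = 900, .nr_pending = 0 },
		};

		/* near layout: slot 1 is idle, so it wins with distance 0 */
		printf("%d\n", sketch_read_balance(slots, 2, 2, 1));
		return 0;
	}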
@@ -859,9 +886,14 @@ static void flush_pending_writes(struct r10conf *conf)
859 886
860 while (bio) { /* submit pending writes */ 887 while (bio) { /* submit pending writes */
861 struct bio *next = bio->bi_next; 888 struct bio *next = bio->bi_next;
889 struct md_rdev *rdev = (void*)bio->bi_bdev;
862 bio->bi_next = NULL; 890 bio->bi_next = NULL;
863 if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 891 bio->bi_bdev = rdev->bdev;
864 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 892 if (test_bit(Faulty, &rdev->flags)) {
893 bio->bi_error = -EIO;
894 bio_endio(bio);
895 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
896 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
865 /* Just ignore it */ 897 /* Just ignore it */
866 bio_endio(bio); 898 bio_endio(bio);
867 else 899 else
@@ -937,6 +969,7 @@ static void wait_barrier(struct r10conf *conf)
937 * that queue to get the nr_pending 969 * that queue to get the nr_pending
938 * count down. 970 * count down.
939 */ 971 */
972 raid10_log(conf->mddev, "wait barrier");
940 wait_event_lock_irq(conf->wait_barrier, 973 wait_event_lock_irq(conf->wait_barrier,
941 !conf->barrier || 974 !conf->barrier ||
942 (atomic_read(&conf->nr_pending) && 975 (atomic_read(&conf->nr_pending) &&
@@ -1037,9 +1070,14 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1037 1070
1038 while (bio) { /* submit pending writes */ 1071 while (bio) { /* submit pending writes */
1039 struct bio *next = bio->bi_next; 1072 struct bio *next = bio->bi_next;
1073 struct md_rdev *rdev = (void*)bio->bi_bdev;
1040 bio->bi_next = NULL; 1074 bio->bi_next = NULL;
1041 if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 1075 bio->bi_bdev = rdev->bdev;
1042 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 1076 if (test_bit(Faulty, &rdev->flags)) {
1077 bio->bi_error = -EIO;
1078 bio_endio(bio);
1079 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1080 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1043 /* Just ignore it */ 1081 /* Just ignore it */
1044 bio_endio(bio); 1082 bio_endio(bio);
1045 else 1083 else
@@ -1083,6 +1121,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
1083 /* IO spans the reshape position. Need to wait for 1121 /* IO spans the reshape position. Need to wait for
1084 * reshape to pass 1122 * reshape to pass
1085 */ 1123 */
1124 raid10_log(conf->mddev, "wait reshape");
1086 allow_barrier(conf); 1125 allow_barrier(conf);
1087 wait_event(conf->wait_barrier, 1126 wait_event(conf->wait_barrier,
1088 conf->reshape_progress <= bio->bi_iter.bi_sector || 1127 conf->reshape_progress <= bio->bi_iter.bi_sector ||
@@ -1099,11 +1138,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
1099 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1138 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1100 /* Need to update reshape_position in metadata */ 1139 /* Need to update reshape_position in metadata */
1101 mddev->reshape_position = conf->reshape_progress; 1140 mddev->reshape_position = conf->reshape_progress;
1102 set_mask_bits(&mddev->flags, 0, 1141 set_mask_bits(&mddev->sb_flags, 0,
1103 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1142 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1104 md_wakeup_thread(mddev->thread); 1143 md_wakeup_thread(mddev->thread);
1144 raid10_log(conf->mddev, "wait reshape metadata");
1105 wait_event(mddev->sb_wait, 1145 wait_event(mddev->sb_wait,
1106 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 1146 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1107 1147
1108 conf->reshape_safe = mddev->reshape_position; 1148 conf->reshape_safe = mddev->reshape_position;
1109 } 1149 }
@@ -1154,8 +1194,15 @@ read_again:
1154 read_bio->bi_bdev = rdev->bdev; 1194 read_bio->bi_bdev = rdev->bdev;
1155 read_bio->bi_end_io = raid10_end_read_request; 1195 read_bio->bi_end_io = raid10_end_read_request;
1156 bio_set_op_attrs(read_bio, op, do_sync); 1196 bio_set_op_attrs(read_bio, op, do_sync);
1197 if (test_bit(FailFast, &rdev->flags) &&
1198 test_bit(R10BIO_FailFast, &r10_bio->state))
1199 read_bio->bi_opf |= MD_FAILFAST;
1157 read_bio->bi_private = r10_bio; 1200 read_bio->bi_private = r10_bio;
1158 1201
1202 if (mddev->gendisk)
1203 trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
1204 read_bio, disk_devt(mddev->gendisk),
1205 r10_bio->sector);
1159 if (max_sectors < r10_bio->sectors) { 1206 if (max_sectors < r10_bio->sectors) {
1160 /* Could not read all from this device, so we will 1207 /* Could not read all from this device, so we will
1161 * need another r10_bio. 1208 * need another r10_bio.
@@ -1195,6 +1242,7 @@ read_again:
1195 */ 1242 */
1196 if (conf->pending_count >= max_queued_requests) { 1243 if (conf->pending_count >= max_queued_requests) {
1197 md_wakeup_thread(mddev->thread); 1244 md_wakeup_thread(mddev->thread);
1245 raid10_log(mddev, "wait queued");
1198 wait_event(conf->wait_barrier, 1246 wait_event(conf->wait_barrier,
1199 conf->pending_count < max_queued_requests); 1247 conf->pending_count < max_queued_requests);
1200 } 1248 }
@@ -1322,6 +1370,7 @@ retry_write:
1322 } 1370 }
1323 } 1371 }
1324 allow_barrier(conf); 1372 allow_barrier(conf);
1373 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1325 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1374 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1326 wait_barrier(conf); 1375 wait_barrier(conf);
1327 goto retry_write; 1376 goto retry_write;
@@ -1361,8 +1410,18 @@ retry_write:
1361 mbio->bi_bdev = rdev->bdev; 1410 mbio->bi_bdev = rdev->bdev;
1362 mbio->bi_end_io = raid10_end_write_request; 1411 mbio->bi_end_io = raid10_end_write_request;
1363 bio_set_op_attrs(mbio, op, do_sync | do_fua); 1412 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1413 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
1414 enough(conf, d))
1415 mbio->bi_opf |= MD_FAILFAST;
1364 mbio->bi_private = r10_bio; 1416 mbio->bi_private = r10_bio;
1365 1417
1418 if (conf->mddev->gendisk)
1419 trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
1420 mbio, disk_devt(conf->mddev->gendisk),
1421 r10_bio->sector);
1422 /* flush_pending_writes() needs access to the rdev so...*/
1423 mbio->bi_bdev = (void*)rdev;
1424
1366 atomic_inc(&r10_bio->remaining); 1425 atomic_inc(&r10_bio->remaining);
1367 1426
1368 cb = blk_check_plugged(raid10_unplug, mddev, 1427 cb = blk_check_plugged(raid10_unplug, mddev,
@@ -1405,6 +1464,13 @@ retry_write:
1405 bio_set_op_attrs(mbio, op, do_sync | do_fua); 1464 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1406 mbio->bi_private = r10_bio; 1465 mbio->bi_private = r10_bio;
1407 1466
1467 if (conf->mddev->gendisk)
1468 trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
1469 mbio, disk_devt(conf->mddev->gendisk),
1470 r10_bio->sector);
1471 /* flush_pending_writes() needs access to the rdev so...*/
1472 mbio->bi_bdev = (void*)rdev;
1473
1408 atomic_inc(&r10_bio->remaining); 1474 atomic_inc(&r10_bio->remaining);
1409 spin_lock_irqsave(&conf->device_lock, flags); 1475 spin_lock_irqsave(&conf->device_lock, flags);
1410 bio_list_add(&conf->pending_bio_list, mbio); 1476 bio_list_add(&conf->pending_bio_list, mbio);
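Both write paths above park the struct md_rdev pointer in bi_bdev while the bio sits on the plug or pending list; flush_pending_writes() and raid10_unplug() pull it back out, restore the real block device, and complete the bio with -EIO if the device has since gone Faulty. A small userspace sketch of that stash-and-restore pattern; all sketch_* types and names are illustrative:

	#include <stdbool.h>
	#include <stdio.h>

	struct sketch_bdev { const char *name; };

	struct sketch_rdev {
		struct sketch_bdev *bdev;
		bool faulty;
	};

	struct sketch_bio {
		void *bi_bdev;   /* holds an rdev while queued, a bdev when submitted */
		int   bi_error;
	};

	/* Queueing side: remember the rdev, not the bdev, so the flush side
	 * can still check Faulty after the bio has sat on a list. */
	static void sketch_queue(struct sketch_bio *bio, struct sketch_rdev *rdev)
	{
		bio->bi_bdev = rdev;
	}

	/* Flush side: restore the real bdev, or fail the bio if the device died. */
	static void sketch_flush_one(struct sketch_bio *bio)
	{
		struct sketch_rdev *rdev = bio->bi_bdev;

		bio->bi_bdev = rdev->bdev;
		if (rdev->faulty) {
			bio->bi_error = -5;   /* -EIO */
			printf("completed with error\n");
		} else {
			printf("submitted to %s\n", rdev->bdev->name);
		}
	}

	int main(void)
	{
		struct sketch_bdev disk = { .name = "sdb" };
		struct sketch_rdev rdev = { .bdev = &disk, .faulty = false };
		struct sketch_bio bio = { 0 };

		sketch_queue(&bio, &rdev);
		rdev.faulty = true;      /* device fails while the bio is queued */
		sketch_flush_one(&bio);  /* -> completed with error */
		return 0;
	}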
@@ -1586,14 +1652,13 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1586 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1652 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1587 set_bit(Blocked, &rdev->flags); 1653 set_bit(Blocked, &rdev->flags);
1588 set_bit(Faulty, &rdev->flags); 1654 set_bit(Faulty, &rdev->flags);
1589 set_mask_bits(&mddev->flags, 0, 1655 set_mask_bits(&mddev->sb_flags, 0,
1590 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1656 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1591 spin_unlock_irqrestore(&conf->device_lock, flags); 1657 spin_unlock_irqrestore(&conf->device_lock, flags);
1592 printk(KERN_ALERT 1658 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1593 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1659 "md/raid10:%s: Operation continuing on %d devices.\n",
1594 "md/raid10:%s: Operation continuing on %d devices.\n", 1660 mdname(mddev), bdevname(rdev->bdev, b),
1595 mdname(mddev), bdevname(rdev->bdev, b), 1661 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1596 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1597} 1662}
1598 1663
1599static void print_conf(struct r10conf *conf) 1664static void print_conf(struct r10conf *conf)
@@ -1601,13 +1666,13 @@ static void print_conf(struct r10conf *conf)
1601 int i; 1666 int i;
1602 struct md_rdev *rdev; 1667 struct md_rdev *rdev;
1603 1668
1604 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1669 pr_debug("RAID10 conf printout:\n");
1605 if (!conf) { 1670 if (!conf) {
1606 printk(KERN_DEBUG "(!conf)\n"); 1671 pr_debug("(!conf)\n");
1607 return; 1672 return;
1608 } 1673 }
1609 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 1674 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1610 conf->geo.raid_disks); 1675 conf->geo.raid_disks);
1611 1676
1612 /* This is only called with ->reconfix_mutex held, so 1677 /* This is only called with ->reconfix_mutex held, so
1613 * rcu protection of rdev is not needed */ 1678 * rcu protection of rdev is not needed */
@@ -1615,10 +1680,10 @@ static void print_conf(struct r10conf *conf)
1615 char b[BDEVNAME_SIZE]; 1680 char b[BDEVNAME_SIZE];
1616 rdev = conf->mirrors[i].rdev; 1681 rdev = conf->mirrors[i].rdev;
1617 if (rdev) 1682 if (rdev)
1618 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1683 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1619 i, !test_bit(In_sync, &rdev->flags), 1684 i, !test_bit(In_sync, &rdev->flags),
1620 !test_bit(Faulty, &rdev->flags), 1685 !test_bit(Faulty, &rdev->flags),
1621 bdevname(rdev->bdev,b)); 1686 bdevname(rdev->bdev,b));
1622 } 1687 }
1623} 1688}
1624 1689
@@ -1953,6 +2018,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1953 /* now find blocks with errors */ 2018 /* now find blocks with errors */
1954 for (i=0 ; i < conf->copies ; i++) { 2019 for (i=0 ; i < conf->copies ; i++) {
1955 int j, d; 2020 int j, d;
2021 struct md_rdev *rdev;
1956 2022
1957 tbio = r10_bio->devs[i].bio; 2023 tbio = r10_bio->devs[i].bio;
1958 2024
@@ -1960,6 +2026,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1960 continue; 2026 continue;
1961 if (i == first) 2027 if (i == first)
1962 continue; 2028 continue;
2029 d = r10_bio->devs[i].devnum;
2030 rdev = conf->mirrors[d].rdev;
1963 if (!r10_bio->devs[i].bio->bi_error) { 2031 if (!r10_bio->devs[i].bio->bi_error) {
1964 /* We know that the bi_io_vec layout is the same for 2032 /* We know that the bi_io_vec layout is the same for
1965 * both 'first' and 'i', so we just compare them. 2033 * both 'first' and 'i', so we just compare them.
@@ -1982,6 +2050,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1982 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2050 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1983 /* Don't fix anything. */ 2051 /* Don't fix anything. */
1984 continue; 2052 continue;
2053 } else if (test_bit(FailFast, &rdev->flags)) {
2054 /* Just give up on this device */
2055 md_error(rdev->mddev, rdev);
2056 continue;
1985 } 2057 }
1986 /* Ok, we need to write this bio, either to correct an 2058 /* Ok, we need to write this bio, either to correct an
1987 * inconsistency or to correct an unreadable block. 2059 * inconsistency or to correct an unreadable block.
@@ -1999,11 +2071,12 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1999 2071
2000 bio_copy_data(tbio, fbio); 2072 bio_copy_data(tbio, fbio);
2001 2073
2002 d = r10_bio->devs[i].devnum;
2003 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2074 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2004 atomic_inc(&r10_bio->remaining); 2075 atomic_inc(&r10_bio->remaining);
2005 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 2076 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2006 2077
2078 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2079 tbio->bi_opf |= MD_FAILFAST;
2007 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; 2080 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2008 tbio->bi_bdev = conf->mirrors[d].rdev->bdev; 2081 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2009 generic_make_request(tbio); 2082 generic_make_request(tbio);
@@ -2109,10 +2182,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
2109 ok = rdev_set_badblocks(rdev2, addr, s, 0); 2182 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2110 if (!ok) { 2183 if (!ok) {
2111 /* just abort the recovery */ 2184 /* just abort the recovery */
2112 printk(KERN_NOTICE 2185 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2113 "md/raid10:%s: recovery aborted" 2186 mdname(mddev));
2114 " due to read error\n",
2115 mdname(mddev));
2116 2187
2117 conf->mirrors[dw].recovery_disabled 2188 conf->mirrors[dw].recovery_disabled
2118 = mddev->recovery_disabled; 2189 = mddev->recovery_disabled;
@@ -2259,14 +2330,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2259 char b[BDEVNAME_SIZE]; 2330 char b[BDEVNAME_SIZE];
2260 bdevname(rdev->bdev, b); 2331 bdevname(rdev->bdev, b);
2261 2332
2262 printk(KERN_NOTICE 2333 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2263 "md/raid10:%s: %s: Raid device exceeded " 2334 mdname(mddev), b,
2264 "read_error threshold [cur %d:max %d]\n", 2335 atomic_read(&rdev->read_errors), max_read_errors);
2265 mdname(mddev), b, 2336 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2266 atomic_read(&rdev->read_errors), max_read_errors); 2337 mdname(mddev), b);
2267 printk(KERN_NOTICE
2268 "md/raid10:%s: %s: Failing raid device\n",
2269 mdname(mddev), b);
2270 md_error(mddev, rdev); 2338 md_error(mddev, rdev);
2271 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2339 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2272 return; 2340 return;
@@ -2356,20 +2424,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2356 s, conf->tmppage, WRITE) 2424 s, conf->tmppage, WRITE)
2357 == 0) { 2425 == 0) {
2358 /* Well, this device is dead */ 2426 /* Well, this device is dead */
2359 printk(KERN_NOTICE 2427 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2360 "md/raid10:%s: read correction " 2428 mdname(mddev), s,
2361 "write failed" 2429 (unsigned long long)(
2362 " (%d sectors at %llu on %s)\n", 2430 sect +
2363 mdname(mddev), s, 2431 choose_data_offset(r10_bio,
2364 (unsigned long long)( 2432 rdev)),
2365 sect + 2433 bdevname(rdev->bdev, b));
2366 choose_data_offset(r10_bio, 2434 pr_notice("md/raid10:%s: %s: failing drive\n",
2367 rdev)), 2435 mdname(mddev),
2368 bdevname(rdev->bdev, b)); 2436 bdevname(rdev->bdev, b));
2369 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2370 "drive\n",
2371 mdname(mddev),
2372 bdevname(rdev->bdev, b));
2373 } 2437 }
2374 rdev_dec_pending(rdev, mddev); 2438 rdev_dec_pending(rdev, mddev);
2375 rcu_read_lock(); 2439 rcu_read_lock();
@@ -2397,24 +2461,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2397 READ)) { 2461 READ)) {
2398 case 0: 2462 case 0:
2399 /* Well, this device is dead */ 2463 /* Well, this device is dead */
2400 printk(KERN_NOTICE 2464 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2401 "md/raid10:%s: unable to read back "
2402 "corrected sectors"
2403 " (%d sectors at %llu on %s)\n",
2404 mdname(mddev), s, 2465 mdname(mddev), s,
2405 (unsigned long long)( 2466 (unsigned long long)(
2406 sect + 2467 sect +
2407 choose_data_offset(r10_bio, rdev)), 2468 choose_data_offset(r10_bio, rdev)),
2408 bdevname(rdev->bdev, b)); 2469 bdevname(rdev->bdev, b));
2409 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2470 pr_notice("md/raid10:%s: %s: failing drive\n",
2410 "drive\n",
2411 mdname(mddev), 2471 mdname(mddev),
2412 bdevname(rdev->bdev, b)); 2472 bdevname(rdev->bdev, b));
2413 break; 2473 break;
2414 case 1: 2474 case 1:
2415 printk(KERN_INFO 2475 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2416 "md/raid10:%s: read error corrected"
2417 " (%d sectors at %llu on %s)\n",
2418 mdname(mddev), s, 2476 mdname(mddev), s,
2419 (unsigned long long)( 2477 (unsigned long long)(
2420 sect + 2478 sect +
@@ -2503,6 +2561,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2503 char b[BDEVNAME_SIZE]; 2561 char b[BDEVNAME_SIZE];
2504 unsigned long do_sync; 2562 unsigned long do_sync;
2505 int max_sectors; 2563 int max_sectors;
2564 dev_t bio_dev;
2565 sector_t bio_last_sector;
2506 2566
2507 /* we got a read error. Maybe the drive is bad. Maybe just 2567 /* we got a read error. Maybe the drive is bad. Maybe just
2508 * the block and we can fix it. 2568 * the block and we can fix it.
@@ -2514,38 +2574,38 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2514 */ 2574 */
2515 bio = r10_bio->devs[slot].bio; 2575 bio = r10_bio->devs[slot].bio;
2516 bdevname(bio->bi_bdev, b); 2576 bdevname(bio->bi_bdev, b);
2577 bio_dev = bio->bi_bdev->bd_dev;
2578 bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
2517 bio_put(bio); 2579 bio_put(bio);
2518 r10_bio->devs[slot].bio = NULL; 2580 r10_bio->devs[slot].bio = NULL;
2519 2581
2520 if (mddev->ro == 0) { 2582 if (mddev->ro)
2583 r10_bio->devs[slot].bio = IO_BLOCKED;
2584 else if (!test_bit(FailFast, &rdev->flags)) {
2521 freeze_array(conf, 1); 2585 freeze_array(conf, 1);
2522 fix_read_error(conf, mddev, r10_bio); 2586 fix_read_error(conf, mddev, r10_bio);
2523 unfreeze_array(conf); 2587 unfreeze_array(conf);
2524 } else 2588 } else
2525 r10_bio->devs[slot].bio = IO_BLOCKED; 2589 md_error(mddev, rdev);
2526 2590
2527 rdev_dec_pending(rdev, mddev); 2591 rdev_dec_pending(rdev, mddev);
2528 2592
2529read_more: 2593read_more:
2530 rdev = read_balance(conf, r10_bio, &max_sectors); 2594 rdev = read_balance(conf, r10_bio, &max_sectors);
2531 if (rdev == NULL) { 2595 if (rdev == NULL) {
2532 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 2596 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
2533 " read error for block %llu\n", 2597 mdname(mddev), b,
2534 mdname(mddev), b, 2598 (unsigned long long)r10_bio->sector);
2535 (unsigned long long)r10_bio->sector);
2536 raid_end_bio_io(r10_bio); 2599 raid_end_bio_io(r10_bio);
2537 return; 2600 return;
2538 } 2601 }
2539 2602
2540 do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); 2603 do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
2541 slot = r10_bio->read_slot; 2604 slot = r10_bio->read_slot;
2542 printk_ratelimited( 2605 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
2543 KERN_ERR 2606 mdname(mddev),
2544 "md/raid10:%s: %s: redirecting " 2607 bdevname(rdev->bdev, b),
2545 "sector %llu to another mirror\n", 2608 (unsigned long long)r10_bio->sector);
2546 mdname(mddev),
2547 bdevname(rdev->bdev, b),
2548 (unsigned long long)r10_bio->sector);
2549 bio = bio_clone_mddev(r10_bio->master_bio, 2609 bio = bio_clone_mddev(r10_bio->master_bio,
2550 GFP_NOIO, mddev); 2610 GFP_NOIO, mddev);
2551 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); 2611 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
@@ -2555,8 +2615,15 @@ read_more:
2555 + choose_data_offset(r10_bio, rdev); 2615 + choose_data_offset(r10_bio, rdev);
2556 bio->bi_bdev = rdev->bdev; 2616 bio->bi_bdev = rdev->bdev;
2557 bio_set_op_attrs(bio, REQ_OP_READ, do_sync); 2617 bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
2618 if (test_bit(FailFast, &rdev->flags) &&
2619 test_bit(R10BIO_FailFast, &r10_bio->state))
2620 bio->bi_opf |= MD_FAILFAST;
2558 bio->bi_private = r10_bio; 2621 bio->bi_private = r10_bio;
2559 bio->bi_end_io = raid10_end_read_request; 2622 bio->bi_end_io = raid10_end_read_request;
2623 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
2624 bio, bio_dev,
2625 bio_last_sector - r10_bio->sectors);
2626
2560 if (max_sectors < r10_bio->sectors) { 2627 if (max_sectors < r10_bio->sectors) {
2561 /* Drat - have to split this up more */ 2628 /* Drat - have to split this up more */
2562 struct bio *mbio = r10_bio->master_bio; 2629 struct bio *mbio = r10_bio->master_bio;
@@ -2694,10 +2761,10 @@ static void raid10d(struct md_thread *thread)
2694 md_check_recovery(mddev); 2761 md_check_recovery(mddev);
2695 2762
2696 if (!list_empty_careful(&conf->bio_end_io_list) && 2763 if (!list_empty_careful(&conf->bio_end_io_list) &&
2697 !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 2764 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2698 LIST_HEAD(tmp); 2765 LIST_HEAD(tmp);
2699 spin_lock_irqsave(&conf->device_lock, flags); 2766 spin_lock_irqsave(&conf->device_lock, flags);
2700 if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 2767 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2701 while (!list_empty(&conf->bio_end_io_list)) { 2768 while (!list_empty(&conf->bio_end_io_list)) {
2702 list_move(conf->bio_end_io_list.prev, &tmp); 2769 list_move(conf->bio_end_io_list.prev, &tmp);
2703 conf->nr_queued--; 2770 conf->nr_queued--;
@@ -2755,7 +2822,7 @@ static void raid10d(struct md_thread *thread)
2755 } 2822 }
2756 2823
2757 cond_resched(); 2824 cond_resched();
2758 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 2825 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2759 md_check_recovery(mddev); 2826 md_check_recovery(mddev);
2760 } 2827 }
2761 blk_finish_plug(&plug); 2828 blk_finish_plug(&plug);
@@ -3072,6 +3139,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3072 bio->bi_private = r10_bio; 3139 bio->bi_private = r10_bio;
3073 bio->bi_end_io = end_sync_read; 3140 bio->bi_end_io = end_sync_read;
3074 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3141 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3142 if (test_bit(FailFast, &rdev->flags))
3143 bio->bi_opf |= MD_FAILFAST;
3075 from_addr = r10_bio->devs[j].addr; 3144 from_addr = r10_bio->devs[j].addr;
3076 bio->bi_iter.bi_sector = from_addr + 3145 bio->bi_iter.bi_sector = from_addr +
3077 rdev->data_offset; 3146 rdev->data_offset;
@@ -3160,8 +3229,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3160 if (!any_working) { 3229 if (!any_working) {
3161 if (!test_and_set_bit(MD_RECOVERY_INTR, 3230 if (!test_and_set_bit(MD_RECOVERY_INTR,
3162 &mddev->recovery)) 3231 &mddev->recovery))
3163 printk(KERN_INFO "md/raid10:%s: insufficient " 3232 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3164 "working devices for recovery.\n",
3165 mdname(mddev)); 3233 mdname(mddev));
3166 mirror->recovery_disabled 3234 mirror->recovery_disabled
3167 = mddev->recovery_disabled; 3235 = mddev->recovery_disabled;
@@ -3178,6 +3246,23 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3178 rdev_dec_pending(mrdev, mddev); 3246 rdev_dec_pending(mrdev, mddev);
3179 if (mreplace) 3247 if (mreplace)
3180 rdev_dec_pending(mreplace, mddev); 3248 rdev_dec_pending(mreplace, mddev);
3249 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3250 /* Only want this if there is elsewhere to
3251 * read from. 'j' is currently the first
3252 * readable copy.
3253 */
3254 int targets = 1;
3255 for (; j < conf->copies; j++) {
3256 int d = r10_bio->devs[j].devnum;
3257 if (conf->mirrors[d].rdev &&
3258 test_bit(In_sync,
3259 &conf->mirrors[d].rdev->flags))
3260 targets++;
3261 }
3262 if (targets == 1)
3263 r10_bio->devs[0].bio->bi_opf
3264 &= ~MD_FAILFAST;
3265 }
3181 } 3266 }
3182 if (biolist == NULL) { 3267 if (biolist == NULL) {
3183 while (r10_bio) { 3268 while (r10_bio) {
@@ -3256,6 +3341,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3256 bio->bi_private = r10_bio; 3341 bio->bi_private = r10_bio;
3257 bio->bi_end_io = end_sync_read; 3342 bio->bi_end_io = end_sync_read;
3258 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3343 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3344 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
3345 bio->bi_opf |= MD_FAILFAST;
3259 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3346 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3260 bio->bi_bdev = rdev->bdev; 3347 bio->bi_bdev = rdev->bdev;
3261 count++; 3348 count++;
@@ -3279,6 +3366,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3279 bio->bi_private = r10_bio; 3366 bio->bi_private = r10_bio;
3280 bio->bi_end_io = end_sync_write; 3367 bio->bi_end_io = end_sync_write;
3281 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3368 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3369 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
3370 bio->bi_opf |= MD_FAILFAST;
3282 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3371 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3283 bio->bi_bdev = rdev->bdev; 3372 bio->bi_bdev = rdev->bdev;
3284 count++; 3373 count++;
@@ -3489,15 +3578,14 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3489 copies = setup_geo(&geo, mddev, geo_new); 3578 copies = setup_geo(&geo, mddev, geo_new);
3490 3579
3491 if (copies == -2) { 3580 if (copies == -2) {
3492 printk(KERN_ERR "md/raid10:%s: chunk size must be " 3581 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3493 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 3582 mdname(mddev), PAGE_SIZE);
3494 mdname(mddev), PAGE_SIZE);
3495 goto out; 3583 goto out;
3496 } 3584 }
3497 3585
3498 if (copies < 2 || copies > mddev->raid_disks) { 3586 if (copies < 2 || copies > mddev->raid_disks) {
3499 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3587 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3500 mdname(mddev), mddev->new_layout); 3588 mdname(mddev), mddev->new_layout);
3501 goto out; 3589 goto out;
3502 } 3590 }
3503 3591
@@ -3557,9 +3645,6 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3557 return conf; 3645 return conf;
3558 3646
3559 out: 3647 out:
3560 if (err == -ENOMEM)
3561 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3562 mdname(mddev));
3563 if (conf) { 3648 if (conf) {
3564 mempool_destroy(conf->r10bio_pool); 3649 mempool_destroy(conf->r10bio_pool);
3565 kfree(conf->mirrors); 3650 kfree(conf->mirrors);
@@ -3656,7 +3741,7 @@ static int raid10_run(struct mddev *mddev)
3656 } 3741 }
3657 /* need to check that every block has at least one working mirror */ 3742 /* need to check that every block has at least one working mirror */
3658 if (!enough(conf, -1)) { 3743 if (!enough(conf, -1)) {
3659 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 3744 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3660 mdname(mddev)); 3745 mdname(mddev));
3661 goto out_free_conf; 3746 goto out_free_conf;
3662 } 3747 }
@@ -3698,11 +3783,9 @@ static int raid10_run(struct mddev *mddev)
3698 } 3783 }
3699 3784
3700 if (mddev->recovery_cp != MaxSector) 3785 if (mddev->recovery_cp != MaxSector)
3701 printk(KERN_NOTICE "md/raid10:%s: not clean" 3786 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3702 " -- starting background reconstruction\n", 3787 mdname(mddev));
3703 mdname(mddev)); 3788 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3704 printk(KERN_INFO
3705 "md/raid10:%s: active with %d out of %d devices\n",
3706 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 3789 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3707 conf->geo.raid_disks); 3790 conf->geo.raid_disks);
3708 /* 3791 /*
@@ -3712,6 +3795,7 @@ static int raid10_run(struct mddev *mddev)
3712 size = raid10_size(mddev, 0, 0); 3795 size = raid10_size(mddev, 0, 0);
3713 md_set_array_sectors(mddev, size); 3796 md_set_array_sectors(mddev, size);
3714 mddev->resync_max_sectors = size; 3797 mddev->resync_max_sectors = size;
3798 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3715 3799
3716 if (mddev->queue) { 3800 if (mddev->queue) {
3717 int stripe = conf->geo.raid_disks * 3801 int stripe = conf->geo.raid_disks *
@@ -3739,7 +3823,7 @@ static int raid10_run(struct mddev *mddev)
3739 3823
3740 if (max(before_length, after_length) > min_offset_diff) { 3824 if (max(before_length, after_length) > min_offset_diff) {
3741 /* This cannot work */ 3825 /* This cannot work */
3742 printk("md/raid10: offset difference not enough to continue reshape\n"); 3826 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3743 goto out_free_conf; 3827 goto out_free_conf;
3744 } 3828 }
3745 conf->offset_diff = min_offset_diff; 3829 conf->offset_diff = min_offset_diff;
@@ -3846,8 +3930,8 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3846 struct r10conf *conf; 3930 struct r10conf *conf;
3847 3931
3848 if (mddev->degraded > 0) { 3932 if (mddev->degraded > 0) {
3849 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", 3933 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
3850 mdname(mddev)); 3934 mdname(mddev));
3851 return ERR_PTR(-EINVAL); 3935 return ERR_PTR(-EINVAL);
3852 } 3936 }
3853 sector_div(size, devs); 3937 sector_div(size, devs);
@@ -3887,9 +3971,8 @@ static void *raid10_takeover(struct mddev *mddev)
3887 /* for raid0 takeover only one zone is supported */ 3971 /* for raid0 takeover only one zone is supported */
3888 raid0_conf = mddev->private; 3972 raid0_conf = mddev->private;
3889 if (raid0_conf->nr_strip_zones > 1) { 3973 if (raid0_conf->nr_strip_zones > 1) {
3890 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" 3974 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
3891 " with more than one zone.\n", 3975 mdname(mddev));
3892 mdname(mddev));
3893 return ERR_PTR(-EINVAL); 3976 return ERR_PTR(-EINVAL);
3894 } 3977 }
3895 return raid10_takeover_raid0(mddev, 3978 return raid10_takeover_raid0(mddev,
@@ -4078,8 +4161,8 @@ static int raid10_start_reshape(struct mddev *mddev)
4078 sector_t size = raid10_size(mddev, 0, 0); 4161 sector_t size = raid10_size(mddev, 0, 0);
4079 if (size < mddev->array_sectors) { 4162 if (size < mddev->array_sectors) {
4080 spin_unlock_irq(&conf->device_lock); 4163 spin_unlock_irq(&conf->device_lock);
4081 printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n", 4164 pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
4082 mdname(mddev)); 4165 mdname(mddev));
4083 return -EINVAL; 4166 return -EINVAL;
4084 } 4167 }
4085 mddev->resync_max_sectors = size; 4168 mddev->resync_max_sectors = size;
@@ -4126,7 +4209,7 @@ static int raid10_start_reshape(struct mddev *mddev)
4126 spin_unlock_irq(&conf->device_lock); 4209 spin_unlock_irq(&conf->device_lock);
4127 mddev->raid_disks = conf->geo.raid_disks; 4210 mddev->raid_disks = conf->geo.raid_disks;
4128 mddev->reshape_position = conf->reshape_progress; 4211 mddev->reshape_position = conf->reshape_progress;
4129 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4212 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4130 4213
4131 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4214 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4132 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4215 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4321,9 +4404,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4321 else 4404 else
4322 mddev->curr_resync_completed = conf->reshape_progress; 4405 mddev->curr_resync_completed = conf->reshape_progress;
4323 conf->reshape_checkpoint = jiffies; 4406 conf->reshape_checkpoint = jiffies;
4324 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4407 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4325 md_wakeup_thread(mddev->thread); 4408 md_wakeup_thread(mddev->thread);
4326 wait_event(mddev->sb_wait, mddev->flags == 0 || 4409 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4327 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4410 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4328 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4411 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4329 allow_barrier(conf); 4412 allow_barrier(conf);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 18ec1f7a98bf..3162615e57bd 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -156,5 +156,7 @@ enum r10bio_state {
156 * flag is set 156 * flag is set
157 */ 157 */
158 R10BIO_Previous, 158 R10BIO_Previous,
159/* failfast devices did receive failfast requests. */
160 R10BIO_FailFast,
159}; 161};
160#endif 162#endif
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 8491edcfb5a6..d7bfb6fc8aef 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2015 Shaohua Li <shli@fb.com> 2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify it 5 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 6 * under the terms and conditions of the GNU General Public License,
@@ -18,8 +19,10 @@
18#include <linux/raid/md_p.h> 19#include <linux/raid/md_p.h>
19#include <linux/crc32c.h> 20#include <linux/crc32c.h>
20#include <linux/random.h> 21#include <linux/random.h>
22#include <linux/kthread.h>
21#include "md.h" 23#include "md.h"
22#include "raid5.h" 24#include "raid5.h"
25#include "bitmap.h"
23 26
24/* 27/*
25 * metadata/data stored in disk with 4k size unit (a block) regardless 28 * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -28,18 +31,70 @@
28#define BLOCK_SECTORS (8) 31#define BLOCK_SECTORS (8)
29 32
30/* 33/*
31 * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent 34 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
32 * recovery scans a very long log 35 *
36 * In write through mode, the reclaim runs every log->max_free_space.
37 * This can prevent the recovery scans for too long
33 */ 38 */
34#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 39#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35#define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 40#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
36 41
42/* wake up reclaim thread periodically */
43#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
44/* start flush with these full stripes */
45#define R5C_FULL_STRIPE_FLUSH_BATCH 256
46/* reclaim stripes in groups */
47#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
48
37/* 49/*
38 * We only need 2 bios per I/O unit to make progress, but ensure we 50 * We only need 2 bios per I/O unit to make progress, but ensure we
39 * have a few more available to not get too tight. 51 * have a few more available to not get too tight.
40 */ 52 */
41#define R5L_POOL_SIZE 4 53#define R5L_POOL_SIZE 4
42 54
55/*
56 * r5c journal modes of the array: write-back or write-through.
57 * write-through mode has identical behavior as existing log only
58 * implementation.
59 */
60enum r5c_journal_mode {
61 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
62 R5C_JOURNAL_MODE_WRITE_BACK = 1,
63};
64
65static char *r5c_journal_mode_str[] = {"write-through",
66 "write-back"};
67/*
68 * raid5 cache state machine
69 *
70 * With the RAID cache, each stripe works in two phases:
71 * - caching phase
72 * - writing-out phase
73 *
74 * These two phases are controlled by bit STRIPE_R5C_CACHING:
75 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
76 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
77 *
78 * When there is no journal, or the journal is in write-through mode,
79 * the stripe is always in writing-out phase.
80 *
81 * For write-back journal, the stripe is sent to caching phase on write
82 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
83 * the write-out phase by clearing STRIPE_R5C_CACHING.
84 *
85 * Stripes in caching phase do not write the raid disks. Instead, all
86 * writes are committed from the log device. Therefore, a stripe in
87 * caching phase handles writes as:
88 * - write to log device
89 * - return IO
90 *
91 * Stripes in writing-out phase handle writes as:
92 * - calculate parity
93 * - write pending data and parity to journal
94 * - write data and parity to raid disks
95 * - return IO for pending writes
96 */
97
43struct r5l_log { 98struct r5l_log {
44 struct md_rdev *rdev; 99 struct md_rdev *rdev;
45 100
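The comment block added above describes the per-stripe state machine: with STRIPE_R5C_CACHING set, writes only reach the journal; with it clear, the normal parity and write-out path runs; and write-through mode (or no journal at all) pins every stripe to write-out. A compilable sketch of that phase decision, with illustrative names only:

	#include <stdbool.h>
	#include <stdio.h>

	enum sketch_journal_mode {
		SKETCH_WRITE_THROUGH,
		SKETCH_WRITE_BACK,
	};

	struct sketch_stripe {
		bool r5c_caching;   /* stand-in for STRIPE_R5C_CACHING */
	};

	/* A stripe is in the caching phase only with a write-back journal and
	 * the caching bit set; everything else is the writing-out phase. */
	static const char *sketch_phase(bool has_journal,
					enum sketch_journal_mode mode,
					const struct sketch_stripe *sh)
	{
		if (!has_journal || mode == SKETCH_WRITE_THROUGH)
			return "writing-out";
		return sh->r5c_caching ? "caching" : "writing-out";
	}

	int main(void)
	{
		struct sketch_stripe cached = { .r5c_caching = true };
		struct sketch_stripe flushed = { .r5c_caching = false };

		printf("%s\n", sketch_phase(true, SKETCH_WRITE_BACK, &cached));    /* caching */
		printf("%s\n", sketch_phase(true, SKETCH_WRITE_BACK, &flushed));   /* writing-out */
		printf("%s\n", sketch_phase(true, SKETCH_WRITE_THROUGH, &cached)); /* writing-out */
		return 0;
	}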
@@ -58,7 +113,6 @@ struct r5l_log {
58 u64 seq; /* log head sequence */ 113 u64 seq; /* log head sequence */
59 114
60 sector_t next_checkpoint; 115 sector_t next_checkpoint;
61 u64 next_cp_seq;
62 116
63 struct mutex io_mutex; 117 struct mutex io_mutex;
64 struct r5l_io_unit *current_io; /* current io_unit accepting new data */ 118 struct r5l_io_unit *current_io; /* current io_unit accepting new data */
@@ -96,6 +150,18 @@ struct r5l_log {
96 spinlock_t no_space_stripes_lock; 150 spinlock_t no_space_stripes_lock;
97 151
98 bool need_cache_flush; 152 bool need_cache_flush;
153
154 /* for r5c_cache */
155 enum r5c_journal_mode r5c_journal_mode;
156
157 /* all stripes in r5cache, in the order of seq at sh->log_start */
158 struct list_head stripe_in_journal_list;
159
160 spinlock_t stripe_in_journal_lock;
161 atomic_t stripe_in_journal_count;
162
163 /* to submit async io_units, to fulfill ordering of flush */
164 struct work_struct deferred_io_work;
99}; 165};
100 166
101/* 167/*
@@ -122,6 +188,18 @@ struct r5l_io_unit {
122 188
123 int state; 189 int state;
124 bool need_split_bio; 190 bool need_split_bio;
191 struct bio *split_bio;
192
193 unsigned int has_flush:1; /* include flush request */
194 unsigned int has_fua:1; /* include fua request */
195 unsigned int has_null_flush:1; /* include empty flush request */
196 /*
197 * io isn't sent yet, flush/fua request can only be submitted till it's
198 * the first IO in running_ios list
199 */
200 unsigned int io_deferred:1;
201
202 struct bio_list flush_barriers; /* size == 0 flush bios */
125}; 203};
126 204
127/* r5l_io_unit state */ 205/* r5l_io_unit state */
@@ -133,6 +211,12 @@ enum r5l_io_unit_state {
133 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 211 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
134}; 212};
135 213
214bool r5c_is_writeback(struct r5l_log *log)
215{
216 return (log != NULL &&
217 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
218}
219
136static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 220static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
137{ 221{
138 start += inc; 222 start += inc;
@@ -168,12 +252,235 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
168 io->state = state; 252 io->state = state;
169} 253}
170 254
255static void
256r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
257 struct bio_list *return_bi)
258{
259 struct bio *wbi, *wbi2;
260
261 wbi = dev->written;
262 dev->written = NULL;
263 while (wbi && wbi->bi_iter.bi_sector <
264 dev->sector + STRIPE_SECTORS) {
265 wbi2 = r5_next_bio(wbi, dev->sector);
266 if (!raid5_dec_bi_active_stripes(wbi)) {
267 md_write_end(conf->mddev);
268 bio_list_add(return_bi, wbi);
269 }
270 wbi = wbi2;
271 }
272}
273
274void r5c_handle_cached_data_endio(struct r5conf *conf,
275 struct stripe_head *sh, int disks, struct bio_list *return_bi)
276{
277 int i;
278
279 for (i = sh->disks; i--; ) {
280 if (sh->dev[i].written) {
281 set_bit(R5_UPTODATE, &sh->dev[i].flags);
282 r5c_return_dev_pending_writes(conf, &sh->dev[i],
283 return_bi);
284 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
285 STRIPE_SECTORS,
286 !test_bit(STRIPE_DEGRADED, &sh->state),
287 0);
288 }
289 }
290}
291
292/* Check whether we should flush some stripes to free up stripe cache */
293void r5c_check_stripe_cache_usage(struct r5conf *conf)
294{
295 int total_cached;
296
297 if (!r5c_is_writeback(conf->log))
298 return;
299
300 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
301 atomic_read(&conf->r5c_cached_full_stripes);
302
303 /*
304 * The following condition is true for either of the following:
305 * - stripe cache pressure high:
306 * total_cached > 3/4 min_nr_stripes ||
307 * empty_inactive_list_nr > 0
308 * - stripe cache pressure moderate:
309 * total_cached > 1/2 min_nr_stripes
310 */
311 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
312 atomic_read(&conf->empty_inactive_list_nr) > 0)
313 r5l_wake_reclaim(conf->log, 0);
314}
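r5c_check_stripe_cache_usage() above wakes the reclaim thread as soon as the cached stripes (partial plus full) exceed half of min_nr_stripes, or whenever an inactive list has run empty. A compilable sketch of that predicate with a worked example in main(); names and numbers are illustrative:

	#include <stdbool.h>
	#include <stdio.h>

	/* Wake-up condition from the function above: moderate cache pressure
	 * (more than half of min_nr_stripes cached) or any empty inactive
	 * list triggers reclaim. */
	static bool sketch_should_reclaim(int cached_partial, int cached_full,
					  int min_nr_stripes, int empty_inactive_lists)
	{
		int total_cached = cached_partial + cached_full;

		return total_cached > min_nr_stripes / 2 || empty_inactive_lists > 0;
	}

	int main(void)
	{
		/* 80 + 60 = 140 cached with min_nr_stripes = 256: 140 > 128, reclaim */
		printf("%d\n", sketch_should_reclaim(80, 60, 256, 0));
		/* only 40 cached, but an inactive list is empty: reclaim anyway */
		printf("%d\n", sketch_should_reclaim(20, 20, 256, 1));
		return 0;
	}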
315
316/*
317 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
318 * stripes in the cache
319 */
320void r5c_check_cached_full_stripe(struct r5conf *conf)
321{
322 if (!r5c_is_writeback(conf->log))
323 return;
324
325 /*
326 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
327 * or a full stripe (chunk size / 4k stripes).
328 */
329 if (atomic_read(&conf->r5c_cached_full_stripes) >=
330 min(R5C_FULL_STRIPE_FLUSH_BATCH,
331 conf->chunk_sectors >> STRIPE_SHIFT))
332 r5l_wake_reclaim(conf->log, 0);
333}
334
335/*
336 * Total log space (in sectors) needed to flush all data in cache
337 *
338 * Currently, writing-out phase automatically includes all pending writes
339 * to the same sector. So the reclaim of each stripe takes up to
340 * (conf->raid_disks + 1) pages of log space.
341 *
342 * To totally avoid deadlock due to log space, the code reserves
343 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
344 * necessary in most cases.
345 *
346 * To improve this, we will need writing-out phase to be able to NOT include
347 * pending writes, which will reduce the requirement to
348 * (conf->max_degraded + 1) pages per stripe in cache.
349 */
350static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
351{
352 struct r5l_log *log = conf->log;
353
354 if (!r5c_is_writeback(log))
355 return 0;
356
357 return BLOCK_SECTORS * (conf->raid_disks + 1) *
358 atomic_read(&log->stripe_in_journal_count);
359}
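
To make the reservation above concrete, a small user-space sketch of the same formula follows; BLOCK_SECTORS is 8 assuming 4KiB pages over 512-byte sectors, and the array geometry and stripe count are invented example numbers, not values from this patch.

/*
 * Standalone sketch (not kernel code): journal space reserved to flush
 * the write-back cache, per the comment above.
 */
#include <stdio.h>

#define BLOCK_SECTORS 8	/* PAGE_SIZE >> 9, assuming 4KiB pages */

static unsigned long long log_required_to_flush_cache(int raid_disks,
						       int stripes_in_journal)
{
	/* one page per member disk plus one meta page, per cached stripe */
	return (unsigned long long)BLOCK_SECTORS * (raid_disks + 1) *
	       stripes_in_journal;
}

int main(void)
{
	/* example: a 6-disk array with 100 stripes in the journal reserves
	 * 8 * 7 * 100 = 5600 sectors, i.e. about 2.7 MiB of log space */
	printf("%llu sectors\n", log_required_to_flush_cache(6, 100));
	return 0;
}
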
360
361/*
362 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
363 *
364 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
365 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
366 * device is less than 2x of reclaim_required_space.
367 */
368static inline void r5c_update_log_state(struct r5l_log *log)
369{
370 struct r5conf *conf = log->rdev->mddev->private;
371 sector_t free_space;
372 sector_t reclaim_space;
373 bool wake_reclaim = false;
374
375 if (!r5c_is_writeback(log))
376 return;
377
378 free_space = r5l_ring_distance(log, log->log_start,
379 log->last_checkpoint);
380 reclaim_space = r5c_log_required_to_flush_cache(conf);
381 if (free_space < 2 * reclaim_space)
382 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
383 else {
384 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
385 wake_reclaim = true;
386 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
387 }
388 if (free_space < 3 * reclaim_space)
389 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
390 else
391 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
392
393 if (wake_reclaim)
394 r5l_wake_reclaim(log, 0);
395}
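
The 2x/3x thresholds described in the comment above can be exercised in isolation; the sketch below mirrors that policy in user space with made-up numbers (note that CRITICAL implies TIGHT, since both bits end up set once free space drops below 2x of reclaim_required_space).

/*
 * Standalone sketch (not kernel code) of the R5C_LOG_TIGHT /
 * R5C_LOG_CRITICAL policy: TIGHT below 3x the space needed to flush the
 * cache, CRITICAL below 2x. All inputs are example values.
 */
#include <stdbool.h>
#include <stdio.h>

static void update_log_state(unsigned long long free_space,
			     unsigned long long reclaim_space,
			     bool *tight, bool *critical)
{
	*critical = free_space < 2 * reclaim_space;
	*tight = free_space < 3 * reclaim_space;	/* critical implies tight */
}

int main(void)
{
	bool tight, critical;

	update_log_state(10000, 6000, &tight, &critical);
	printf("tight=%d critical=%d\n", tight, critical);	/* 1 1 */
	update_log_state(16000, 6000, &tight, &critical);
	printf("tight=%d critical=%d\n", tight, critical);	/* 1 0 */
	update_log_state(20000, 6000, &tight, &critical);
	printf("tight=%d critical=%d\n", tight, critical);	/* 0 0 */
	return 0;
}
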
396
397/*
398 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
399 * This function should only be called in write-back mode.
400 */
401void r5c_make_stripe_write_out(struct stripe_head *sh)
402{
403 struct r5conf *conf = sh->raid_conf;
404 struct r5l_log *log = conf->log;
405
406 BUG_ON(!r5c_is_writeback(log));
407
408 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
409 clear_bit(STRIPE_R5C_CACHING, &sh->state);
410
411 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
412 atomic_inc(&conf->preread_active_stripes);
413
414 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
415 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
416 atomic_dec(&conf->r5c_cached_partial_stripes);
417 }
418
419 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
420 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
421 atomic_dec(&conf->r5c_cached_full_stripes);
422 }
423}
424
425static void r5c_handle_data_cached(struct stripe_head *sh)
426{
427 int i;
428
429 for (i = sh->disks; i--; )
430 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
431 set_bit(R5_InJournal, &sh->dev[i].flags);
432 clear_bit(R5_LOCKED, &sh->dev[i].flags);
433 }
434 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
435}
436
437/*
438 * this journal write must contain full parity,
439 * it may also contain some data pages
440 */
441static void r5c_handle_parity_cached(struct stripe_head *sh)
442{
443 int i;
444
445 for (i = sh->disks; i--; )
446 if (test_bit(R5_InJournal, &sh->dev[i].flags))
447 set_bit(R5_Wantwrite, &sh->dev[i].flags);
448}
449
450/*
451 * Setting proper flags after writing (or flushing) data and/or parity to the
452 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
453 */
454static void r5c_finish_cache_stripe(struct stripe_head *sh)
455{
456 struct r5l_log *log = sh->raid_conf->log;
457
458 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
459 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
460 /*
461 * Set R5_InJournal for parity dev[pd_idx]. This means
462 * all data AND parity are in the journal. For RAID 6, it is
463 * NOT necessary to set the flag for dev[qd_idx], as the
464 * two parities are written out together.
465 */
466 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
467 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
468 r5c_handle_data_cached(sh);
469 } else {
470 r5c_handle_parity_cached(sh);
471 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
472 }
473}
474
171static void r5l_io_run_stripes(struct r5l_io_unit *io) 475static void r5l_io_run_stripes(struct r5l_io_unit *io)
172{ 476{
173 struct stripe_head *sh, *next; 477 struct stripe_head *sh, *next;
174 478
175 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 479 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
176 list_del_init(&sh->log_list); 480 list_del_init(&sh->log_list);
481
482 r5c_finish_cache_stripe(sh);
483
177 set_bit(STRIPE_HANDLE, &sh->state); 484 set_bit(STRIPE_HANDLE, &sh->state);
178 raid5_release_stripe(sh); 485 raid5_release_stripe(sh);
179 } 486 }
@@ -209,9 +516,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
209 } 516 }
210} 517}
211 518
519static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
212static void r5l_log_endio(struct bio *bio) 520static void r5l_log_endio(struct bio *bio)
213{ 521{
214 struct r5l_io_unit *io = bio->bi_private; 522 struct r5l_io_unit *io = bio->bi_private;
523 struct r5l_io_unit *io_deferred;
215 struct r5l_log *log = io->log; 524 struct r5l_log *log = io->log;
216 unsigned long flags; 525 unsigned long flags;
217 526
@@ -227,18 +536,89 @@ static void r5l_log_endio(struct bio *bio)
227 r5l_move_to_end_ios(log); 536 r5l_move_to_end_ios(log);
228 else 537 else
229 r5l_log_run_stripes(log); 538 r5l_log_run_stripes(log);
539 if (!list_empty(&log->running_ios)) {
540 /*
541 * FLUSH/FUA io_unit is deferred because of ordering, now we
542 * can dispatch it
543 */
544 io_deferred = list_first_entry(&log->running_ios,
545 struct r5l_io_unit, log_sibling);
546 if (io_deferred->io_deferred)
547 schedule_work(&log->deferred_io_work);
548 }
549
230 spin_unlock_irqrestore(&log->io_list_lock, flags); 550 spin_unlock_irqrestore(&log->io_list_lock, flags);
231 551
232 if (log->need_cache_flush) 552 if (log->need_cache_flush)
233 md_wakeup_thread(log->rdev->mddev->thread); 553 md_wakeup_thread(log->rdev->mddev->thread);
554
555 if (io->has_null_flush) {
556 struct bio *bi;
557
558 WARN_ON(bio_list_empty(&io->flush_barriers));
559 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
560 bio_endio(bi);
561 atomic_dec(&io->pending_stripe);
562 }
563 if (atomic_read(&io->pending_stripe) == 0)
564 __r5l_stripe_write_finished(io);
565 }
566}
567
568static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
569{
570 unsigned long flags;
571
572 spin_lock_irqsave(&log->io_list_lock, flags);
573 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
574 spin_unlock_irqrestore(&log->io_list_lock, flags);
575
576 if (io->has_flush)
577 io->current_bio->bi_opf |= REQ_PREFLUSH;
578 if (io->has_fua)
579 io->current_bio->bi_opf |= REQ_FUA;
580 submit_bio(io->current_bio);
581
582 if (!io->split_bio)
583 return;
584
585 if (io->has_flush)
586 io->split_bio->bi_opf |= REQ_PREFLUSH;
587 if (io->has_fua)
588 io->split_bio->bi_opf |= REQ_FUA;
589 submit_bio(io->split_bio);
590}
591
592/* deferred io_unit will be dispatched here */
593static void r5l_submit_io_async(struct work_struct *work)
594{
595 struct r5l_log *log = container_of(work, struct r5l_log,
596 deferred_io_work);
597 struct r5l_io_unit *io = NULL;
598 unsigned long flags;
599
600 spin_lock_irqsave(&log->io_list_lock, flags);
601 if (!list_empty(&log->running_ios)) {
602 io = list_first_entry(&log->running_ios, struct r5l_io_unit,
603 log_sibling);
604 if (!io->io_deferred)
605 io = NULL;
606 else
607 io->io_deferred = 0;
608 }
609 spin_unlock_irqrestore(&log->io_list_lock, flags);
610 if (io)
611 r5l_do_submit_io(log, io);
234} 612}
235 613
236static void r5l_submit_current_io(struct r5l_log *log) 614static void r5l_submit_current_io(struct r5l_log *log)
237{ 615{
238 struct r5l_io_unit *io = log->current_io; 616 struct r5l_io_unit *io = log->current_io;
617 struct bio *bio;
239 struct r5l_meta_block *block; 618 struct r5l_meta_block *block;
240 unsigned long flags; 619 unsigned long flags;
241 u32 crc; 620 u32 crc;
621 bool do_submit = true;
242 622
243 if (!io) 623 if (!io)
244 return; 624 return;
@@ -247,13 +627,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
247 block->meta_size = cpu_to_le32(io->meta_offset); 627 block->meta_size = cpu_to_le32(io->meta_offset);
248 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 628 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
249 block->checksum = cpu_to_le32(crc); 629 block->checksum = cpu_to_le32(crc);
630 bio = io->current_bio;
250 631
251 log->current_io = NULL; 632 log->current_io = NULL;
252 spin_lock_irqsave(&log->io_list_lock, flags); 633 spin_lock_irqsave(&log->io_list_lock, flags);
253 __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 634 if (io->has_flush || io->has_fua) {
635 if (io != list_first_entry(&log->running_ios,
636 struct r5l_io_unit, log_sibling)) {
637 io->io_deferred = 1;
638 do_submit = false;
639 }
640 }
254 spin_unlock_irqrestore(&log->io_list_lock, flags); 641 spin_unlock_irqrestore(&log->io_list_lock, flags);
255 642 if (do_submit)
256 submit_bio(io->current_bio); 643 r5l_do_submit_io(log, io);
257} 644}
258 645
259static struct bio *r5l_bio_alloc(struct r5l_log *log) 646static struct bio *r5l_bio_alloc(struct r5l_log *log)
@@ -271,6 +658,7 @@ static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
271{ 658{
272 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 659 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
273 660
661 r5c_update_log_state(log);
274 /* 662 /*
275 * If we filled up the log device start from the beginning again, 663 * If we filled up the log device start from the beginning again,
276 * which will require a new bio. 664 * which will require a new bio.
@@ -297,6 +685,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
297 io->log = log; 685 io->log = log;
298 INIT_LIST_HEAD(&io->log_sibling); 686 INIT_LIST_HEAD(&io->log_sibling);
299 INIT_LIST_HEAD(&io->stripe_list); 687 INIT_LIST_HEAD(&io->stripe_list);
688 bio_list_init(&io->flush_barriers);
300 io->state = IO_UNIT_RUNNING; 689 io->state = IO_UNIT_RUNNING;
301 690
302 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 691 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -367,12 +756,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
367 struct r5l_io_unit *io = log->current_io; 756 struct r5l_io_unit *io = log->current_io;
368 757
369 if (io->need_split_bio) { 758 if (io->need_split_bio) {
370 struct bio *prev = io->current_bio; 759 BUG_ON(io->split_bio);
371 760 io->split_bio = io->current_bio;
372 io->current_bio = r5l_bio_alloc(log); 761 io->current_bio = r5l_bio_alloc(log);
373 bio_chain(io->current_bio, prev); 762 bio_chain(io->current_bio, io->split_bio);
374 763 io->need_split_bio = false;
375 submit_bio(prev);
376 } 764 }
377 765
378 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 766 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
@@ -401,50 +789,85 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
401 789
402 io = log->current_io; 790 io = log->current_io;
403 791
792 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
793 io->has_flush = 1;
794
404 for (i = 0; i < sh->disks; i++) { 795 for (i = 0; i < sh->disks; i++) {
405 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 796 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
797 test_bit(R5_InJournal, &sh->dev[i].flags))
406 continue; 798 continue;
407 if (i == sh->pd_idx || i == sh->qd_idx) 799 if (i == sh->pd_idx || i == sh->qd_idx)
408 continue; 800 continue;
801 if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
802 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
803 io->has_fua = 1;
804 /*
805 * we need to flush the journal to make sure recovery can
806 * reach the data with the fua flag
807 */
808 io->has_flush = 1;
809 }
409 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 810 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
410 raid5_compute_blocknr(sh, i, 0), 811 raid5_compute_blocknr(sh, i, 0),
411 sh->dev[i].log_checksum, 0, false); 812 sh->dev[i].log_checksum, 0, false);
412 r5l_append_payload_page(log, sh->dev[i].page); 813 r5l_append_payload_page(log, sh->dev[i].page);
413 } 814 }
414 815
415 if (sh->qd_idx >= 0) { 816 if (parity_pages == 2) {
416 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 817 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
417 sh->sector, sh->dev[sh->pd_idx].log_checksum, 818 sh->sector, sh->dev[sh->pd_idx].log_checksum,
418 sh->dev[sh->qd_idx].log_checksum, true); 819 sh->dev[sh->qd_idx].log_checksum, true);
419 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 820 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
420 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 821 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
421 } else { 822 } else if (parity_pages == 1) {
422 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 823 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
423 sh->sector, sh->dev[sh->pd_idx].log_checksum, 824 sh->sector, sh->dev[sh->pd_idx].log_checksum,
424 0, false); 825 0, false);
425 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 826 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
426 } 827 } else /* Just writing data, not parity, in caching phase */
828 BUG_ON(parity_pages != 0);
427 829
428 list_add_tail(&sh->log_list, &io->stripe_list); 830 list_add_tail(&sh->log_list, &io->stripe_list);
429 atomic_inc(&io->pending_stripe); 831 atomic_inc(&io->pending_stripe);
430 sh->log_io = io; 832 sh->log_io = io;
431 833
834 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
835 return 0;
836
837 if (sh->log_start == MaxSector) {
838 BUG_ON(!list_empty(&sh->r5c));
839 sh->log_start = io->log_start;
840 spin_lock_irq(&log->stripe_in_journal_lock);
841 list_add_tail(&sh->r5c,
842 &log->stripe_in_journal_list);
843 spin_unlock_irq(&log->stripe_in_journal_lock);
844 atomic_inc(&log->stripe_in_journal_count);
845 }
432 return 0; 846 return 0;
433} 847}
434 848
435static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); 849/* add stripe to no_space_stripes, and then wake up reclaim */
850static inline void r5l_add_no_space_stripe(struct r5l_log *log,
851 struct stripe_head *sh)
852{
853 spin_lock(&log->no_space_stripes_lock);
854 list_add_tail(&sh->log_list, &log->no_space_stripes);
855 spin_unlock(&log->no_space_stripes_lock);
856}
857
436/* 858/*
437 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 859 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
438 * data from log to raid disks), so we shouldn't wait for reclaim here 860 * data from log to raid disks), so we shouldn't wait for reclaim here
439 */ 861 */
440int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 862int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
441{ 863{
864 struct r5conf *conf = sh->raid_conf;
442 int write_disks = 0; 865 int write_disks = 0;
443 int data_pages, parity_pages; 866 int data_pages, parity_pages;
444 int meta_size;
445 int reserve; 867 int reserve;
446 int i; 868 int i;
447 int ret = 0; 869 int ret = 0;
870 bool wake_reclaim = false;
448 871
449 if (!log) 872 if (!log)
450 return -EAGAIN; 873 return -EAGAIN;
@@ -456,11 +879,15 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
456 return -EAGAIN; 879 return -EAGAIN;
457 } 880 }
458 881
882 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
883
459 for (i = 0; i < sh->disks; i++) { 884 for (i = 0; i < sh->disks; i++) {
460 void *addr; 885 void *addr;
461 886
462 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 887 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
888 test_bit(R5_InJournal, &sh->dev[i].flags))
463 continue; 889 continue;
890
464 write_disks++; 891 write_disks++;
465 /* checksum is already calculated in last run */ 892 /* checksum is already calculated in last run */
466 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 893 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
@@ -473,15 +900,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
473 parity_pages = 1 + !!(sh->qd_idx >= 0); 900 parity_pages = 1 + !!(sh->qd_idx >= 0);
474 data_pages = write_disks - parity_pages; 901 data_pages = write_disks - parity_pages;
475 902
476 meta_size =
477 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
478 * data_pages) +
479 sizeof(struct r5l_payload_data_parity) +
480 sizeof(__le32) * parity_pages;
481 /* Doesn't work with very big raid array */
482 if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
483 return -EINVAL;
484
485 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 903 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
486 /* 904 /*
487 * The stripe must enter state machine again to finish the write, so 905 * The stripe must enter state machine again to finish the write, so
@@ -493,22 +911,49 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
493 mutex_lock(&log->io_mutex); 911 mutex_lock(&log->io_mutex);
494 /* meta + data */ 912 /* meta + data */
495 reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 913 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
496 if (!r5l_has_free_space(log, reserve)) {
497 spin_lock(&log->no_space_stripes_lock);
498 list_add_tail(&sh->log_list, &log->no_space_stripes);
499 spin_unlock(&log->no_space_stripes_lock);
500 914
501 r5l_wake_reclaim(log, reserve); 915 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
502 } else { 916 if (!r5l_has_free_space(log, reserve)) {
503 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 917 r5l_add_no_space_stripe(log, sh);
504 if (ret) { 918 wake_reclaim = true;
505 spin_lock_irq(&log->io_list_lock); 919 } else {
506 list_add_tail(&sh->log_list, &log->no_mem_stripes); 920 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
507 spin_unlock_irq(&log->io_list_lock); 921 if (ret) {
922 spin_lock_irq(&log->io_list_lock);
923 list_add_tail(&sh->log_list,
924 &log->no_mem_stripes);
925 spin_unlock_irq(&log->io_list_lock);
926 }
927 }
928 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */
929 /*
930 * log space critical, do not process stripes that are
931 * not in cache yet (sh->log_start == MaxSector).
932 */
933 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
934 sh->log_start == MaxSector) {
935 r5l_add_no_space_stripe(log, sh);
936 wake_reclaim = true;
937 reserve = 0;
938 } else if (!r5l_has_free_space(log, reserve)) {
939 if (sh->log_start == log->last_checkpoint)
940 BUG();
941 else
942 r5l_add_no_space_stripe(log, sh);
943 } else {
944 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
945 if (ret) {
946 spin_lock_irq(&log->io_list_lock);
947 list_add_tail(&sh->log_list,
948 &log->no_mem_stripes);
949 spin_unlock_irq(&log->io_list_lock);
950 }
508 } 951 }
509 } 952 }
510 953
511 mutex_unlock(&log->io_mutex); 954 mutex_unlock(&log->io_mutex);
955 if (wake_reclaim)
956 r5l_wake_reclaim(log, reserve);
512 return 0; 957 return 0;
513} 958}
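
The reservation taken under io_mutex above is simply pages converted to 512-byte sectors; a minimal sketch of the arithmetic, assuming 4KiB pages (PAGE_SHIFT = 12) and an invented disk count:

/*
 * Standalone sketch (not kernel code): the "meta + data" reservation in
 * r5l_write_stripe().
 */
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumes 4KiB pages */

int main(void)
{
	int write_disks = 5;	/* example: 4 data blocks plus 1 parity block */
	int reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

	/* 6 pages (1 meta page + 5 payload pages) = 48 sectors */
	printf("reserve = %d sectors\n", reserve);
	return 0;
}
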
514 959
@@ -525,17 +970,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
525{ 970{
526 if (!log) 971 if (!log)
527 return -ENODEV; 972 return -ENODEV;
528 /* 973
529 * we flush log disk cache first, then write stripe data to raid disks. 974 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
530 * So if bio is finished, the log disk cache is flushed already. The 975 /*
531 * recovery guarantees we can recovery the bio from log disk, so we 976 * in write through (journal only)
532 * don't need to flush again 977 * we flush log disk cache first, then write stripe data to
533 */ 978 * raid disks. So if bio is finished, the log disk cache is
534 if (bio->bi_iter.bi_size == 0) { 979 * flushed already. The recovery guarantees we can recover
535 bio_endio(bio); 980 * the bio from log disk, so we don't need to flush again
536 return 0; 981 */
982 if (bio->bi_iter.bi_size == 0) {
983 bio_endio(bio);
984 return 0;
985 }
986 bio->bi_opf &= ~REQ_PREFLUSH;
987 } else {
988 /* write back (with cache) */
989 if (bio->bi_iter.bi_size == 0) {
990 mutex_lock(&log->io_mutex);
991 r5l_get_meta(log, 0);
992 bio_list_add(&log->current_io->flush_barriers, bio);
993 log->current_io->has_flush = 1;
994 log->current_io->has_null_flush = 1;
995 atomic_inc(&log->current_io->pending_stripe);
996 r5l_submit_current_io(log);
997 mutex_unlock(&log->io_mutex);
998 return 0;
999 }
537 } 1000 }
538 bio->bi_opf &= ~REQ_PREFLUSH;
539 return -EAGAIN; 1001 return -EAGAIN;
540} 1002}
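
The two branches above implement different flush semantics; the sketch below restates the decision in user space. The -EAGAIN result and the two journal modes mirror the patch, while the helper itself and its constants are only an illustration.

/*
 * Standalone sketch (not kernel code) of r5l_handle_flush_request():
 * write-through completes an empty flush immediately (the journal is
 * flushed before data reaches the raid disks), write-back queues it as a
 * null-flush barrier on the current io_unit; a flush that also carries
 * data is handed back to the normal submission path.
 */
#include <stdbool.h>
#include <stdio.h>

#define EX_EAGAIN 11	/* stand-in for the kernel's EAGAIN */

enum journal_mode { MODE_WRITE_THROUGH, MODE_WRITE_BACK };

static int handle_flush(enum journal_mode mode, bool bio_has_data,
			bool *strip_preflush, bool *queue_null_flush)
{
	*strip_preflush = false;
	*queue_null_flush = false;

	if (!bio_has_data) {
		if (mode == MODE_WRITE_BACK)
			*queue_null_flush = true;	/* barrier on current io_unit */
		return 0;				/* handled here */
	}
	if (mode == MODE_WRITE_THROUGH)
		*strip_preflush = true;	/* log flush already orders the data */
	return -EX_EAGAIN;		/* caller submits the bio normally */
}

int main(void)
{
	bool strip, null_flush;

	printf("%d\n", handle_flush(MODE_WRITE_THROUGH, false, &strip, &null_flush)); /* 0 */
	printf("%d\n", handle_flush(MODE_WRITE_BACK, true, &strip, &null_flush));     /* -11 */
	return 0;
}
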
541 1003
@@ -555,10 +1017,40 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
555 spin_unlock(&log->no_space_stripes_lock); 1017 spin_unlock(&log->no_space_stripes_lock);
556} 1018}
557 1019
1020/*
1021 * calculate new last_checkpoint
1022 * for write through mode, returns log->next_checkpoint
1023 * for write back, returns log_start of first sh in stripe_in_journal_list
1024 */
1025static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1026{
1027 struct stripe_head *sh;
1028 struct r5l_log *log = conf->log;
1029 sector_t new_cp;
1030 unsigned long flags;
1031
1032 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1033 return log->next_checkpoint;
1034
1035 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1036 if (list_empty(&conf->log->stripe_in_journal_list)) {
1037 /* all stripes flushed */
1038 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1039 return log->next_checkpoint;
1040 }
1041 sh = list_first_entry(&conf->log->stripe_in_journal_list,
1042 struct stripe_head, r5c);
1043 new_cp = sh->log_start;
1044 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1045 return new_cp;
1046}
1047
558static sector_t r5l_reclaimable_space(struct r5l_log *log) 1048static sector_t r5l_reclaimable_space(struct r5l_log *log)
559{ 1049{
1050 struct r5conf *conf = log->rdev->mddev->private;
1051
560 return r5l_ring_distance(log, log->last_checkpoint, 1052 return r5l_ring_distance(log, log->last_checkpoint,
561 log->next_checkpoint); 1053 r5c_calculate_new_cp(conf));
562} 1054}
563 1055
564static void r5l_run_no_mem_stripe(struct r5l_log *log) 1056static void r5l_run_no_mem_stripe(struct r5l_log *log)
@@ -589,7 +1081,6 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
589 break; 1081 break;
590 1082
591 log->next_checkpoint = io->log_start; 1083 log->next_checkpoint = io->log_start;
592 log->next_cp_seq = io->seq;
593 1084
594 list_del(&io->log_sibling); 1085 list_del(&io->log_sibling);
595 mempool_free(io, log->io_pool); 1086 mempool_free(io, log->io_pool);
@@ -604,6 +1095,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
604static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 1095static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
605{ 1096{
606 struct r5l_log *log = io->log; 1097 struct r5l_log *log = io->log;
1098 struct r5conf *conf = log->rdev->mddev->private;
607 unsigned long flags; 1099 unsigned long flags;
608 1100
609 spin_lock_irqsave(&log->io_list_lock, flags); 1101 spin_lock_irqsave(&log->io_list_lock, flags);
@@ -614,7 +1106,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
614 return; 1106 return;
615 } 1107 }
616 1108
617 if (r5l_reclaimable_space(log) > log->max_free_space) 1109 if (r5l_reclaimable_space(log) > log->max_free_space ||
1110 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
618 r5l_wake_reclaim(log, 0); 1111 r5l_wake_reclaim(log, 0);
619 1112
620 spin_unlock_irqrestore(&log->io_list_lock, flags); 1113 spin_unlock_irqrestore(&log->io_list_lock, flags);
@@ -713,8 +1206,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
713 * there is a deadlock. We workaround this issue with a trylock. 1206 * there is a deadlock. We workaround this issue with a trylock.
714 * FIXME: we could miss discard if we can't take reconfig mutex 1207 * FIXME: we could miss discard if we can't take reconfig mutex
715 */ 1208 */
716 set_mask_bits(&mddev->flags, 0, 1209 set_mask_bits(&mddev->sb_flags, 0,
717 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1210 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
718 if (!mddev_trylock(mddev)) 1211 if (!mddev_trylock(mddev))
719 return; 1212 return;
720 md_update_sb(mddev, 1); 1213 md_update_sb(mddev, 1);
@@ -735,15 +1228,148 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
735 } 1228 }
736} 1229}
737 1230
1231/*
1232 * r5c_flush_stripe moves a stripe from the cached list to handle_list. When called,
1233 * the stripe must be on r5c_full_stripe_list or r5c_partial_stripe_list.
1234 *
1235 * must hold conf->device_lock
1236 */
1237static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1238{
1239 BUG_ON(list_empty(&sh->lru));
1240 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1241 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1242
1243 /*
1244 * The stripe is not ON_RELEASE_LIST, so it is safe to call
1245 * raid5_release_stripe() while holding conf->device_lock
1246 */
1247 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1248 assert_spin_locked(&conf->device_lock);
1249
1250 list_del_init(&sh->lru);
1251 atomic_inc(&sh->count);
1252
1253 set_bit(STRIPE_HANDLE, &sh->state);
1254 atomic_inc(&conf->active_stripes);
1255 r5c_make_stripe_write_out(sh);
1256
1257 raid5_release_stripe(sh);
1258}
1259
1260/*
1261 * if num == 0, flush all full stripes
1262 * if num > 0, flush all full stripes. If fewer than num full stripes are
1263 * flushed, flush some partial stripes until a total of num stripes are
1264 * flushed or there are no more cached stripes.
1265 */
1266void r5c_flush_cache(struct r5conf *conf, int num)
1267{
1268 int count;
1269 struct stripe_head *sh, *next;
1270
1271 assert_spin_locked(&conf->device_lock);
1272 if (!conf->log)
1273 return;
1274
1275 count = 0;
1276 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1277 r5c_flush_stripe(conf, sh);
1278 count++;
1279 }
1280
1281 if (count >= num)
1282 return;
1283 list_for_each_entry_safe(sh, next,
1284 &conf->r5c_partial_stripe_list, lru) {
1285 r5c_flush_stripe(conf, sh);
1286 if (++count >= num)
1287 break;
1288 }
1289}
1290
1291static void r5c_do_reclaim(struct r5conf *conf)
1292{
1293 struct r5l_log *log = conf->log;
1294 struct stripe_head *sh;
1295 int count = 0;
1296 unsigned long flags;
1297 int total_cached;
1298 int stripes_to_flush;
1299
1300 if (!r5c_is_writeback(log))
1301 return;
1302
1303 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1304 atomic_read(&conf->r5c_cached_full_stripes);
1305
1306 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1307 atomic_read(&conf->empty_inactive_list_nr) > 0)
1308 /*
1309 * if stripe cache pressure is high, flush all full stripes and
1310 * some partial stripes
1311 */
1312 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1313 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1314 atomic_read(&conf->r5c_cached_full_stripes) >
1315 R5C_FULL_STRIPE_FLUSH_BATCH)
1316 /*
1317 * if stripe cache pressure is moderate, or if there are many full
1318 * stripes, flush all full stripes
1319 */
1320 stripes_to_flush = 0;
1321 else
1322 /* no need to flush */
1323 stripes_to_flush = -1;
1324
1325 if (stripes_to_flush >= 0) {
1326 spin_lock_irqsave(&conf->device_lock, flags);
1327 r5c_flush_cache(conf, stripes_to_flush);
1328 spin_unlock_irqrestore(&conf->device_lock, flags);
1329 }
1330
1331 /* if log space is tight, flush stripes on stripe_in_journal_list */
1332 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1333 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1334 spin_lock(&conf->device_lock);
1335 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1336 /*
1337 * stripes on stripe_in_journal_list could be in any
1338 * state of the stripe_cache state machine. In this
1339 * case, we only want to flush stripes on
1340 * r5c_cached_full/partial_stripes. The following
1341 * condition makes sure the stripe is on one of the
1342 * two lists.
1343 */
1344 if (!list_empty(&sh->lru) &&
1345 !test_bit(STRIPE_HANDLE, &sh->state) &&
1346 atomic_read(&sh->count) == 0) {
1347 r5c_flush_stripe(conf, sh);
1348 }
1349 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1350 break;
1351 }
1352 spin_unlock(&conf->device_lock);
1353 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1354 }
1355
1356 if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1357 r5l_run_no_space_stripes(log);
1358
1359 md_wakeup_thread(conf->mddev->thread);
1360}
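
The three-way decision above is self-contained enough to restate outside the kernel; in the sketch below the two limits stand in for R5C_RECLAIM_STRIPE_GROUP and R5C_FULL_STRIPE_FLUSH_BATCH, whose real values are defined elsewhere in the driver and are not visible in this hunk.

/*
 * Standalone sketch (not kernel code): how many cached stripes
 * r5c_do_reclaim() asks r5c_flush_cache() to flush. The two constants
 * are illustrative stand-ins, not the driver's real values.
 */
#include <stdio.h>

#define EXAMPLE_RECLAIM_STRIPE_GROUP	16
#define EXAMPLE_FULL_STRIPE_FLUSH_BATCH	8

/* returns the 'num' argument for r5c_flush_cache(), or -1 for "no flush" */
static int stripes_to_flush(int total_cached, int min_nr_stripes,
			    int empty_inactive, int cached_full)
{
	if (total_cached > min_nr_stripes * 3 / 4 || empty_inactive > 0)
		return EXAMPLE_RECLAIM_STRIPE_GROUP;	/* high pressure */
	if (total_cached > min_nr_stripes * 1 / 2 ||
	    cached_full > EXAMPLE_FULL_STRIPE_FLUSH_BATCH)
		return 0;				/* flush full stripes only */
	return -1;					/* nothing to do */
}

int main(void)
{
	printf("%d\n", stripes_to_flush(200, 256, 0, 4));	/* 16: high pressure */
	printf("%d\n", stripes_to_flush(140, 256, 0, 4));	/* 0: moderate pressure */
	printf("%d\n", stripes_to_flush(60, 256, 0, 4));	/* -1: no pressure */
	return 0;
}
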
738 1361
739static void r5l_do_reclaim(struct r5l_log *log) 1362static void r5l_do_reclaim(struct r5l_log *log)
740{ 1363{
1364 struct r5conf *conf = log->rdev->mddev->private;
741 sector_t reclaim_target = xchg(&log->reclaim_target, 0); 1365 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
742 sector_t reclaimable; 1366 sector_t reclaimable;
743 sector_t next_checkpoint; 1367 sector_t next_checkpoint;
744 u64 next_cp_seq; 1368 bool write_super;
745 1369
746 spin_lock_irq(&log->io_list_lock); 1370 spin_lock_irq(&log->io_list_lock);
1371 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1372 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
747 /* 1373 /*
748 * move proper io_unit to reclaim list. We should not change the order. 1374 * move proper io_unit to reclaim list. We should not change the order.
749 * reclaimable/unreclaimable io_unit can be mixed in the list, we 1375 * reclaimable/unreclaimable io_unit can be mixed in the list, we
@@ -764,12 +1390,12 @@ static void r5l_do_reclaim(struct r5l_log *log)
764 log->io_list_lock); 1390 log->io_list_lock);
765 } 1391 }
766 1392
767 next_checkpoint = log->next_checkpoint; 1393 next_checkpoint = r5c_calculate_new_cp(conf);
768 next_cp_seq = log->next_cp_seq;
769 spin_unlock_irq(&log->io_list_lock); 1394 spin_unlock_irq(&log->io_list_lock);
770 1395
771 BUG_ON(reclaimable < 0); 1396 BUG_ON(reclaimable < 0);
772 if (reclaimable == 0) 1397
1398 if (reclaimable == 0 || !write_super)
773 return; 1399 return;
774 1400
775 /* 1401 /*
@@ -781,7 +1407,7 @@ static void r5l_do_reclaim(struct r5l_log *log)
781 1407
782 mutex_lock(&log->io_mutex); 1408 mutex_lock(&log->io_mutex);
783 log->last_checkpoint = next_checkpoint; 1409 log->last_checkpoint = next_checkpoint;
784 log->last_cp_seq = next_cp_seq; 1410 r5c_update_log_state(log);
785 mutex_unlock(&log->io_mutex); 1411 mutex_unlock(&log->io_mutex);
786 1412
787 r5l_run_no_space_stripes(log); 1413 r5l_run_no_space_stripes(log);
@@ -795,14 +1421,17 @@ static void r5l_reclaim_thread(struct md_thread *thread)
795 1421
796 if (!log) 1422 if (!log)
797 return; 1423 return;
1424 r5c_do_reclaim(conf);
798 r5l_do_reclaim(log); 1425 r5l_do_reclaim(log);
799} 1426}
800 1427
801static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1428void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
802{ 1429{
803 unsigned long target; 1430 unsigned long target;
804 unsigned long new = (unsigned long)space; /* overflow in theory */ 1431 unsigned long new = (unsigned long)space; /* overflow in theory */
805 1432
1433 if (!log)
1434 return;
806 do { 1435 do {
807 target = log->reclaim_target; 1436 target = log->reclaim_target;
808 if (new < target) 1437 if (new < target)
@@ -816,22 +1445,14 @@ void r5l_quiesce(struct r5l_log *log, int state)
816 struct mddev *mddev; 1445 struct mddev *mddev;
817 if (!log || state == 2) 1446 if (!log || state == 2)
818 return; 1447 return;
819 if (state == 0) { 1448 if (state == 0)
820 /* 1449 kthread_unpark(log->reclaim_thread->tsk);
821 * This is a special case for hotadd. In suspend, the array has 1450 else if (state == 1) {
822 * no journal. In resume, journal is initialized as well as the
823 * reclaim thread.
824 */
825 if (log->reclaim_thread)
826 return;
827 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
828 log->rdev->mddev, "reclaim");
829 } else if (state == 1) {
830 /* make sure r5l_write_super_and_discard_space exits */ 1451 /* make sure r5l_write_super_and_discard_space exits */
831 mddev = log->rdev->mddev; 1452 mddev = log->rdev->mddev;
832 wake_up(&mddev->sb_wait); 1453 wake_up(&mddev->sb_wait);
833 r5l_wake_reclaim(log, -1L); 1454 kthread_park(log->reclaim_thread->tsk);
834 md_unregister_thread(&log->reclaim_thread); 1455 r5l_wake_reclaim(log, MaxSector);
835 r5l_do_reclaim(log); 1456 r5l_do_reclaim(log);
836 } 1457 }
837} 1458}
@@ -857,10 +1478,13 @@ struct r5l_recovery_ctx {
857 sector_t meta_total_blocks; /* total size of current meta and data */ 1478 sector_t meta_total_blocks; /* total size of current meta and data */
858 sector_t pos; /* recovery position */ 1479 sector_t pos; /* recovery position */
859 u64 seq; /* recovery position seq */ 1480 u64 seq; /* recovery position seq */
1481 int data_parity_stripes; /* number of data_parity stripes */
1482 int data_only_stripes; /* number of data_only stripes */
1483 struct list_head cached_list;
860}; 1484};
861 1485
862static int r5l_read_meta_block(struct r5l_log *log, 1486static int r5l_recovery_read_meta_block(struct r5l_log *log,
863 struct r5l_recovery_ctx *ctx) 1487 struct r5l_recovery_ctx *ctx)
864{ 1488{
865 struct page *page = ctx->meta_page; 1489 struct page *page = ctx->meta_page;
866 struct r5l_meta_block *mb; 1490 struct r5l_meta_block *mb;
@@ -892,170 +1516,618 @@ static int r5l_read_meta_block(struct r5l_log *log,
892 return 0; 1516 return 0;
893} 1517}
894 1518
895static int r5l_recovery_flush_one_stripe(struct r5l_log *log, 1519static void
896 struct r5l_recovery_ctx *ctx, 1520r5l_recovery_create_empty_meta_block(struct r5l_log *log,
897 sector_t stripe_sect, 1521 struct page *page,
898 int *offset, sector_t *log_offset) 1522 sector_t pos, u64 seq)
899{ 1523{
900 struct r5conf *conf = log->rdev->mddev->private; 1524 struct r5l_meta_block *mb;
901 struct stripe_head *sh;
902 struct r5l_payload_data_parity *payload;
903 int disk_index;
904 1525
905 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); 1526 mb = page_address(page);
906 while (1) { 1527 clear_page(mb);
907 payload = page_address(ctx->meta_page) + *offset; 1528 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1529 mb->version = R5LOG_VERSION;
1530 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1531 mb->seq = cpu_to_le64(seq);
1532 mb->position = cpu_to_le64(pos);
1533}
908 1534
909 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { 1535static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
910 raid5_compute_sector(conf, 1536 u64 seq)
911 le64_to_cpu(payload->location), 0, 1537{
912 &disk_index, sh); 1538 struct page *page;
1539 struct r5l_meta_block *mb;
913 1540
914 sync_page_io(log->rdev, *log_offset, PAGE_SIZE, 1541 page = alloc_page(GFP_KERNEL);
915 sh->dev[disk_index].page, REQ_OP_READ, 0, 1542 if (!page)
916 false); 1543 return -ENOMEM;
917 sh->dev[disk_index].log_checksum = 1544 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
918 le32_to_cpu(payload->checksum[0]); 1545 mb = page_address(page);
919 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); 1546 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
920 ctx->meta_total_blocks += BLOCK_SECTORS; 1547 mb, PAGE_SIZE));
921 } else { 1548 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
922 disk_index = sh->pd_idx; 1549 REQ_FUA, false)) {
923 sync_page_io(log->rdev, *log_offset, PAGE_SIZE, 1550 __free_page(page);
924 sh->dev[disk_index].page, REQ_OP_READ, 0, 1551 return -EIO;
925 false); 1552 }
926 sh->dev[disk_index].log_checksum = 1553 __free_page(page);
927 le32_to_cpu(payload->checksum[0]); 1554 return 0;
928 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); 1555}
929
930 if (sh->qd_idx >= 0) {
931 disk_index = sh->qd_idx;
932 sync_page_io(log->rdev,
933 r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
934 PAGE_SIZE, sh->dev[disk_index].page,
935 REQ_OP_READ, 0, false);
936 sh->dev[disk_index].log_checksum =
937 le32_to_cpu(payload->checksum[1]);
938 set_bit(R5_Wantwrite,
939 &sh->dev[disk_index].flags);
940 }
941 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
942 }
943 1556
944 *log_offset = r5l_ring_add(log, *log_offset, 1557/*
945 le32_to_cpu(payload->size)); 1558 * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite
946 *offset += sizeof(struct r5l_payload_data_parity) + 1559 * to mark valid (potentially not flushed) data in the journal.
947 sizeof(__le32) * 1560 *
948 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1561 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
949 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) 1562 * so there should not be any mismatch here.
950 break; 1563 */
1564static void r5l_recovery_load_data(struct r5l_log *log,
1565 struct stripe_head *sh,
1566 struct r5l_recovery_ctx *ctx,
1567 struct r5l_payload_data_parity *payload,
1568 sector_t log_offset)
1569{
1570 struct mddev *mddev = log->rdev->mddev;
1571 struct r5conf *conf = mddev->private;
1572 int dd_idx;
1573
1574 raid5_compute_sector(conf,
1575 le64_to_cpu(payload->location), 0,
1576 &dd_idx, sh);
1577 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1578 sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
1579 sh->dev[dd_idx].log_checksum =
1580 le32_to_cpu(payload->checksum[0]);
1581 ctx->meta_total_blocks += BLOCK_SECTORS;
1582
1583 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1584 set_bit(STRIPE_R5C_CACHING, &sh->state);
1585}
1586
1587static void r5l_recovery_load_parity(struct r5l_log *log,
1588 struct stripe_head *sh,
1589 struct r5l_recovery_ctx *ctx,
1590 struct r5l_payload_data_parity *payload,
1591 sector_t log_offset)
1592{
1593 struct mddev *mddev = log->rdev->mddev;
1594 struct r5conf *conf = mddev->private;
1595
1596 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1597 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1598 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
1599 sh->dev[sh->pd_idx].log_checksum =
1600 le32_to_cpu(payload->checksum[0]);
1601 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1602
1603 if (sh->qd_idx >= 0) {
1604 sync_page_io(log->rdev,
1605 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1606 PAGE_SIZE, sh->dev[sh->qd_idx].page,
1607 REQ_OP_READ, 0, false);
1608 sh->dev[sh->qd_idx].log_checksum =
1609 le32_to_cpu(payload->checksum[1]);
1610 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
951 } 1611 }
1612 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1613}
952 1614
953 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1615static void r5l_recovery_reset_stripe(struct stripe_head *sh)
954 void *addr; 1616{
955 u32 checksum; 1617 int i;
956 1618
1619 sh->state = 0;
1620 sh->log_start = MaxSector;
1621 for (i = sh->disks; i--; )
1622 sh->dev[i].flags = 0;
1623}
1624
1625static void
1626r5l_recovery_replay_one_stripe(struct r5conf *conf,
1627 struct stripe_head *sh,
1628 struct r5l_recovery_ctx *ctx)
1629{
1630 struct md_rdev *rdev, *rrdev;
1631 int disk_index;
1632 int data_count = 0;
1633
1634 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
957 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1635 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
958 continue; 1636 continue;
959 addr = kmap_atomic(sh->dev[disk_index].page); 1637 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
960 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1638 continue;
961 kunmap_atomic(addr); 1639 data_count++;
962 if (checksum != sh->dev[disk_index].log_checksum)
963 goto error;
964 } 1640 }
965 1641
966 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1642 /*
967 struct md_rdev *rdev, *rrdev; 1643 * stripes that only have parity must have been flushed
1644 * before the crash that we are now recovering from, so
1645 * there is nothing more to recover.
1646 */
1647 if (data_count == 0)
1648 goto out;
968 1649
969 if (!test_and_clear_bit(R5_Wantwrite, 1650 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
970 &sh->dev[disk_index].flags)) 1651 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
971 continue; 1652 continue;
972 1653
973 /* in case device is broken */ 1654 /* in case device is broken */
1655 rcu_read_lock();
974 rdev = rcu_dereference(conf->disks[disk_index].rdev); 1656 rdev = rcu_dereference(conf->disks[disk_index].rdev);
975 if (rdev) 1657 if (rdev) {
976 sync_page_io(rdev, stripe_sect, PAGE_SIZE, 1658 atomic_inc(&rdev->nr_pending);
1659 rcu_read_unlock();
1660 sync_page_io(rdev, sh->sector, PAGE_SIZE,
977 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1661 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
978 false); 1662 false);
1663 rdev_dec_pending(rdev, rdev->mddev);
1664 rcu_read_lock();
1665 }
979 rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1666 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
980 if (rrdev) 1667 if (rrdev) {
981 sync_page_io(rrdev, stripe_sect, PAGE_SIZE, 1668 atomic_inc(&rrdev->nr_pending);
1669 rcu_read_unlock();
1670 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
982 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1671 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
983 false); 1672 false);
1673 rdev_dec_pending(rrdev, rrdev->mddev);
1674 rcu_read_lock();
1675 }
1676 rcu_read_unlock();
984 } 1677 }
985 raid5_release_stripe(sh); 1678 ctx->data_parity_stripes++;
1679out:
1680 r5l_recovery_reset_stripe(sh);
1681}
1682
1683static struct stripe_head *
1684r5c_recovery_alloc_stripe(struct r5conf *conf,
1685 sector_t stripe_sect,
1686 sector_t log_start)
1687{
1688 struct stripe_head *sh;
1689
1690 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1691 if (!sh)
1692 return NULL; /* no more stripe available */
1693
1694 r5l_recovery_reset_stripe(sh);
1695 sh->log_start = log_start;
1696
1697 return sh;
1698}
1699
1700static struct stripe_head *
1701r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1702{
1703 struct stripe_head *sh;
1704
1705 list_for_each_entry(sh, list, lru)
1706 if (sh->sector == sect)
1707 return sh;
1708 return NULL;
1709}
1710
1711static void
1712r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1713 struct r5l_recovery_ctx *ctx)
1714{
1715 struct stripe_head *sh, *next;
1716
1717 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1718 r5l_recovery_reset_stripe(sh);
1719 list_del_init(&sh->lru);
1720 raid5_release_stripe(sh);
1721 }
1722}
1723
1724static void
1725r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1726 struct r5l_recovery_ctx *ctx)
1727{
1728 struct stripe_head *sh, *next;
1729
1730 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1731 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1732 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1733 list_del_init(&sh->lru);
1734 raid5_release_stripe(sh);
1735 }
1736}
1737
1738/* if matches return 0; otherwise return -EINVAL */
1739static int
1740r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
1741 sector_t log_offset, __le32 log_checksum)
1742{
1743 void *addr;
1744 u32 checksum;
1745
1746 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1747 page, REQ_OP_READ, 0, false);
1748 addr = kmap_atomic(page);
1749 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1750 kunmap_atomic(addr);
1751 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1752}
1753
1754/*
1755 * before loading data to the stripe cache, we need to verify the checksum for all data;
1756 * if there is a mismatch for any data page, we drop all data in the meta block
1757 */
1758static int
1759r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1760 struct r5l_recovery_ctx *ctx)
1761{
1762 struct mddev *mddev = log->rdev->mddev;
1763 struct r5conf *conf = mddev->private;
1764 struct r5l_meta_block *mb = page_address(ctx->meta_page);
1765 sector_t mb_offset = sizeof(struct r5l_meta_block);
1766 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1767 struct page *page;
1768 struct r5l_payload_data_parity *payload;
1769
1770 page = alloc_page(GFP_KERNEL);
1771 if (!page)
1772 return -ENOMEM;
1773
1774 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1775 payload = (void *)mb + mb_offset;
1776
1777 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1778 if (r5l_recovery_verify_data_checksum(
1779 log, page, log_offset,
1780 payload->checksum[0]) < 0)
1781 goto mismatch;
1782 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
1783 if (r5l_recovery_verify_data_checksum(
1784 log, page, log_offset,
1785 payload->checksum[0]) < 0)
1786 goto mismatch;
1787 if (conf->max_degraded == 2 && /* q for RAID 6 */
1788 r5l_recovery_verify_data_checksum(
1789 log, page,
1790 r5l_ring_add(log, log_offset,
1791 BLOCK_SECTORS),
1792 payload->checksum[1]) < 0)
1793 goto mismatch;
1794 } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
1795 goto mismatch;
1796
1797 log_offset = r5l_ring_add(log, log_offset,
1798 le32_to_cpu(payload->size));
1799
1800 mb_offset += sizeof(struct r5l_payload_data_parity) +
1801 sizeof(__le32) *
1802 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1803 }
1804
1805 put_page(page);
986 return 0; 1806 return 0;
987 1807
988error: 1808mismatch:
989 for (disk_index = 0; disk_index < sh->disks; disk_index++) 1809 put_page(page);
990 sh->dev[disk_index].flags = 0;
991 raid5_release_stripe(sh);
992 return -EINVAL; 1810 return -EINVAL;
993} 1811}
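
For readers tracing the verification loop above: as the mb_offset increment suggests, each meta-block entry is a payload header followed by one 32-bit checksum per page, and payload->size counts 512-byte sectors (8 per 4KiB page). The sketch below walks the same arithmetic in user space; the struct is a simplified stand-in, not the on-disk r5l_payload_data_parity layout.

/*
 * Standalone sketch (not kernel code): how far the meta-block cursor
 * advances per payload. Sizes here are for illustration only.
 */
#include <stdint.h>
#include <stdio.h>

struct payload_stub {		/* stand-in for r5l_payload_data_parity */
	uint16_t type;
	uint16_t flags;
	uint32_t size;		/* payload size in 512-byte sectors */
	uint64_t location;
};

static size_t meta_entry_bytes(uint32_t size_sectors)
{
	/* header plus one 32-bit checksum per 4KiB page (8 sectors) */
	return sizeof(struct payload_stub) +
	       sizeof(uint32_t) * (size_sectors >> 3);
}

int main(void)
{
	printf("data payload, 1 page:  %zu bytes of meta\n", meta_entry_bytes(8));
	printf("RAID6 parity, 2 pages: %zu bytes of meta\n", meta_entry_bytes(16));
	return 0;
}
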
994 1812
995static int r5l_recovery_flush_one_meta(struct r5l_log *log, 1813/*
996 struct r5l_recovery_ctx *ctx) 1814 * Analyze all data/parity pages in one meta block
1815 * Returns:
1816 * 0 for success
1817 * -EINVAL for unknown payload type
1818 * -EAGAIN for checksum mismatch of data page
1819 * -ENOMEM for running out of memory (alloc_page failed or no more stripes available)
1820 */
1821static int
1822r5c_recovery_analyze_meta_block(struct r5l_log *log,
1823 struct r5l_recovery_ctx *ctx,
1824 struct list_head *cached_stripe_list)
997{ 1825{
998 struct r5conf *conf = log->rdev->mddev->private; 1826 struct mddev *mddev = log->rdev->mddev;
999 struct r5l_payload_data_parity *payload; 1827 struct r5conf *conf = mddev->private;
1000 struct r5l_meta_block *mb; 1828 struct r5l_meta_block *mb;
1001 int offset; 1829 struct r5l_payload_data_parity *payload;
1830 int mb_offset;
1002 sector_t log_offset; 1831 sector_t log_offset;
1003 sector_t stripe_sector; 1832 sector_t stripe_sect;
1833 struct stripe_head *sh;
1834 int ret;
1835
1836 /*
1837 * on a mismatch in data blocks, we will drop all data in this mb, but
1838 * we will still read the next mb for other data with the FLUSH flag, as
1839 * io_unit could finish out of order.
1840 */
1841 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
1842 if (ret == -EINVAL)
1843 return -EAGAIN;
1844 else if (ret)
1845		return ret; /* -ENOMEM due to alloc_page() failure */
1004 1846
1005 mb = page_address(ctx->meta_page); 1847 mb = page_address(ctx->meta_page);
1006 offset = sizeof(struct r5l_meta_block); 1848 mb_offset = sizeof(struct r5l_meta_block);
1007 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1849 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1008 1850
1009 while (offset < le32_to_cpu(mb->meta_size)) { 1851 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1010 int dd; 1852 int dd;
1011 1853
1012 payload = (void *)mb + offset; 1854 payload = (void *)mb + mb_offset;
1013 stripe_sector = raid5_compute_sector(conf, 1855 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1014 le64_to_cpu(payload->location), 0, &dd, NULL); 1856 raid5_compute_sector(
1015 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, 1857 conf, le64_to_cpu(payload->location), 0, &dd,
1016 &offset, &log_offset)) 1858 NULL)
1859 : le64_to_cpu(payload->location);
1860
1861 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1862 stripe_sect);
1863
1864 if (!sh) {
1865 sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
1866 /*
1867 * cannot get stripe from raid5_get_active_stripe
1868 * try replaying some stripes
1869 */
1870 if (!sh) {
1871 r5c_recovery_replay_stripes(
1872 cached_stripe_list, ctx);
1873 sh = r5c_recovery_alloc_stripe(
1874 conf, stripe_sect, ctx->pos);
1875 }
1876 if (!sh) {
1877				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on the journal.\n",
1878 mdname(mddev),
1879 conf->min_nr_stripes * 2);
1880 raid5_set_cache_size(mddev,
1881 conf->min_nr_stripes * 2);
1882 sh = r5c_recovery_alloc_stripe(
1883 conf, stripe_sect, ctx->pos);
1884 }
1885 if (!sh) {
1886 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1887 mdname(mddev));
1888 return -ENOMEM;
1889 }
1890 list_add_tail(&sh->lru, cached_stripe_list);
1891 }
1892
1893 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1894 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
1895 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
1896 r5l_recovery_replay_one_stripe(conf, sh, ctx);
1897 sh->log_start = ctx->pos;
1898 list_move_tail(&sh->lru, cached_stripe_list);
1899 }
1900 r5l_recovery_load_data(log, sh, ctx, payload,
1901 log_offset);
1902 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
1903 r5l_recovery_load_parity(log, sh, ctx, payload,
1904 log_offset);
1905 else
1017 return -EINVAL; 1906 return -EINVAL;
1907
1908 log_offset = r5l_ring_add(log, log_offset,
1909 le32_to_cpu(payload->size));
1910
1911 mb_offset += sizeof(struct r5l_payload_data_parity) +
1912 sizeof(__le32) *
1913 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1018 } 1914 }
1915
1019 return 0; 1916 return 0;
1020} 1917}
1021 1918
1022/* copy data/parity from log to raid disks */ 1919/*
1023static void r5l_recovery_flush_log(struct r5l_log *log, 1920 * Load the stripe into cache. The stripe will be written out later by
1024 struct r5l_recovery_ctx *ctx) 1921 * the stripe cache state machine.
1922 */
1923static void r5c_recovery_load_one_stripe(struct r5l_log *log,
1924 struct stripe_head *sh)
1025{ 1925{
1926 struct r5dev *dev;
1927 int i;
1928
1929 for (i = sh->disks; i--; ) {
1930 dev = sh->dev + i;
1931 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
1932 set_bit(R5_InJournal, &dev->flags);
1933 set_bit(R5_UPTODATE, &dev->flags);
1934 }
1935 }
1936 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
1937 atomic_inc(&log->stripe_in_journal_count);
1938}
1939
1940/*
1941 * Scan through the log for all to-be-flushed data
1942 *
1943 * For stripes with data and parity, namely Data-Parity stripe
1944 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
1945 *
1946 * For stripes with only data, namely Data-Only stripe
1947 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
1948 *
1949 * For a stripe, if we see data after parity, we should discard all previous
1950 * data and parity for this stripe, as these data are already flushed to
1951 * the array.
1952 *
1953 * At the end of the scan, we return the new journal_tail, which points to
1954 * the first data-only stripe on the journal device, or the next invalid meta block.
1955 */
1956static int r5c_recovery_flush_log(struct r5l_log *log,
1957 struct r5l_recovery_ctx *ctx)
1958{
1959 struct stripe_head *sh;
1960 int ret = 0;
1961
1962 /* scan through the log */
1026 while (1) { 1963 while (1) {
1027 if (r5l_read_meta_block(log, ctx)) 1964 if (r5l_recovery_read_meta_block(log, ctx))
1028 return; 1965 break;
1029 if (r5l_recovery_flush_one_meta(log, ctx)) 1966
1030 return; 1967 ret = r5c_recovery_analyze_meta_block(log, ctx,
1968 &ctx->cached_list);
1969 /*
1970 * -EAGAIN means a mismatch in a data block; in this case, we still
1971 * try to scan the next meta block
1972 */
1973 if (ret && ret != -EAGAIN)
1974 break; /* ret == -EINVAL or -ENOMEM */
1031 ctx->seq++; 1975 ctx->seq++;
1032 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); 1976 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1033 } 1977 }
1978
1979 if (ret == -ENOMEM) {
1980 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
1981 return ret;
1982 }
1983
1984 /* replay data-parity stripes */
1985 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
1986
1987 /* load data-only stripes to stripe cache */
1988 list_for_each_entry(sh, &ctx->cached_list, lru) {
1989 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1990 r5c_recovery_load_one_stripe(log, sh);
1991 ctx->data_only_stripes++;
1992 }
1993
1994 return 0;
1034} 1995}
1035 1996
1036static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1997/*
1037 u64 seq) 1998 * we did a recovery. Now ctx.pos points to an invalid meta block. New
1999 * log will start here, but we can't let the superblock point to the last valid
2000 * meta block. The log might look like:
2001 * | meta 1| meta 2| meta 3|
2002 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
2003 * superblock points to meta 1, we write a new valid meta 2n. If a crash
2004 * happens again, the new recovery will start from meta 1. Since meta 2n is
2005 * valid now, recovery will think meta 3 is valid, which is wrong.
2006 * The solution is to create a new meta block at meta 2 with its seq == meta
2007 * 1's seq + 10000 and let the superblock point to meta 2. The same recovery
2008 * will not think meta 3 is a valid meta block, because its seq doesn't match
2009 */
2010
2011/*
2012 * Before recovery, the log looks like the following
2013 *
2014 * ---------------------------------------------
2015 * | valid log | invalid log |
2016 * ---------------------------------------------
2017 * ^
2018 * |- log->last_checkpoint
2019 * |- log->last_cp_seq
2020 *
2021 * Now we scan through the log until we see invalid entry
2022 *
2023 * ---------------------------------------------
2024 * | valid log | invalid log |
2025 * ---------------------------------------------
2026 * ^ ^
2027 * |- log->last_checkpoint |- ctx->pos
2028 * |- log->last_cp_seq |- ctx->seq
2029 *
2030 * From this point, we need to increase seq number by 10 to avoid
2031 * confusing next recovery.
2032 *
2033 * ---------------------------------------------
2034 * | valid log | invalid log |
2035 * ---------------------------------------------
2036 * ^ ^
2037 * |- log->last_checkpoint |- ctx->pos+1
2038 * |- log->last_cp_seq |- ctx->seq+10001
2039 *
2040 * However, it is not safe to start the state machine yet, because data only
2041 * parities are not yet secured in RAID. To save these data only parities, we
2042 * rewrite them from seq+11.
2043 *
2044 * -----------------------------------------------------------------
2045 * | valid log | data only stripes | invalid log |
2046 * -----------------------------------------------------------------
2047 * ^ ^
2048 * |- log->last_checkpoint |- ctx->pos+n
2049 * |- log->last_cp_seq |- ctx->seq+10000+n
2050 *
2051 * If a failure happens again during this process, the recovery can safely start
2052 * again from log->last_checkpoint.
2053 *
2054 * Once data only stripes are rewritten to journal, we move log_tail
2055 *
2056 * -----------------------------------------------------------------
2057 * | old log | data only stripes | invalid log |
2058 * -----------------------------------------------------------------
2059 * ^ ^
2060 * |- log->last_checkpoint |- ctx->pos+n
2061 * |- log->last_cp_seq |- ctx->seq+10000+n
2062 *
2063 * Then we can safely start the state machine. If failure happens from this
2064 * point on, the recovery will start from new log->last_checkpoint.
2065 */
2066static int
2067r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2068 struct r5l_recovery_ctx *ctx)
1038{ 2069{
2070 struct stripe_head *sh, *next;
2071 struct mddev *mddev = log->rdev->mddev;
1039 struct page *page; 2072 struct page *page;
1040 struct r5l_meta_block *mb;
1041 u32 crc;
1042 2073
1043 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2074 page = alloc_page(GFP_KERNEL);
1044 if (!page) 2075 if (!page) {
2076 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2077 mdname(mddev));
1045 return -ENOMEM; 2078 return -ENOMEM;
1046 mb = page_address(page); 2079 }
1047 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1048 mb->version = R5LOG_VERSION;
1049 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1050 mb->seq = cpu_to_le64(seq);
1051 mb->position = cpu_to_le64(pos);
1052 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1053 mb->checksum = cpu_to_le32(crc);
1054 2080
1055 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 2081 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
1056 REQ_FUA, false)) { 2082 struct r5l_meta_block *mb;
1057 __free_page(page); 2083 int i;
1058 return -EIO; 2084 int offset;
2085 sector_t write_pos;
2086
2087 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2088 r5l_recovery_create_empty_meta_block(log, page,
2089 ctx->pos, ctx->seq);
2090 mb = page_address(page);
2091 offset = le32_to_cpu(mb->meta_size);
2092 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2093
2094 for (i = sh->disks; i--; ) {
2095 struct r5dev *dev = &sh->dev[i];
2096 struct r5l_payload_data_parity *payload;
2097 void *addr;
2098
2099 if (test_bit(R5_InJournal, &dev->flags)) {
2100 payload = (void *)mb + offset;
2101 payload->header.type = cpu_to_le16(
2102 R5LOG_PAYLOAD_DATA);
2103 payload->size = BLOCK_SECTORS;
2104 payload->location = cpu_to_le64(
2105 raid5_compute_blocknr(sh, i, 0));
2106 addr = kmap_atomic(dev->page);
2107 payload->checksum[0] = cpu_to_le32(
2108 crc32c_le(log->uuid_checksum, addr,
2109 PAGE_SIZE));
2110 kunmap_atomic(addr);
2111 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2112 dev->page, REQ_OP_WRITE, 0, false);
2113 write_pos = r5l_ring_add(log, write_pos,
2114 BLOCK_SECTORS);
2115 offset += sizeof(__le32) +
2116 sizeof(struct r5l_payload_data_parity);
2117
2118 }
2119 }
2120 mb->meta_size = cpu_to_le32(offset);
2121 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2122 mb, PAGE_SIZE));
2123 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2124 REQ_OP_WRITE, REQ_FUA, false);
2125 sh->log_start = ctx->pos;
2126 ctx->pos = write_pos;
2127 ctx->seq += 1;
2128
2129 list_del_init(&sh->lru);
2130 raid5_release_stripe(sh);
1059 } 2131 }
1060 __free_page(page); 2132 __free_page(page);
1061 return 0; 2133 return 0;
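
To make the bookkeeping above concrete, here is a small stand-alone sketch (plain user-space C, not kernel code) of how ctx->pos and ctx->seq advance while the cached data-only stripes are re-logged: each stripe costs one meta block plus one block per R5_InJournal page, and the sequence number, already bumped by 10000, grows by one per stripe. The journal size, starting position, and per-stripe page counts below are made-up illustrative values.

#include <stdio.h>

#define BLOCK_SECTORS 8ULL                /* one 4KiB block in 512-byte sectors */

/* simplified stand-in for r5l_ring_add(): advance and wrap in the journal */
static unsigned long long ring_add(unsigned long long dev_sectors,
                                   unsigned long long pos,
                                   unsigned long long inc)
{
        pos += inc;
        if (pos >= dev_sectors)
                pos -= dev_sectors;
        return pos;
}

int main(void)
{
        unsigned long long dev_sectors = 1ULL << 20;  /* journal size (made up) */
        unsigned long long pos = 1024;                /* ctx->pos after the flush */
        unsigned long long seq = 7;                   /* ctx->seq after the flush */
        int data_only_stripes = 3;                    /* ctx->data_only_stripes */
        int injournal_pages[3] = { 2, 4, 1 };         /* R5_InJournal pages per stripe */

        seq += 10000;                                 /* the jump done in r5l_recovery_log() */
        for (int s = 0; s < data_only_stripes; s++) {
                /* one meta block for the stripe ... */
                unsigned long long write_pos = ring_add(dev_sectors, pos, BLOCK_SECTORS);
                /* ... followed by its cached data pages */
                for (int p = 0; p < injournal_pages[s]; p++)
                        write_pos = ring_add(dev_sectors, write_pos, BLOCK_SECTORS);
                pos = write_pos;
                seq++;
                printf("stripe %d re-logged: next pos=%llu, next seq=%llu\n",
                       s, pos, seq);
        }
        return 0;
}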
@@ -1063,45 +2135,60 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1063 2135
1064static int r5l_recovery_log(struct r5l_log *log) 2136static int r5l_recovery_log(struct r5l_log *log)
1065{ 2137{
2138 struct mddev *mddev = log->rdev->mddev;
1066 struct r5l_recovery_ctx ctx; 2139 struct r5l_recovery_ctx ctx;
2140 int ret;
2141 sector_t pos;
2142 struct stripe_head *sh;
1067 2143
1068 ctx.pos = log->last_checkpoint; 2144 ctx.pos = log->last_checkpoint;
1069 ctx.seq = log->last_cp_seq; 2145 ctx.seq = log->last_cp_seq;
1070 ctx.meta_page = alloc_page(GFP_KERNEL); 2146 ctx.meta_page = alloc_page(GFP_KERNEL);
2147 ctx.data_only_stripes = 0;
2148 ctx.data_parity_stripes = 0;
2149 INIT_LIST_HEAD(&ctx.cached_list);
2150
1071 if (!ctx.meta_page) 2151 if (!ctx.meta_page)
1072 return -ENOMEM; 2152 return -ENOMEM;
1073 2153
1074 r5l_recovery_flush_log(log, &ctx); 2154 ret = r5c_recovery_flush_log(log, &ctx);
1075 __free_page(ctx.meta_page); 2155 __free_page(ctx.meta_page);
1076 2156
1077 /* 2157 if (ret)
1078 * we did a recovery. Now ctx.pos points to an invalid meta block. New 2158 return ret;
1079 * log will start here. but we can't let superblock point to last valid 2159
1080 * meta block. The log might looks like: 2160 pos = ctx.pos;
1081 * | meta 1| meta 2| meta 3| 2161 ctx.seq += 10000;
1082 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If 2162
1083 * superblock points to meta 1, we write a new valid meta 2n. if crash 2163 if (ctx.data_only_stripes == 0) {
1084 * happens again, new recovery will start from meta 1. Since meta 2n is
1085 * valid now, recovery will think meta 3 is valid, which is wrong.
1086 * The solution is we create a new meta in meta2 with its seq == meta
1087 * 1's seq + 10 and let superblock points to meta2. The same recovery will
1088 * not think meta 3 is a valid meta, because its seq doesn't match
1089 */
1090 if (ctx.seq > log->last_cp_seq) {
1091 int ret;
1092
1093 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1094 if (ret)
1095 return ret;
1096 log->seq = ctx.seq + 11;
1097 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1098 r5l_write_super(log, ctx.pos);
1099 log->last_checkpoint = ctx.pos;
1100 log->next_checkpoint = ctx.pos; 2164 log->next_checkpoint = ctx.pos;
2165 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
2166 ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1101 } else { 2167 } else {
1102 log->log_start = ctx.pos; 2168 sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
1103 log->seq = ctx.seq; 2169 log->next_checkpoint = sh->log_start;
1104 } 2170 }
2171
2172 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
2173 pr_debug("md/raid:%s: starting from clean shutdown\n",
2174 mdname(mddev));
2175 else {
2176 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2177 mdname(mddev), ctx.data_only_stripes,
2178 ctx.data_parity_stripes);
2179
2180 if (ctx.data_only_stripes > 0)
2181 if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
2182 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2183 mdname(mddev));
2184 return -EIO;
2185 }
2186 }
2187
2188 log->log_start = ctx.pos;
2189 log->seq = ctx.seq;
2190 log->last_checkpoint = pos;
2191 r5l_write_super(log, pos);
1105 return 0; 2192 return 0;
1106} 2193}
1107 2194
@@ -1110,7 +2197,293 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
1110 struct mddev *mddev = log->rdev->mddev; 2197 struct mddev *mddev = log->rdev->mddev;
1111 2198
1112 log->rdev->journal_tail = cp; 2199 log->rdev->journal_tail = cp;
1113 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2200 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2201}
2202
2203static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2204{
2205 struct r5conf *conf = mddev->private;
2206 int ret;
2207
2208 if (!conf->log)
2209 return 0;
2210
2211 switch (conf->log->r5c_journal_mode) {
2212 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2213 ret = snprintf(
2214 page, PAGE_SIZE, "[%s] %s\n",
2215 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2216 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2217 break;
2218 case R5C_JOURNAL_MODE_WRITE_BACK:
2219 ret = snprintf(
2220 page, PAGE_SIZE, "%s [%s]\n",
2221 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2222 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2223 break;
2224 default:
2225 ret = 0;
2226 }
2227 return ret;
2228}
2229
2230static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2231 const char *page, size_t length)
2232{
2233 struct r5conf *conf = mddev->private;
2234 struct r5l_log *log = conf->log;
2235 int val = -1, i;
2236 int len = length;
2237
2238 if (!log)
2239 return -ENODEV;
2240
2241 if (len && page[len - 1] == '\n')
2242 len -= 1;
2243 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2244 if (strlen(r5c_journal_mode_str[i]) == len &&
2245 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2246 val = i;
2247 break;
2248 }
2249 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2250 val > R5C_JOURNAL_MODE_WRITE_BACK)
2251 return -EINVAL;
2252
2253 mddev_suspend(mddev);
2254 conf->log->r5c_journal_mode = val;
2255 mddev_resume(mddev);
2256
2257 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2258 mdname(mddev), val, r5c_journal_mode_str[val]);
2259 return length;
2260}
2261
2262struct md_sysfs_entry
2263r5c_journal_mode = __ATTR(journal_mode, 0644,
2264 r5c_journal_mode_show, r5c_journal_mode_store);
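
The new attribute is driven like any other md sysfs file. Below is a minimal user-space sketch of switching an array to write-back; the path assumes an array named md0 with the attribute exposed alongside the other raid5 attributes under /sys/block/md0/md/, and the mode strings are assumed to be "write-through" and "write-back".

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/block/md0/md/journal_mode";   /* assumed path */
        char buf[64] = "";
        FILE *f = fopen(path, "r");

        if (f) {
                if (fgets(buf, sizeof(buf), f))
                        printf("current: %s", buf);   /* e.g. "[write-through] write-back" */
                fclose(f);
        }

        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return 1;
        }
        fputs("write-back\n", f);   /* the store routine strips the trailing newline */
        fclose(f);
        return 0;
}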
2265
2266/*
2267 * Try to handle a write operation in the caching phase. This function should
2268 * only be called in write-back mode.
2269 *
2270 * If all outstanding writes can be handled in the caching phase, returns 0.
2271 * If the writes require the write-out phase, calls r5c_make_stripe_write_out()
2272 * and returns -EAGAIN.
2273 */
2274int r5c_try_caching_write(struct r5conf *conf,
2275 struct stripe_head *sh,
2276 struct stripe_head_state *s,
2277 int disks)
2278{
2279 struct r5l_log *log = conf->log;
2280 int i;
2281 struct r5dev *dev;
2282 int to_cache = 0;
2283
2284 BUG_ON(!r5c_is_writeback(log));
2285
2286 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2287 /*
2288 * There are two different scenarios here:
2289 * 1. The stripe has some data cached, and it is sent to
2290 * write-out phase for reclaim
2291 * 2. The stripe is clean, and this is the first write
2292 *
2293 * For 1, return -EAGAIN, so we continue with
2294 * handle_stripe_dirtying().
2295 *
2296 * For 2, set STRIPE_R5C_CACHING and continue with caching
2297 * write.
2298 */
2299
2300 /* case 1: anything in the journal or anything written */
2301 if (s->injournal > 0 || s->written > 0)
2302 return -EAGAIN;
2303 /* case 2 */
2304 set_bit(STRIPE_R5C_CACHING, &sh->state);
2305 }
2306
2307 for (i = disks; i--; ) {
2308 dev = &sh->dev[i];
2309 /* if non-overwrite, use writing-out phase */
2310 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2311 !test_bit(R5_InJournal, &dev->flags)) {
2312 r5c_make_stripe_write_out(sh);
2313 return -EAGAIN;
2314 }
2315 }
2316
2317 for (i = disks; i--; ) {
2318 dev = &sh->dev[i];
2319 if (dev->towrite) {
2320 set_bit(R5_Wantwrite, &dev->flags);
2321 set_bit(R5_Wantdrain, &dev->flags);
2322 set_bit(R5_LOCKED, &dev->flags);
2323 to_cache++;
2324 }
2325 }
2326
2327 if (to_cache) {
2328 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2329 /*
2330 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
2331 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
2332 * r5c_handle_data_cached()
2333 */
2334 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2335 }
2336
2337 return 0;
2338}
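
A simplified user-space model of the decision above, for illustration only; the struct fields are stand-ins for the stripe and device state that r5c_try_caching_write() inspects.

#include <stdbool.h>
#include <stdio.h>

struct fake_dev {
        bool towrite;     /* pending write on this device                */
        bool overwrite;   /* write covers the whole block (R5_OVERWRITE) */
        bool in_journal;  /* block already cached (R5_InJournal)         */
};

struct fake_stripe {
        bool caching;     /* STRIPE_R5C_CACHING */
        int  injournal;   /* s->injournal       */
        int  written;     /* s->written         */
};

/* true: the writes can stay in the caching phase;
 * false: the stripe must go to the write-out phase (-EAGAIN in the kernel) */
static bool can_cache_writes(struct fake_stripe *sh,
                             struct fake_dev *devs, int disks)
{
        if (!sh->caching) {
                /* case 1: stripe is being reclaimed or already flushing */
                if (sh->injournal > 0 || sh->written > 0)
                        return false;
                /* case 2: first write to a clean stripe */
                sh->caching = true;
        }

        /* non-overwrite writes need the old data, so use write-out */
        for (int i = 0; i < disks; i++)
                if (devs[i].towrite && !devs[i].overwrite && !devs[i].in_journal)
                        return false;

        return true;
}

int main(void)
{
        struct fake_dev devs[4] = {
                { .towrite = true, .overwrite = true  },
                { .towrite = true, .overwrite = false },   /* partial write */
        };
        struct fake_stripe sh = { 0 };

        printf("cacheable: %d\n", can_cache_writes(&sh, devs, 4));  /* 0: partial write forces write-out */
        return 0;
}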
2339
2340/*
2341 * free extra pages (orig_page) we allocated for prexor
2342 */
2343void r5c_release_extra_page(struct stripe_head *sh)
2344{
2345 struct r5conf *conf = sh->raid_conf;
2346 int i;
2347 bool using_disk_info_extra_page;
2348
2349 using_disk_info_extra_page =
2350 sh->dev[0].orig_page == conf->disks[0].extra_page;
2351
2352 for (i = sh->disks; i--; )
2353 if (sh->dev[i].page != sh->dev[i].orig_page) {
2354 struct page *p = sh->dev[i].orig_page;
2355
2356 sh->dev[i].orig_page = sh->dev[i].page;
2357 if (!using_disk_info_extra_page)
2358 put_page(p);
2359 }
2360
2361 if (using_disk_info_extra_page) {
2362 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
2363 md_wakeup_thread(conf->mddev->thread);
2364 }
2365}
2366
2367void r5c_use_extra_page(struct stripe_head *sh)
2368{
2369 struct r5conf *conf = sh->raid_conf;
2370 int i;
2371 struct r5dev *dev;
2372
2373 for (i = sh->disks; i--; ) {
2374 dev = &sh->dev[i];
2375 if (dev->orig_page != dev->page)
2376 put_page(dev->orig_page);
2377 dev->orig_page = conf->disks[i].extra_page;
2378 }
2379}
2380
2381/*
2382 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
2383 * stripe is committed to RAID disks.
2384 */
2385void r5c_finish_stripe_write_out(struct r5conf *conf,
2386 struct stripe_head *sh,
2387 struct stripe_head_state *s)
2388{
2389 int i;
2390 int do_wakeup = 0;
2391
2392 if (!conf->log ||
2393 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2394 return;
2395
2396 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2397 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2398
2399 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2400 return;
2401
2402 for (i = sh->disks; i--; ) {
2403 clear_bit(R5_InJournal, &sh->dev[i].flags);
2404 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2405 do_wakeup = 1;
2406 }
2407
2408 /*
2409 * analyse_stripe() ran before r5c_finish_stripe_write_out(), when
2410 * R5_InJournal was still set. We just cleared it, so also reset s->injournal.
2411 */
2412 s->injournal = 0;
2413
2414 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2415 if (atomic_dec_and_test(&conf->pending_full_writes))
2416 md_wakeup_thread(conf->mddev->thread);
2417
2418 if (do_wakeup)
2419 wake_up(&conf->wait_for_overlap);
2420
2421 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2422 return;
2423
2424 spin_lock_irq(&conf->log->stripe_in_journal_lock);
2425 list_del_init(&sh->r5c);
2426 spin_unlock_irq(&conf->log->stripe_in_journal_lock);
2427 sh->log_start = MaxSector;
2428 atomic_dec(&conf->log->stripe_in_journal_count);
2429 r5c_update_log_state(conf->log);
2430}
2431
2432int
2433r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
2434 struct stripe_head_state *s)
2435{
2436 struct r5conf *conf = sh->raid_conf;
2437 int pages = 0;
2438 int reserve;
2439 int i;
2440 int ret = 0;
2441
2442 BUG_ON(!log);
2443
2444 for (i = 0; i < sh->disks; i++) {
2445 void *addr;
2446
2447 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2448 continue;
2449 addr = kmap_atomic(sh->dev[i].page);
2450 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2451 addr, PAGE_SIZE);
2452 kunmap_atomic(addr);
2453 pages++;
2454 }
2455 WARN_ON(pages == 0);
2456
2457 /*
2458 * The stripe must enter the state machine again to call endio, so
2459 * don't delay.
2460 */
2461 clear_bit(STRIPE_DELAYED, &sh->state);
2462 atomic_inc(&sh->count);
2463
2464 mutex_lock(&log->io_mutex);
2465 /* meta + data */
2466 reserve = (1 + pages) << (PAGE_SHIFT - 9);
2467
2468 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2469 sh->log_start == MaxSector)
2470 r5l_add_no_space_stripe(log, sh);
2471 else if (!r5l_has_free_space(log, reserve)) {
2472 if (sh->log_start == log->last_checkpoint)
2473 BUG();
2474 else
2475 r5l_add_no_space_stripe(log, sh);
2476 } else {
2477 ret = r5l_log_stripe(log, sh, pages, 0);
2478 if (ret) {
2479 spin_lock_irq(&log->io_list_lock);
2480 list_add_tail(&sh->log_list, &log->no_mem_stripes);
2481 spin_unlock_irq(&log->io_list_lock);
2482 }
2483 }
2484
2485 mutex_unlock(&log->io_mutex);
2486 return 0;
1114} 2487}
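
The reservation above is counted in 512-byte sectors: one meta block plus one block per cached data page. A quick stand-alone check of the arithmetic with 4KiB pages (the page count is illustrative):

#include <stdio.h>

int main(void)
{
        const int page_shift = 12;                            /* 4KiB pages */
        const int sectors_per_block = 1 << (page_shift - 9);  /* 8 sectors  */
        int pages = 3;                                        /* data pages being cached */
        int reserve = (1 + pages) << (page_shift - 9);

        /* meta block + 3 data blocks = 4 blocks = 32 sectors */
        printf("reserve = %d sectors (%d per block)\n", reserve, sectors_per_block);
        return 0;
}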
1115 2488
1116static int r5l_load_log(struct r5l_log *log) 2489static int r5l_load_log(struct r5l_log *log)
@@ -1121,7 +2494,7 @@ static int r5l_load_log(struct r5l_log *log)
1121 sector_t cp = log->rdev->journal_tail; 2494 sector_t cp = log->rdev->journal_tail;
1122 u32 stored_crc, expected_crc; 2495 u32 stored_crc, expected_crc;
1123 bool create_super = false; 2496 bool create_super = false;
1124 int ret; 2497 int ret = 0;
1125 2498
1126 /* Make sure it's valid */ 2499 /* Make sure it's valid */
1127 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 2500 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
@@ -1171,11 +2544,18 @@ create:
1171 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 2544 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1172 log->max_free_space = RECLAIM_MAX_FREE_SPACE; 2545 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1173 log->last_checkpoint = cp; 2546 log->last_checkpoint = cp;
1174 log->next_checkpoint = cp;
1175 2547
1176 __free_page(page); 2548 __free_page(page);
1177 2549
1178 return r5l_recovery_log(log); 2550 if (create_super) {
2551 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
2552 log->seq = log->last_cp_seq + 1;
2553 log->next_checkpoint = cp;
2554 } else
2555 ret = r5l_recovery_log(log);
2556
2557 r5c_update_log_state(log);
2558 return ret;
1179ioerr: 2559ioerr:
1180 __free_page(page); 2560 __free_page(page);
1181 return ret; 2561 return ret;
@@ -1188,6 +2568,22 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1188 2568
1189 if (PAGE_SIZE != 4096) 2569 if (PAGE_SIZE != 4096)
1190 return -EINVAL; 2570 return -EINVAL;
2571
2572 /*
2573 * PAGE_SIZE must be big enough to hold one r5l_meta_block and
2574 * raid_disks r5l_payload_data_parity entries.
2575 *
2576 * The write journal and cache therefore do not work for very big arrays
2577 * (raid_disks > 203).
2578 */
2579 if (sizeof(struct r5l_meta_block) +
2580 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
2581 conf->raid_disks) > PAGE_SIZE) {
2582 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
2583 mdname(conf->mddev), conf->raid_disks);
2584 return -EINVAL;
2585 }
2586
1191 log = kzalloc(sizeof(*log), GFP_KERNEL); 2587 log = kzalloc(sizeof(*log), GFP_KERNEL);
1192 if (!log) 2588 if (!log)
1193 return -ENOMEM; 2589 return -ENOMEM;
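
Roughly where the 203 in the check above comes from, assuming the md_p.h layouts give sizeof(struct r5l_meta_block) == 32 and sizeof(struct r5l_payload_data_parity) == 16 with one extra __le32 checksum per payload; the sketch below just redoes the division in user space with those sizes hard-coded as assumptions.

#include <stdio.h>

int main(void)
{
        const int page_size = 4096;
        const int meta_block_size = 32;      /* assumed sizeof(struct r5l_meta_block) */
        const int payload_size = 16 + 4;     /* assumed payload + one __le32 checksum */
        int max_disks = (page_size - meta_block_size) / payload_size;

        printf("max raid_disks for one meta page: %d\n", max_disks);  /* 203 */
        return 0;
}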
@@ -1227,6 +2623,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1227 log->rdev->mddev, "reclaim"); 2623 log->rdev->mddev, "reclaim");
1228 if (!log->reclaim_thread) 2624 if (!log->reclaim_thread)
1229 goto reclaim_thread; 2625 goto reclaim_thread;
2626 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
2627
1230 init_waitqueue_head(&log->iounit_wait); 2628 init_waitqueue_head(&log->iounit_wait);
1231 2629
1232 INIT_LIST_HEAD(&log->no_mem_stripes); 2630 INIT_LIST_HEAD(&log->no_mem_stripes);
@@ -1234,6 +2632,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1234 INIT_LIST_HEAD(&log->no_space_stripes); 2632 INIT_LIST_HEAD(&log->no_space_stripes);
1235 spin_lock_init(&log->no_space_stripes_lock); 2633 spin_lock_init(&log->no_space_stripes_lock);
1236 2634
2635 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
2636
2637 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2638 INIT_LIST_HEAD(&log->stripe_in_journal_list);
2639 spin_lock_init(&log->stripe_in_journal_lock);
2640 atomic_set(&log->stripe_in_journal_count, 0);
2641
1237 if (r5l_load_log(log)) 2642 if (r5l_load_log(log))
1238 goto error; 2643 goto error;
1239 2644
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5f9e28443c8a..06d7279bdd04 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644);
70MODULE_PARM_DESC(devices_handle_discard_safely, 70MODULE_PARM_DESC(devices_handle_discard_safely,
71 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 71 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
72static struct workqueue_struct *raid5_wq; 72static struct workqueue_struct *raid5_wq;
73/*
74 * Stripe cache
75 */
76
77#define NR_STRIPES 256
78#define STRIPE_SIZE PAGE_SIZE
79#define STRIPE_SHIFT (PAGE_SHIFT - 9)
80#define STRIPE_SECTORS (STRIPE_SIZE>>9)
81#define IO_THRESHOLD 1
82#define BYPASS_THRESHOLD 1
83#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
84#define HASH_MASK (NR_HASH - 1)
85#define MAX_STRIPE_BATCH 8
86 73
87static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 74static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
88{ 75{
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
126 local_irq_enable(); 113 local_irq_enable();
127} 114}
128 115
129/* bio's attached to a stripe+device for I/O are linked together in bi_sector
130 * order without overlap. There may be several bio's per stripe+device, and
131 * a bio could span several devices.
132 * When walking this list for a particular stripe+device, we must never proceed
133 * beyond a bio that extends past this device, as the next bio might no longer
134 * be valid.
135 * This function is used to determine the 'next' bio in the list, given the sector
136 * of the current stripe+device
137 */
138static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
139{
140 int sectors = bio_sectors(bio);
141 if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
142 return bio->bi_next;
143 else
144 return NULL;
145}
146
147/*
148 * We maintain a biased count of active stripes in the bottom 16 bits of
149 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
150 */
151static inline int raid5_bi_processed_stripes(struct bio *bio)
152{
153 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
154 return (atomic_read(segments) >> 16) & 0xffff;
155}
156
157static inline int raid5_dec_bi_active_stripes(struct bio *bio)
158{
159 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
160 return atomic_sub_return(1, segments) & 0xffff;
161}
162
163static inline void raid5_inc_bi_active_stripes(struct bio *bio)
164{
165 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
166 atomic_inc(segments);
167}
168
169static inline void raid5_set_bi_processed_stripes(struct bio *bio,
170 unsigned int cnt)
171{
172 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
173 int old, new;
174
175 do {
176 old = atomic_read(segments);
177 new = (old & 0xffff) | (cnt << 16);
178 } while (atomic_cmpxchg(segments, old, new) != old);
179}
180
181static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
182{
183 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
184 atomic_set(segments, cnt);
185}
186
187/* Find first data disk in a raid6 stripe */ 116/* Find first data disk in a raid6 stripe */
188static inline int raid6_d0(struct stripe_head *sh) 117static inline int raid6_d0(struct stripe_head *sh)
189{ 118{
@@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
289static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 218static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
290 struct list_head *temp_inactive_list) 219 struct list_head *temp_inactive_list)
291{ 220{
221 int i;
222 int injournal = 0; /* number of data pages with R5_InJournal */
223
292 BUG_ON(!list_empty(&sh->lru)); 224 BUG_ON(!list_empty(&sh->lru));
293 BUG_ON(atomic_read(&conf->active_stripes)==0); 225 BUG_ON(atomic_read(&conf->active_stripes)==0);
226
227 if (r5c_is_writeback(conf->log))
228 for (i = sh->disks; i--; )
229 if (test_bit(R5_InJournal, &sh->dev[i].flags))
230 injournal++;
231 /*
232 * When quiescing in r5c write-back mode, set STRIPE_HANDLE for stripes with
233 * data in the journal, so they are not released to the cached lists
234 */
235 if (conf->quiesce && r5c_is_writeback(conf->log) &&
236 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
237 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
238 r5c_make_stripe_write_out(sh);
239 set_bit(STRIPE_HANDLE, &sh->state);
240 }
241
294 if (test_bit(STRIPE_HANDLE, &sh->state)) { 242 if (test_bit(STRIPE_HANDLE, &sh->state)) {
295 if (test_bit(STRIPE_DELAYED, &sh->state) && 243 if (test_bit(STRIPE_DELAYED, &sh->state) &&
296 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 244 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
@@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
316 < IO_THRESHOLD) 264 < IO_THRESHOLD)
317 md_wakeup_thread(conf->mddev->thread); 265 md_wakeup_thread(conf->mddev->thread);
318 atomic_dec(&conf->active_stripes); 266 atomic_dec(&conf->active_stripes);
319 if (!test_bit(STRIPE_EXPANDING, &sh->state)) 267 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
320 list_add_tail(&sh->lru, temp_inactive_list); 268 if (!r5c_is_writeback(conf->log))
269 list_add_tail(&sh->lru, temp_inactive_list);
270 else {
271 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
272 if (injournal == 0)
273 list_add_tail(&sh->lru, temp_inactive_list);
274 else if (injournal == conf->raid_disks - conf->max_degraded) {
275 /* full stripe */
276 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
277 atomic_inc(&conf->r5c_cached_full_stripes);
278 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
279 atomic_dec(&conf->r5c_cached_partial_stripes);
280 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
281 r5c_check_cached_full_stripe(conf);
282 } else {
283 /* partial stripe */
284 if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
285 &sh->state))
286 atomic_inc(&conf->r5c_cached_partial_stripes);
287 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
288 }
289 }
290 }
321 } 291 }
322} 292}
323 293
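
A compact restatement of the release-path classification above (user-space sketch, names are stand-ins): a cached stripe whose number of R5_InJournal data pages equals the number of data disks goes to the full-stripe list, anything in between goes to the partial list, and zero means the stripe can return to the inactive list.

#include <stdio.h>

enum r5c_list { INACTIVE_LIST, PARTIAL_STRIPE_LIST, FULL_STRIPE_LIST };

static enum r5c_list classify(int injournal, int raid_disks, int max_degraded)
{
        int data_disks = raid_disks - max_degraded;

        if (injournal == 0)
                return INACTIVE_LIST;
        if (injournal == data_disks)
                return FULL_STRIPE_LIST;        /* ready for efficient write-out */
        return PARTIAL_STRIPE_LIST;
}

int main(void)
{
        /* RAID5 with 5 disks: 4 data disks per stripe */
        printf("%d %d %d\n",
               classify(0, 5, 1),               /* 0: inactive */
               classify(2, 5, 1),               /* 1: partial  */
               classify(4, 5, 1));              /* 2: full     */
        return 0;
}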
@@ -541,7 +511,7 @@ retry:
541 511
542 if (dev->toread || dev->read || dev->towrite || dev->written || 512 if (dev->toread || dev->read || dev->towrite || dev->written ||
543 test_bit(R5_LOCKED, &dev->flags)) { 513 test_bit(R5_LOCKED, &dev->flags)) {
544 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 514 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
545 (unsigned long long)sh->sector, i, dev->toread, 515 (unsigned long long)sh->sector, i, dev->toread,
546 dev->read, dev->towrite, dev->written, 516 dev->read, dev->towrite, dev->written,
547 test_bit(R5_LOCKED, &dev->flags)); 517 test_bit(R5_LOCKED, &dev->flags));
@@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
680 } 650 }
681 if (noblock && sh == NULL) 651 if (noblock && sh == NULL)
682 break; 652 break;
653
654 r5c_check_stripe_cache_usage(conf);
683 if (!sh) { 655 if (!sh) {
684 set_bit(R5_INACTIVE_BLOCKED, 656 set_bit(R5_INACTIVE_BLOCKED,
685 &conf->cache_state); 657 &conf->cache_state);
658 r5l_wake_reclaim(conf->log, 0);
686 wait_event_lock_irq( 659 wait_event_lock_irq(
687 conf->wait_for_stripe, 660 conf->wait_for_stripe,
688 !list_empty(conf->inactive_list + hash) && 661 !list_empty(conf->inactive_list + hash) &&
@@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
901 874
902 might_sleep(); 875 might_sleep();
903 876
904 if (r5l_write_stripe(conf->log, sh) == 0) 877 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
905 return; 878 /* writing out phase */
879 if (s->waiting_extra_page)
880 return;
881 if (r5l_write_stripe(conf->log, sh) == 0)
882 return;
883 } else { /* caching phase */
884 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
885 r5c_cache_data(conf->log, sh, s);
886 return;
887 }
888 }
889
906 for (i = disks; i--; ) { 890 for (i = disks; i--; ) {
907 int op, op_flags = 0; 891 int op, op_flags = 0;
908 int replace_only = 0; 892 int replace_only = 0;
@@ -977,7 +961,7 @@ again:
977 if (bad < 0) { 961 if (bad < 0) {
978 set_bit(BlockedBadBlocks, &rdev->flags); 962 set_bit(BlockedBadBlocks, &rdev->flags);
979 if (!conf->mddev->external && 963 if (!conf->mddev->external &&
980 conf->mddev->flags) { 964 conf->mddev->sb_flags) {
981 /* It is very unlikely, but we might 965 /* It is very unlikely, but we might
982 * still need to write out the 966 * still need to write out the
983 * bad block log - better give it 967 * bad block log - better give it
@@ -1115,7 +1099,7 @@ again:
1115static struct dma_async_tx_descriptor * 1099static struct dma_async_tx_descriptor *
1116async_copy_data(int frombio, struct bio *bio, struct page **page, 1100async_copy_data(int frombio, struct bio *bio, struct page **page,
1117 sector_t sector, struct dma_async_tx_descriptor *tx, 1101 sector_t sector, struct dma_async_tx_descriptor *tx,
1118 struct stripe_head *sh) 1102 struct stripe_head *sh, int no_skipcopy)
1119{ 1103{
1120 struct bio_vec bvl; 1104 struct bio_vec bvl;
1121 struct bvec_iter iter; 1105 struct bvec_iter iter;
@@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
1155 if (frombio) { 1139 if (frombio) {
1156 if (sh->raid_conf->skip_copy && 1140 if (sh->raid_conf->skip_copy &&
1157 b_offset == 0 && page_offset == 0 && 1141 b_offset == 0 && page_offset == 0 &&
1158 clen == STRIPE_SIZE) 1142 clen == STRIPE_SIZE &&
1143 !no_skipcopy)
1159 *page = bio_page; 1144 *page = bio_page;
1160 else 1145 else
1161 tx = async_memcpy(*page, bio_page, page_offset, 1146 tx = async_memcpy(*page, bio_page, page_offset,
@@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh)
1237 while (rbi && rbi->bi_iter.bi_sector < 1222 while (rbi && rbi->bi_iter.bi_sector <
1238 dev->sector + STRIPE_SECTORS) { 1223 dev->sector + STRIPE_SECTORS) {
1239 tx = async_copy_data(0, rbi, &dev->page, 1224 tx = async_copy_data(0, rbi, &dev->page,
1240 dev->sector, tx, sh); 1225 dev->sector, tx, sh, 0);
1241 rbi = r5_next_bio(rbi, dev->sector); 1226 rbi = r5_next_bio(rbi, dev->sector);
1242 } 1227 }
1243 } 1228 }
@@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs,
1364 if (i == sh->qd_idx || i == sh->pd_idx || 1349 if (i == sh->qd_idx || i == sh->pd_idx ||
1365 (srctype == SYNDROME_SRC_ALL) || 1350 (srctype == SYNDROME_SRC_ALL) ||
1366 (srctype == SYNDROME_SRC_WANT_DRAIN && 1351 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1367 test_bit(R5_Wantdrain, &dev->flags)) || 1352 (test_bit(R5_Wantdrain, &dev->flags) ||
1353 test_bit(R5_InJournal, &dev->flags))) ||
1368 (srctype == SYNDROME_SRC_WRITTEN && 1354 (srctype == SYNDROME_SRC_WRITTEN &&
1369 dev->written)) 1355 dev->written)) {
1370 srcs[slot] = sh->dev[i].page; 1356 if (test_bit(R5_InJournal, &dev->flags))
1357 srcs[slot] = sh->dev[i].orig_page;
1358 else
1359 srcs[slot] = sh->dev[i].page;
1360 }
1371 i = raid6_next_disk(i, disks); 1361 i = raid6_next_disk(i, disks);
1372 } while (i != d0_idx); 1362 } while (i != d0_idx);
1373 1363
@@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
1546 1536
1547 pr_debug("%s: stripe %llu\n", __func__, 1537 pr_debug("%s: stripe %llu\n", __func__,
1548 (unsigned long long)sh->sector); 1538 (unsigned long long)sh->sector);
1539
1540 if (r5c_is_writeback(sh->raid_conf->log))
1541 /*
1542 * raid5-cache write back uses orig_page during prexor.
1543 * After prexor, it is time to free orig_page
1544 */
1545 r5c_release_extra_page(sh);
1549} 1546}
1550 1547
1551static struct dma_async_tx_descriptor * 1548static struct dma_async_tx_descriptor *
@@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1567 for (i = disks; i--; ) { 1564 for (i = disks; i--; ) {
1568 struct r5dev *dev = &sh->dev[i]; 1565 struct r5dev *dev = &sh->dev[i];
1569 /* Only process blocks that are known to be uptodate */ 1566 /* Only process blocks that are known to be uptodate */
1570 if (test_bit(R5_Wantdrain, &dev->flags)) 1567 if (test_bit(R5_InJournal, &dev->flags))
1568 xor_srcs[count++] = dev->orig_page;
1569 else if (test_bit(R5_Wantdrain, &dev->flags))
1571 xor_srcs[count++] = dev->page; 1570 xor_srcs[count++] = dev->page;
1572 } 1571 }
1573 1572
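
The reason prexor pulls orig_page for R5_InJournal blocks is the usual read-modify-write identity: the parity is first xor-ed with the old data (which, for cached blocks, lives in orig_page) and later xor-ed with the new data. A byte-sized stand-alone demo of that identity with arbitrary values:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned char d0_old = 0x5a, d1 = 0x3c;     /* two data blocks */
        unsigned char parity = d0_old ^ d1;         /* current parity  */
        unsigned char d0_new = 0xa7;                /* rewrite of d0   */

        parity ^= d0_old;                           /* prexor: strip the old data  */
        parity ^= d0_new;                           /* xor-drain: add the new data */

        assert(parity == (d0_new ^ d1));            /* same as recomputing from scratch */
        printf("new parity 0x%02x\n", parity);
        return 0;
}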
@@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1601static struct dma_async_tx_descriptor * 1600static struct dma_async_tx_descriptor *
1602ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1601ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1603{ 1602{
1603 struct r5conf *conf = sh->raid_conf;
1604 int disks = sh->disks; 1604 int disks = sh->disks;
1605 int i; 1605 int i;
1606 struct stripe_head *head_sh = sh; 1606 struct stripe_head *head_sh = sh;
@@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1618 1618
1619again: 1619again:
1620 dev = &sh->dev[i]; 1620 dev = &sh->dev[i];
1621 /*
1622 * clear R5_InJournal, so that when a page already in the
1623 * journal is rewritten, it is not skipped by r5l_log_stripe()
1624 */
1625 clear_bit(R5_InJournal, &dev->flags);
1621 spin_lock_irq(&sh->stripe_lock); 1626 spin_lock_irq(&sh->stripe_lock);
1622 chosen = dev->towrite; 1627 chosen = dev->towrite;
1623 dev->towrite = NULL; 1628 dev->towrite = NULL;
@@ -1637,8 +1642,10 @@ again:
1637 set_bit(R5_Discard, &dev->flags); 1642 set_bit(R5_Discard, &dev->flags);
1638 else { 1643 else {
1639 tx = async_copy_data(1, wbi, &dev->page, 1644 tx = async_copy_data(1, wbi, &dev->page,
1640 dev->sector, tx, sh); 1645 dev->sector, tx, sh,
1641 if (dev->page != dev->orig_page) { 1646 r5c_is_writeback(conf->log));
1647 if (dev->page != dev->orig_page &&
1648 !r5c_is_writeback(conf->log)) {
1642 set_bit(R5_SkipCopy, &dev->flags); 1649 set_bit(R5_SkipCopy, &dev->flags);
1643 clear_bit(R5_UPTODATE, &dev->flags); 1650 clear_bit(R5_UPTODATE, &dev->flags);
1644 clear_bit(R5_OVERWRITE, &dev->flags); 1651 clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1746,7 +1753,8 @@ again:
1746 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1753 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1747 for (i = disks; i--; ) { 1754 for (i = disks; i--; ) {
1748 struct r5dev *dev = &sh->dev[i]; 1755 struct r5dev *dev = &sh->dev[i];
1749 if (head_sh->dev[i].written) 1756 if (head_sh->dev[i].written ||
1757 test_bit(R5_InJournal, &head_sh->dev[i].flags))
1750 xor_srcs[count++] = dev->page; 1758 xor_srcs[count++] = dev->page;
1751 } 1759 }
1752 } else { 1760 } else {
@@ -2000,7 +2008,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2000 spin_lock_init(&sh->batch_lock); 2008 spin_lock_init(&sh->batch_lock);
2001 INIT_LIST_HEAD(&sh->batch_list); 2009 INIT_LIST_HEAD(&sh->batch_list);
2002 INIT_LIST_HEAD(&sh->lru); 2010 INIT_LIST_HEAD(&sh->lru);
2011 INIT_LIST_HEAD(&sh->r5c);
2012 INIT_LIST_HEAD(&sh->log_list);
2003 atomic_set(&sh->count, 1); 2013 atomic_set(&sh->count, 1);
2014 sh->log_start = MaxSector;
2004 for (i = 0; i < disks; i++) { 2015 for (i = 0; i < disks; i++) {
2005 struct r5dev *dev = &sh->dev[i]; 2016 struct r5dev *dev = &sh->dev[i];
2006 2017
@@ -2240,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
2240 */ 2251 */
2241 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2252 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
2242 if (ndisks) { 2253 if (ndisks) {
2243 for (i=0; i<conf->raid_disks; i++) 2254 for (i = 0; i < conf->pool_size; i++)
2244 ndisks[i] = conf->disks[i]; 2255 ndisks[i] = conf->disks[i];
2245 kfree(conf->disks); 2256
2246 conf->disks = ndisks; 2257 for (i = conf->pool_size; i < newsize; i++) {
2258 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2259 if (!ndisks[i].extra_page)
2260 err = -ENOMEM;
2261 }
2262
2263 if (err) {
2264 for (i = conf->pool_size; i < newsize; i++)
2265 if (ndisks[i].extra_page)
2266 put_page(ndisks[i].extra_page);
2267 kfree(ndisks);
2268 } else {
2269 kfree(conf->disks);
2270 conf->disks = ndisks;
2271 }
2247 } else 2272 } else
2248 err = -ENOMEM; 2273 err = -ENOMEM;
2249 2274
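
The same allocate-everything-or-roll-back pattern, reduced to a user-space sketch (malloc/free stand in for alloc_page/put_page; the sizes and counts are illustrative):

#include <stdlib.h>

static int grow_pages(void **pages, int old_n, int new_n)
{
        int err = 0;

        for (int i = old_n; i < new_n; i++) {
                pages[i] = malloc(4096);
                if (!pages[i])
                        err = -1;               /* keep going, remember the failure */
        }

        if (err) {                              /* roll back everything we got */
                for (int i = old_n; i < new_n; i++) {
                        free(pages[i]);
                        pages[i] = NULL;
                }
        }
        return err;
}

int main(void)
{
        void *pages[8] = { 0 };

        return grow_pages(pages, 4, 8) ? 1 : 0;
}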
@@ -2342,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi)
2342 * replacement device. We just fail those on 2367 * replacement device. We just fail those on
2343 * any error 2368 * any error
2344 */ 2369 */
2345 printk_ratelimited( 2370 pr_info_ratelimited(
2346 KERN_INFO 2371 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2347 "md/raid:%s: read error corrected"
2348 " (%lu sectors at %llu on %s)\n",
2349 mdname(conf->mddev), STRIPE_SECTORS, 2372 mdname(conf->mddev), STRIPE_SECTORS,
2350 (unsigned long long)s, 2373 (unsigned long long)s,
2351 bdevname(rdev->bdev, b)); 2374 bdevname(rdev->bdev, b));
@@ -2365,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi)
2365 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2388 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2366 atomic_inc(&rdev->read_errors); 2389 atomic_inc(&rdev->read_errors);
2367 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2390 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2368 printk_ratelimited( 2391 pr_warn_ratelimited(
2369 KERN_WARNING 2392 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2370 "md/raid:%s: read error on replacement device "
2371 "(sector %llu on %s).\n",
2372 mdname(conf->mddev), 2393 mdname(conf->mddev),
2373 (unsigned long long)s, 2394 (unsigned long long)s,
2374 bdn); 2395 bdn);
2375 else if (conf->mddev->degraded >= conf->max_degraded) { 2396 else if (conf->mddev->degraded >= conf->max_degraded) {
2376 set_bad = 1; 2397 set_bad = 1;
2377 printk_ratelimited( 2398 pr_warn_ratelimited(
2378 KERN_WARNING 2399 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2379 "md/raid:%s: read error not correctable "
2380 "(sector %llu on %s).\n",
2381 mdname(conf->mddev), 2400 mdname(conf->mddev),
2382 (unsigned long long)s, 2401 (unsigned long long)s,
2383 bdn); 2402 bdn);
2384 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2403 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2385 /* Oh, no!!! */ 2404 /* Oh, no!!! */
2386 set_bad = 1; 2405 set_bad = 1;
2387 printk_ratelimited( 2406 pr_warn_ratelimited(
2388 KERN_WARNING 2407 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2389 "md/raid:%s: read error NOT corrected!! "
2390 "(sector %llu on %s).\n",
2391 mdname(conf->mddev), 2408 mdname(conf->mddev),
2392 (unsigned long long)s, 2409 (unsigned long long)s,
2393 bdn); 2410 bdn);
2394 } else if (atomic_read(&rdev->read_errors) 2411 } else if (atomic_read(&rdev->read_errors)
2395 > conf->max_nr_stripes) 2412 > conf->max_nr_stripes)
2396 printk(KERN_WARNING 2413 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2397 "md/raid:%s: Too many read errors, failing device %s.\n",
2398 mdname(conf->mddev), bdn); 2414 mdname(conf->mddev), bdn);
2399 else 2415 else
2400 retry = 1; 2416 retry = 1;
@@ -2526,15 +2542,14 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2526 2542
2527 set_bit(Blocked, &rdev->flags); 2543 set_bit(Blocked, &rdev->flags);
2528 set_bit(Faulty, &rdev->flags); 2544 set_bit(Faulty, &rdev->flags);
2529 set_mask_bits(&mddev->flags, 0, 2545 set_mask_bits(&mddev->sb_flags, 0,
2530 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 2546 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2531 printk(KERN_ALERT 2547 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2532 "md/raid:%s: Disk failure on %s, disabling device.\n" 2548 "md/raid:%s: Operation continuing on %d devices.\n",
2533 "md/raid:%s: Operation continuing on %d devices.\n", 2549 mdname(mddev),
2534 mdname(mddev), 2550 bdevname(rdev->bdev, b),
2535 bdevname(rdev->bdev, b), 2551 mdname(mddev),
2536 mdname(mddev), 2552 conf->raid_disks - mddev->degraded);
2537 conf->raid_disks - mddev->degraded);
2538} 2553}
2539 2554
2540/* 2555/*
@@ -2856,8 +2871,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2856 previous, &dummy1, &sh2); 2871 previous, &dummy1, &sh2);
2857 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2872 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2858 || sh2.qd_idx != sh->qd_idx) { 2873 || sh2.qd_idx != sh->qd_idx) {
2859 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2874 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
2860 mdname(conf->mddev)); 2875 mdname(conf->mddev));
2861 return 0; 2876 return 0;
2862 } 2877 }
2863 return r_sector; 2878 return r_sector;
@@ -2872,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2872 int level = conf->level; 2887 int level = conf->level;
2873 2888
2874 if (rcw) { 2889 if (rcw) {
2890 /*
2891 * In some cases, handle_stripe_dirtying() initially decides to
2892 * run rmw and allocates an extra page for prexor, but rcw turns out
2893 * to be cheaper later on. We need to free the extra page now,
2894 * because we won't be able to do that in ops_complete_prexor().
2895 */
2896 r5c_release_extra_page(sh);
2875 2897
2876 for (i = disks; i--; ) { 2898 for (i = disks; i--; ) {
2877 struct r5dev *dev = &sh->dev[i]; 2899 struct r5dev *dev = &sh->dev[i];
@@ -2882,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2882 if (!expand) 2904 if (!expand)
2883 clear_bit(R5_UPTODATE, &dev->flags); 2905 clear_bit(R5_UPTODATE, &dev->flags);
2884 s->locked++; 2906 s->locked++;
2907 } else if (test_bit(R5_InJournal, &dev->flags)) {
2908 set_bit(R5_LOCKED, &dev->flags);
2909 s->locked++;
2885 } 2910 }
2886 } 2911 }
2887 /* if we are not expanding this is a proper write request, and 2912 /* if we are not expanding this is a proper write request, and
@@ -2921,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2921 set_bit(R5_LOCKED, &dev->flags); 2946 set_bit(R5_LOCKED, &dev->flags);
2922 clear_bit(R5_UPTODATE, &dev->flags); 2947 clear_bit(R5_UPTODATE, &dev->flags);
2923 s->locked++; 2948 s->locked++;
2949 } else if (test_bit(R5_InJournal, &dev->flags)) {
2950 set_bit(R5_LOCKED, &dev->flags);
2951 s->locked++;
2924 } 2952 }
2925 } 2953 }
2926 if (!s->locked) 2954 if (!s->locked)
@@ -3564,10 +3592,10 @@ unhash:
3564 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3592 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3565} 3593}
3566 3594
3567static void handle_stripe_dirtying(struct r5conf *conf, 3595static int handle_stripe_dirtying(struct r5conf *conf,
3568 struct stripe_head *sh, 3596 struct stripe_head *sh,
3569 struct stripe_head_state *s, 3597 struct stripe_head_state *s,
3570 int disks) 3598 int disks)
3571{ 3599{
3572 int rmw = 0, rcw = 0, i; 3600 int rmw = 0, rcw = 0, i;
3573 sector_t recovery_cp = conf->mddev->recovery_cp; 3601 sector_t recovery_cp = conf->mddev->recovery_cp;
@@ -3592,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3592 } else for (i = disks; i--; ) { 3620 } else for (i = disks; i--; ) {
3593 /* would I have to read this buffer for read_modify_write */ 3621 /* would I have to read this buffer for read_modify_write */
3594 struct r5dev *dev = &sh->dev[i]; 3622 struct r5dev *dev = &sh->dev[i];
3595 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && 3623 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
3624 test_bit(R5_InJournal, &dev->flags)) &&
3596 !test_bit(R5_LOCKED, &dev->flags) && 3625 !test_bit(R5_LOCKED, &dev->flags) &&
3597 !(test_bit(R5_UPTODATE, &dev->flags) || 3626 !((test_bit(R5_UPTODATE, &dev->flags) &&
3627 (!test_bit(R5_InJournal, &dev->flags) ||
3628 dev->page != dev->orig_page)) ||
3598 test_bit(R5_Wantcompute, &dev->flags))) { 3629 test_bit(R5_Wantcompute, &dev->flags))) {
3599 if (test_bit(R5_Insync, &dev->flags)) 3630 if (test_bit(R5_Insync, &dev->flags))
3600 rmw++; 3631 rmw++;
@@ -3606,13 +3637,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3606 i != sh->pd_idx && i != sh->qd_idx && 3637 i != sh->pd_idx && i != sh->qd_idx &&
3607 !test_bit(R5_LOCKED, &dev->flags) && 3638 !test_bit(R5_LOCKED, &dev->flags) &&
3608 !(test_bit(R5_UPTODATE, &dev->flags) || 3639 !(test_bit(R5_UPTODATE, &dev->flags) ||
3609 test_bit(R5_Wantcompute, &dev->flags))) { 3640 test_bit(R5_InJournal, &dev->flags) ||
3641 test_bit(R5_Wantcompute, &dev->flags))) {
3610 if (test_bit(R5_Insync, &dev->flags)) 3642 if (test_bit(R5_Insync, &dev->flags))
3611 rcw++; 3643 rcw++;
3612 else 3644 else
3613 rcw += 2*disks; 3645 rcw += 2*disks;
3614 } 3646 }
3615 } 3647 }
3648
3616 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 3649 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3617 (unsigned long long)sh->sector, rmw, rcw); 3650 (unsigned long long)sh->sector, rmw, rcw);
3618 set_bit(STRIPE_HANDLE, &sh->state); 3651 set_bit(STRIPE_HANDLE, &sh->state);
@@ -3624,10 +3657,44 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3624 (unsigned long long)sh->sector, rmw); 3657 (unsigned long long)sh->sector, rmw);
3625 for (i = disks; i--; ) { 3658 for (i = disks; i--; ) {
3626 struct r5dev *dev = &sh->dev[i]; 3659 struct r5dev *dev = &sh->dev[i];
3627 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && 3660 if (test_bit(R5_InJournal, &dev->flags) &&
3661 dev->page == dev->orig_page &&
3662 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
3663 /* alloc page for prexor */
3664 struct page *p = alloc_page(GFP_NOIO);
3665
3666 if (p) {
3667 dev->orig_page = p;
3668 continue;
3669 }
3670
3671 /*
3672 * alloc_page() failed, try use
3673 * disk_info->extra_page
3674 */
3675 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3676 &conf->cache_state)) {
3677 r5c_use_extra_page(sh);
3678 break;
3679 }
3680
3681 /* extra_page in use, add to delayed_list */
3682 set_bit(STRIPE_DELAYED, &sh->state);
3683 s->waiting_extra_page = 1;
3684 return -EAGAIN;
3685 }
3686 }
3687
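
The three-step fallback above, restated as a small user-space sketch for illustration: try a private page per in-journal device, fall back to the single shared extra_page guarded by a busy bit, and if that is already taken, delay the stripe and retry later.

#include <stdbool.h>
#include <stdio.h>

/* outcome of trying to get a prexor page for one in-journal device */
enum prexor_page { GOT_PRIVATE_PAGE, GOT_EXTRA_PAGE, MUST_DELAY };

static enum prexor_page get_prexor_page(bool alloc_fails, bool *extra_page_in_use)
{
        if (!alloc_fails)
                return GOT_PRIVATE_PAGE;        /* alloc_page(GFP_NOIO) succeeded */

        if (!*extra_page_in_use) {              /* test_and_set R5C_EXTRA_PAGE_IN_USE */
                *extra_page_in_use = true;
                return GOT_EXTRA_PAGE;          /* r5c_use_extra_page() */
        }

        return MUST_DELAY;                      /* STRIPE_DELAYED + waiting_extra_page */
}

int main(void)
{
        bool extra_in_use = false;

        printf("%d\n", get_prexor_page(false, &extra_in_use)); /* 0: private page */
        printf("%d\n", get_prexor_page(true,  &extra_in_use)); /* 1: shared extra page */
        printf("%d\n", get_prexor_page(true,  &extra_in_use)); /* 2: delay and retry */
        return 0;
}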
3688 for (i = disks; i--; ) {
3689 struct r5dev *dev = &sh->dev[i];
3690 if ((dev->towrite ||
3691 i == sh->pd_idx || i == sh->qd_idx ||
3692 test_bit(R5_InJournal, &dev->flags)) &&
3628 !test_bit(R5_LOCKED, &dev->flags) && 3693 !test_bit(R5_LOCKED, &dev->flags) &&
3629 !(test_bit(R5_UPTODATE, &dev->flags) || 3694 !((test_bit(R5_UPTODATE, &dev->flags) &&
3630 test_bit(R5_Wantcompute, &dev->flags)) && 3695 (!test_bit(R5_InJournal, &dev->flags) ||
3696 dev->page != dev->orig_page)) ||
3697 test_bit(R5_Wantcompute, &dev->flags)) &&
3631 test_bit(R5_Insync, &dev->flags)) { 3698 test_bit(R5_Insync, &dev->flags)) {
3632 if (test_bit(STRIPE_PREREAD_ACTIVE, 3699 if (test_bit(STRIPE_PREREAD_ACTIVE,
3633 &sh->state)) { 3700 &sh->state)) {
@@ -3653,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3653 i != sh->pd_idx && i != sh->qd_idx && 3720 i != sh->pd_idx && i != sh->qd_idx &&
3654 !test_bit(R5_LOCKED, &dev->flags) && 3721 !test_bit(R5_LOCKED, &dev->flags) &&
3655 !(test_bit(R5_UPTODATE, &dev->flags) || 3722 !(test_bit(R5_UPTODATE, &dev->flags) ||
3723 test_bit(R5_InJournal, &dev->flags) ||
3656 test_bit(R5_Wantcompute, &dev->flags))) { 3724 test_bit(R5_Wantcompute, &dev->flags))) {
3657 rcw++; 3725 rcw++;
3658 if (test_bit(R5_Insync, &dev->flags) && 3726 if (test_bit(R5_Insync, &dev->flags) &&
@@ -3692,8 +3760,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
3692 */ 3760 */
3693 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3761 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
3694 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3762 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
3695 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3763 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
3696 schedule_reconstruction(sh, s, rcw == 0, 0); 3764 schedule_reconstruction(sh, s, rcw == 0, 0);
3765 return 0;
3697} 3766}
3698 3767
3699static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3768static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
@@ -3777,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3777 case check_state_compute_run: 3846 case check_state_compute_run:
3778 break; 3847 break;
3779 default: 3848 default:
3780 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3849 pr_err("%s: unknown check_state: %d sector: %llu\n",
3781 __func__, sh->check_state, 3850 __func__, sh->check_state,
3782 (unsigned long long) sh->sector); 3851 (unsigned long long) sh->sector);
3783 BUG(); 3852 BUG();
@@ -3941,9 +4010,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3941 case check_state_compute_run: 4010 case check_state_compute_run:
3942 break; 4011 break;
3943 default: 4012 default:
3944 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 4013 pr_warn("%s: unknown check_state: %d sector: %llu\n",
3945 __func__, sh->check_state, 4014 __func__, sh->check_state,
3946 (unsigned long long) sh->sector); 4015 (unsigned long long) sh->sector);
3947 BUG(); 4016 BUG();
3948 } 4017 }
3949} 4018}
@@ -4183,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4183 if (rdev && !test_bit(Faulty, &rdev->flags)) 4252 if (rdev && !test_bit(Faulty, &rdev->flags))
4184 do_recovery = 1; 4253 do_recovery = 1;
4185 } 4254 }
4255
4256 if (test_bit(R5_InJournal, &dev->flags))
4257 s->injournal++;
4258 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4259 s->just_cached++;
4186 } 4260 }
4187 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4261 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4188 /* If there is a failed device being replaced, 4262 /* If there is a failed device being replaced,
@@ -4411,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh)
4411 struct r5dev *dev = &sh->dev[i]; 4485 struct r5dev *dev = &sh->dev[i];
4412 if (test_bit(R5_LOCKED, &dev->flags) && 4486 if (test_bit(R5_LOCKED, &dev->flags) &&
4413 (i == sh->pd_idx || i == sh->qd_idx || 4487 (i == sh->pd_idx || i == sh->qd_idx ||
4414 dev->written)) { 4488 dev->written || test_bit(R5_InJournal,
4489 &dev->flags))) {
4415 pr_debug("Writing block %d\n", i); 4490 pr_debug("Writing block %d\n", i);
4416 set_bit(R5_Wantwrite, &dev->flags); 4491 set_bit(R5_Wantwrite, &dev->flags);
4417 if (prexor) 4492 if (prexor)
@@ -4451,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh)
4451 test_bit(R5_Discard, &qdev->flags)))))) 4526 test_bit(R5_Discard, &qdev->flags))))))
4452 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 4527 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
4453 4528
4529 if (s.just_cached)
4530 r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
4531 r5l_stripe_write_finished(sh);
4532
4454 /* Now we might consider reading some blocks, either to check/generate 4533 /* Now we might consider reading some blocks, either to check/generate
4455 * parity, or to satisfy requests 4534 * parity, or to satisfy requests
4456 * or to load a block that is being partially written. 4535 * or to load a block that is being partially written.
@@ -4462,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh)
4462 || s.expanding) 4541 || s.expanding)
4463 handle_stripe_fill(sh, &s, disks); 4542 handle_stripe_fill(sh, &s, disks);
4464 4543
4465 /* Now to consider new write requests and what else, if anything 4544 /*
4466 * should be read. We do not handle new writes when: 4545 * When the stripe finishes a full journal write cycle (write to the journal
4546 * and then to the raid disks), this is the clean-up procedure so it is ready
4547 * for the next operation.
4548 */
4549 r5c_finish_stripe_write_out(conf, sh, &s);
4550
4551 /*
4552 * Now to consider new write requests, cache write back and what else,
4553 * if anything should be read. We do not handle new writes when:
4467 * 1/ A 'write' operation (copy+xor) is already in flight. 4554 * 1/ A 'write' operation (copy+xor) is already in flight.
4468 * 2/ A 'check' operation is in flight, as it may clobber the parity 4555 * 2/ A 'check' operation is in flight, as it may clobber the parity
4469 * block. 4556 * block.
4557 * 3/ A r5c cache log write is in flight.
4470 */ 4558 */
4471 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 4559
4472 handle_stripe_dirtying(conf, sh, &s, disks); 4560 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4561 if (!r5c_is_writeback(conf->log)) {
4562 if (s.to_write)
4563 handle_stripe_dirtying(conf, sh, &s, disks);
4564 } else { /* write back cache */
4565 int ret = 0;
4566
4567 /* First, try handle writes in caching phase */
4568 if (s.to_write)
4569 ret = r5c_try_caching_write(conf, sh, &s,
4570 disks);
4571 /*
4572 * If caching phase failed: ret == -EAGAIN
4573 * OR
4574 * stripe under reclaim: !caching && injournal
4575 *
4576 * fall back to handle_stripe_dirtying()
4577 */
4578 if (ret == -EAGAIN ||
4579 /* stripe under reclaim: !caching && injournal */
4580 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4581 s.injournal > 0)) {
4582 ret = handle_stripe_dirtying(conf, sh, &s,
4583 disks);
4584 if (ret == -EAGAIN)
4585 goto finish;
4586 }
4587 }
4588 }
4473 4589
4474 /* maybe we need to check and possibly fix the parity for this stripe 4590 /* maybe we need to check and possibly fix the parity for this stripe
4475 * Any reads will already have been scheduled, so we just see if enough 4591 * Any reads will already have been scheduled, so we just see if enough
@@ -4640,9 +4756,7 @@ finish:
4640 } 4756 }
4641 4757
4642 if (!bio_list_empty(&s.return_bi)) { 4758 if (!bio_list_empty(&s.return_bi)) {
4643 if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) && 4759 if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4644 (s.failed <= conf->max_degraded ||
4645 conf->mddev->external == 0)) {
4646 spin_lock_irq(&conf->device_lock); 4760 spin_lock_irq(&conf->device_lock);
4647 bio_list_merge(&conf->return_bi, &s.return_bi); 4761 bio_list_merge(&conf->return_bi, &s.return_bi);
4648 spin_unlock_irq(&conf->device_lock); 4762 spin_unlock_irq(&conf->device_lock);
@@ -4698,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
4698 4812
4699 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 4813 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
4700 return 1; 4814 return 1;
4815
4816 /* Also checks whether there is pressure on r5cache log space */
4817 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
4818 return 1;
4701 if (conf->quiesce) 4819 if (conf->quiesce)
4702 return 1; 4820 return 1;
4703 if (atomic_read(&conf->empty_inactive_list_nr)) 4821 if (atomic_read(&conf->empty_inactive_list_nr))
@@ -5167,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5167 int remaining; 5285 int remaining;
5168 DEFINE_WAIT(w); 5286 DEFINE_WAIT(w);
5169 bool do_prepare; 5287 bool do_prepare;
5288 bool do_flush = false;
5170 5289
5171 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5290 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5172 int ret = r5l_handle_flush_request(conf->log, bi); 5291 int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5178,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5178 return; 5297 return;
5179 } 5298 }
5180 /* ret == -EAGAIN, fallback */ 5299 /* ret == -EAGAIN, fallback */
5300 /*
5301 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
5302 * we need to flush the journal device
5303 */
5304 do_flush = bi->bi_opf & REQ_PREFLUSH;
5181 } 5305 }
5182 5306
5183 md_write_start(mddev, bi); 5307 md_write_start(mddev, bi);
@@ -5188,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5188 * data on failed drives. 5312 * data on failed drives.
5189 */ 5313 */
5190 if (rw == READ && mddev->degraded == 0 && 5314 if (rw == READ && mddev->degraded == 0 &&
5315 !r5c_is_writeback(conf->log) &&
5191 mddev->reshape_position == MaxSector) { 5316 mddev->reshape_position == MaxSector) {
5192 bi = chunk_aligned_read(mddev, bi); 5317 bi = chunk_aligned_read(mddev, bi);
5193 if (!bi) 5318 if (!bi)
@@ -5316,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5316 do_prepare = true; 5441 do_prepare = true;
5317 goto retry; 5442 goto retry;
5318 } 5443 }
5444 if (do_flush) {
5445 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5446 /* we only need flush for one stripe */
5447 do_flush = false;
5448 }
5449
5319 set_bit(STRIPE_HANDLE, &sh->state); 5450 set_bit(STRIPE_HANDLE, &sh->state);
5320 clear_bit(STRIPE_DELAYED, &sh->state); 5451 clear_bit(STRIPE_DELAYED, &sh->state);
5321 if ((!sh->batch_head || sh == sh->batch_head) && 5452 if ((!sh->batch_head || sh == sh->batch_head) &&
@@ -5481,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
5481 mddev->reshape_position = conf->reshape_progress; 5612 mddev->reshape_position = conf->reshape_progress;
5482 mddev->curr_resync_completed = sector_nr; 5613 mddev->curr_resync_completed = sector_nr;
5483 conf->reshape_checkpoint = jiffies; 5614 conf->reshape_checkpoint = jiffies;
5484 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5615 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5485 md_wakeup_thread(mddev->thread); 5616 md_wakeup_thread(mddev->thread);
5486 wait_event(mddev->sb_wait, mddev->flags == 0 || 5617 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5487 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5618 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5488 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5619 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5489 return 0; 5620 return 0;
@@ -5579,10 +5710,10 @@ finish:
5579 mddev->reshape_position = conf->reshape_progress; 5710 mddev->reshape_position = conf->reshape_progress;
5580 mddev->curr_resync_completed = sector_nr; 5711 mddev->curr_resync_completed = sector_nr;
5581 conf->reshape_checkpoint = jiffies; 5712 conf->reshape_checkpoint = jiffies;
5582 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5713 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5583 md_wakeup_thread(mddev->thread); 5714 md_wakeup_thread(mddev->thread);
5584 wait_event(mddev->sb_wait, 5715 wait_event(mddev->sb_wait,
5585 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 5716 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
5586 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5717 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5587 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5718 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5588 goto ret; 5719 goto ret;
@@ -5857,10 +5988,10 @@ static void raid5d(struct md_thread *thread)
5857 md_check_recovery(mddev); 5988 md_check_recovery(mddev);
5858 5989
5859 if (!bio_list_empty(&conf->return_bi) && 5990 if (!bio_list_empty(&conf->return_bi) &&
5860 !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 5991 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
5861 struct bio_list tmp = BIO_EMPTY_LIST; 5992 struct bio_list tmp = BIO_EMPTY_LIST;
5862 spin_lock_irq(&conf->device_lock); 5993 spin_lock_irq(&conf->device_lock);
5863 if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 5994 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
5864 bio_list_merge(&tmp, &conf->return_bi); 5995 bio_list_merge(&tmp, &conf->return_bi);
5865 bio_list_init(&conf->return_bi); 5996 bio_list_init(&conf->return_bi);
5866 } 5997 }
@@ -5907,7 +6038,7 @@ static void raid5d(struct md_thread *thread)
5907 break; 6038 break;
5908 handled += batch_size; 6039 handled += batch_size;
5909 6040
5910 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 6041 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
5911 spin_unlock_irq(&conf->device_lock); 6042 spin_unlock_irq(&conf->device_lock);
5912 md_check_recovery(mddev); 6043 md_check_recovery(mddev);
5913 spin_lock_irq(&conf->device_lock); 6044 spin_lock_irq(&conf->device_lock);
@@ -6237,6 +6368,7 @@ static struct attribute *raid5_attrs[] = {
6237 &raid5_group_thread_cnt.attr, 6368 &raid5_group_thread_cnt.attr,
6238 &raid5_skip_copy.attr, 6369 &raid5_skip_copy.attr,
6239 &raid5_rmw_level.attr, 6370 &raid5_rmw_level.attr,
6371 &r5c_journal_mode.attr,
6240 NULL, 6372 NULL,
6241}; 6373};
6242static struct attribute_group raid5_attrs_group = { 6374static struct attribute_group raid5_attrs_group = {
@@ -6363,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf)
6363 6495
6364static void free_conf(struct r5conf *conf) 6496static void free_conf(struct r5conf *conf)
6365{ 6497{
6498 int i;
6499
6366 if (conf->log) 6500 if (conf->log)
6367 r5l_exit_log(conf->log); 6501 r5l_exit_log(conf->log);
6368 if (conf->shrinker.nr_deferred) 6502 if (conf->shrinker.nr_deferred)
@@ -6371,6 +6505,9 @@ static void free_conf(struct r5conf *conf)
6371 free_thread_groups(conf); 6505 free_thread_groups(conf);
6372 shrink_stripes(conf); 6506 shrink_stripes(conf);
6373 raid5_free_percpu(conf); 6507 raid5_free_percpu(conf);
6508 for (i = 0; i < conf->pool_size; i++)
6509 if (conf->disks[i].extra_page)
6510 put_page(conf->disks[i].extra_page);
6374 kfree(conf->disks); 6511 kfree(conf->disks);
6375 kfree(conf->stripe_hashtbl); 6512 kfree(conf->stripe_hashtbl);
6376 kfree(conf); 6513 kfree(conf);
@@ -6382,8 +6519,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6382 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6519 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6383 6520
6384 if (alloc_scratch_buffer(conf, percpu)) { 6521 if (alloc_scratch_buffer(conf, percpu)) {
6385 pr_err("%s: failed memory allocation for cpu%u\n", 6522 pr_warn("%s: failed memory allocation for cpu%u\n",
6386 __func__, cpu); 6523 __func__, cpu);
6387 return -ENOMEM; 6524 return -ENOMEM;
6388 } 6525 }
6389 return 0; 6526 return 0;
@@ -6453,29 +6590,29 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6453 if (mddev->new_level != 5 6590 if (mddev->new_level != 5
6454 && mddev->new_level != 4 6591 && mddev->new_level != 4
6455 && mddev->new_level != 6) { 6592 && mddev->new_level != 6) {
6456 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6593 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6457 mdname(mddev), mddev->new_level); 6594 mdname(mddev), mddev->new_level);
6458 return ERR_PTR(-EIO); 6595 return ERR_PTR(-EIO);
6459 } 6596 }
6460 if ((mddev->new_level == 5 6597 if ((mddev->new_level == 5
6461 && !algorithm_valid_raid5(mddev->new_layout)) || 6598 && !algorithm_valid_raid5(mddev->new_layout)) ||
6462 (mddev->new_level == 6 6599 (mddev->new_level == 6
6463 && !algorithm_valid_raid6(mddev->new_layout))) { 6600 && !algorithm_valid_raid6(mddev->new_layout))) {
6464 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 6601 pr_warn("md/raid:%s: layout %d not supported\n",
6465 mdname(mddev), mddev->new_layout); 6602 mdname(mddev), mddev->new_layout);
6466 return ERR_PTR(-EIO); 6603 return ERR_PTR(-EIO);
6467 } 6604 }
6468 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6605 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6469 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6606 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6470 mdname(mddev), mddev->raid_disks); 6607 mdname(mddev), mddev->raid_disks);
6471 return ERR_PTR(-EINVAL); 6608 return ERR_PTR(-EINVAL);
6472 } 6609 }
6473 6610
6474 if (!mddev->new_chunk_sectors || 6611 if (!mddev->new_chunk_sectors ||
6475 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6612 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6476 !is_power_of_2(mddev->new_chunk_sectors)) { 6613 !is_power_of_2(mddev->new_chunk_sectors)) {
6477 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 6614 pr_warn("md/raid:%s: invalid chunk size %d\n",
6478 mdname(mddev), mddev->new_chunk_sectors << 9); 6615 mdname(mddev), mddev->new_chunk_sectors << 9);
6479 return ERR_PTR(-EINVAL); 6616 return ERR_PTR(-EINVAL);
6480 } 6617 }
6481 6618
@@ -6517,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6517 6654
6518 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6655 conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
6519 GFP_KERNEL); 6656 GFP_KERNEL);
6657
6520 if (!conf->disks) 6658 if (!conf->disks)
6521 goto abort; 6659 goto abort;
6522 6660
6661 for (i = 0; i < max_disks; i++) {
6662 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
6663 if (!conf->disks[i].extra_page)
6664 goto abort;
6665 }
6666
6523 conf->mddev = mddev; 6667 conf->mddev = mddev;
6524 6668
6525 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6669 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -6540,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6540 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6684 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6541 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6685 INIT_LIST_HEAD(conf->temp_inactive_list + i);
6542 6686
6687 atomic_set(&conf->r5c_cached_full_stripes, 0);
6688 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
6689 atomic_set(&conf->r5c_cached_partial_stripes, 0);
6690 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
6691
6543 conf->level = mddev->new_level; 6692 conf->level = mddev->new_level;
6544 conf->chunk_sectors = mddev->new_chunk_sectors; 6693 conf->chunk_sectors = mddev->new_chunk_sectors;
6545 if (raid5_alloc_percpu(conf) != 0) 6694 if (raid5_alloc_percpu(conf) != 0)
@@ -6566,9 +6715,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6566 6715
6567 if (test_bit(In_sync, &rdev->flags)) { 6716 if (test_bit(In_sync, &rdev->flags)) {
6568 char b[BDEVNAME_SIZE]; 6717 char b[BDEVNAME_SIZE];
6569 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 6718 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
6570 " disk %d\n", 6719 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
6571 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
6572 } else if (rdev->saved_raid_disk != raid_disk) 6720 } else if (rdev->saved_raid_disk != raid_disk)
6573 /* Cannot rely on bitmap to complete recovery */ 6721 /* Cannot rely on bitmap to complete recovery */
6574 conf->fullsync = 1; 6722 conf->fullsync = 1;
@@ -6602,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6602 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 6750 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
6603 conf->min_nr_stripes = max(NR_STRIPES, stripes); 6751 conf->min_nr_stripes = max(NR_STRIPES, stripes);
6604 if (conf->min_nr_stripes != NR_STRIPES) 6752 if (conf->min_nr_stripes != NR_STRIPES)
6605 printk(KERN_INFO 6753 pr_info("md/raid:%s: force stripe size %d for reshape\n",
6606 "md/raid:%s: force stripe size %d for reshape\n",
6607 mdname(mddev), conf->min_nr_stripes); 6754 mdname(mddev), conf->min_nr_stripes);
6608 } 6755 }
6609 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 6756 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
6610 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 6757 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
6611 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 6758 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
6612 if (grow_stripes(conf, conf->min_nr_stripes)) { 6759 if (grow_stripes(conf, conf->min_nr_stripes)) {
6613 printk(KERN_ERR 6760 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
6614 "md/raid:%s: couldn't allocate %dkB for buffers\n", 6761 mdname(mddev), memory);
6615 mdname(mddev), memory);
6616 goto abort; 6762 goto abort;
6617 } else 6763 } else
6618 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 6764 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
6619 mdname(mddev), memory);
6620 /* 6765 /*
6621 * Losing a stripe head costs more than the time to refill it, 6766 * Losing a stripe head costs more than the time to refill it,
6622 * it reduces the queue depth and so can hurt throughput. 6767 * it reduces the queue depth and so can hurt throughput.
@@ -6628,18 +6773,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6628 conf->shrinker.batch = 128; 6773 conf->shrinker.batch = 128;
6629 conf->shrinker.flags = 0; 6774 conf->shrinker.flags = 0;
6630 if (register_shrinker(&conf->shrinker)) { 6775 if (register_shrinker(&conf->shrinker)) {
6631 printk(KERN_ERR 6776 pr_warn("md/raid:%s: couldn't register shrinker.\n",
6632 "md/raid:%s: couldn't register shrinker.\n", 6777 mdname(mddev));
6633 mdname(mddev));
6634 goto abort; 6778 goto abort;
6635 } 6779 }
6636 6780
6637 sprintf(pers_name, "raid%d", mddev->new_level); 6781 sprintf(pers_name, "raid%d", mddev->new_level);
6638 conf->thread = md_register_thread(raid5d, mddev, pers_name); 6782 conf->thread = md_register_thread(raid5d, mddev, pers_name);
6639 if (!conf->thread) { 6783 if (!conf->thread) {
6640 printk(KERN_ERR 6784 pr_warn("md/raid:%s: couldn't allocate thread.\n",
6641 "md/raid:%s: couldn't allocate thread.\n", 6785 mdname(mddev));
6642 mdname(mddev));
6643 goto abort; 6786 goto abort;
6644 } 6787 }
6645 6788
@@ -6692,9 +6835,8 @@ static int raid5_run(struct mddev *mddev)
6692 int first = 1; 6835 int first = 1;
6693 6836
6694 if (mddev->recovery_cp != MaxSector) 6837 if (mddev->recovery_cp != MaxSector)
6695 printk(KERN_NOTICE "md/raid:%s: not clean" 6838 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
6696 " -- starting background reconstruction\n", 6839 mdname(mddev));
6697 mdname(mddev));
6698 6840
6699 rdev_for_each(rdev, mddev) { 6841 rdev_for_each(rdev, mddev) {
6700 long long diff; 6842 long long diff;
@@ -6737,15 +6879,14 @@ static int raid5_run(struct mddev *mddev)
6737 int new_data_disks; 6879 int new_data_disks;
6738 6880
6739 if (journal_dev) { 6881 if (journal_dev) {
6740 printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", 6882 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
6741 mdname(mddev)); 6883 mdname(mddev));
6742 return -EINVAL; 6884 return -EINVAL;
6743 } 6885 }
6744 6886
6745 if (mddev->new_level != mddev->level) { 6887 if (mddev->new_level != mddev->level) {
6746 printk(KERN_ERR "md/raid:%s: unsupported reshape " 6888 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
6747 "required - aborting.\n", 6889 mdname(mddev));
6748 mdname(mddev));
6749 return -EINVAL; 6890 return -EINVAL;
6750 } 6891 }
6751 old_disks = mddev->raid_disks - mddev->delta_disks; 6892 old_disks = mddev->raid_disks - mddev->delta_disks;
@@ -6760,8 +6901,8 @@ static int raid5_run(struct mddev *mddev)
6760 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 6901 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
6761 new_data_disks = mddev->raid_disks - max_degraded; 6902 new_data_disks = mddev->raid_disks - max_degraded;
6762 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 6903 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
6763 printk(KERN_ERR "md/raid:%s: reshape_position not " 6904 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
6764 "on a stripe boundary\n", mdname(mddev)); 6905 mdname(mddev));
6765 return -EINVAL; 6906 return -EINVAL;
6766 } 6907 }
6767 reshape_offset = here_new * chunk_sectors; 6908 reshape_offset = here_new * chunk_sectors;
@@ -6782,10 +6923,8 @@ static int raid5_run(struct mddev *mddev)
6782 abs(min_offset_diff) >= mddev->new_chunk_sectors) 6923 abs(min_offset_diff) >= mddev->new_chunk_sectors)
6783 /* not really in-place - so OK */; 6924 /* not really in-place - so OK */;
6784 else if (mddev->ro == 0) { 6925 else if (mddev->ro == 0) {
6785 printk(KERN_ERR "md/raid:%s: in-place reshape " 6926 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
6786 "must be started in read-only mode " 6927 mdname(mddev));
6787 "- aborting\n",
6788 mdname(mddev));
6789 return -EINVAL; 6928 return -EINVAL;
6790 } 6929 }
6791 } else if (mddev->reshape_backwards 6930 } else if (mddev->reshape_backwards
@@ -6794,13 +6933,11 @@ static int raid5_run(struct mddev *mddev)
6794 : (here_new * chunk_sectors >= 6933 : (here_new * chunk_sectors >=
6795 here_old * chunk_sectors + (-min_offset_diff))) { 6934 here_old * chunk_sectors + (-min_offset_diff))) {
6796 /* Reading from the same stripe as writing to - bad */ 6935 /* Reading from the same stripe as writing to - bad */
6797 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 6936 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
6798 "auto-recovery - aborting.\n", 6937 mdname(mddev));
6799 mdname(mddev));
6800 return -EINVAL; 6938 return -EINVAL;
6801 } 6939 }
6802 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 6940 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
6803 mdname(mddev));
6804 /* OK, we should be able to continue; */ 6941 /* OK, we should be able to continue; */
6805 } else { 6942 } else {
6806 BUG_ON(mddev->level != mddev->new_level); 6943 BUG_ON(mddev->level != mddev->new_level);
@@ -6819,8 +6956,8 @@ static int raid5_run(struct mddev *mddev)
6819 6956
6820 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 6957 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
6821 if (!journal_dev) { 6958 if (!journal_dev) {
6822 pr_err("md/raid:%s: journal disk is missing, force array readonly\n", 6959 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
6823 mdname(mddev)); 6960 mdname(mddev));
6824 mddev->ro = 1; 6961 mddev->ro = 1;
6825 set_disk_ro(mddev->gendisk, 1); 6962 set_disk_ro(mddev->gendisk, 1);
6826 } else if (mddev->recovery_cp == MaxSector) 6963 } else if (mddev->recovery_cp == MaxSector)
@@ -6847,8 +6984,7 @@ static int raid5_run(struct mddev *mddev)
6847 if (conf->disks[i].replacement && 6984 if (conf->disks[i].replacement &&
6848 conf->reshape_progress != MaxSector) { 6985 conf->reshape_progress != MaxSector) {
6849 /* replacements and reshape simply do not mix. */ 6986 /* replacements and reshape simply do not mix. */
6850 printk(KERN_ERR "md: cannot handle concurrent " 6987 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
6851 "replacement and reshape.\n");
6852 goto abort; 6988 goto abort;
6853 } 6989 }
6854 if (test_bit(In_sync, &rdev->flags)) { 6990 if (test_bit(In_sync, &rdev->flags)) {
@@ -6890,8 +7026,7 @@ static int raid5_run(struct mddev *mddev)
6890 mddev->degraded = calc_degraded(conf); 7026 mddev->degraded = calc_degraded(conf);
6891 7027
6892 if (has_failed(conf)) { 7028 if (has_failed(conf)) {
6893 printk(KERN_ERR "md/raid:%s: not enough operational devices" 7029 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
6894 " (%d/%d failed)\n",
6895 mdname(mddev), mddev->degraded, conf->raid_disks); 7030 mdname(mddev), mddev->degraded, conf->raid_disks);
6896 goto abort; 7031 goto abort;
6897 } 7032 }
@@ -6903,29 +7038,19 @@ static int raid5_run(struct mddev *mddev)
6903 if (mddev->degraded > dirty_parity_disks && 7038 if (mddev->degraded > dirty_parity_disks &&
6904 mddev->recovery_cp != MaxSector) { 7039 mddev->recovery_cp != MaxSector) {
6905 if (mddev->ok_start_degraded) 7040 if (mddev->ok_start_degraded)
6906 printk(KERN_WARNING 7041 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
6907 "md/raid:%s: starting dirty degraded array" 7042 mdname(mddev));
6908 " - data corruption possible.\n",
6909 mdname(mddev));
6910 else { 7043 else {
6911 printk(KERN_ERR 7044 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
6912 "md/raid:%s: cannot start dirty degraded array.\n", 7045 mdname(mddev));
6913 mdname(mddev));
6914 goto abort; 7046 goto abort;
6915 } 7047 }
6916 } 7048 }
6917 7049
6918 if (mddev->degraded == 0) 7050 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
6919 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 7051 mdname(mddev), conf->level,
6920 " devices, algorithm %d\n", mdname(mddev), conf->level, 7052 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
6921 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7053 mddev->new_layout);
6922 mddev->new_layout);
6923 else
6924 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
6925 " out of %d devices, algorithm %d\n",
6926 mdname(mddev), conf->level,
6927 mddev->raid_disks - mddev->degraded,
6928 mddev->raid_disks, mddev->new_layout);
6929 7054
6930 print_raid5_conf(conf); 7055 print_raid5_conf(conf);
6931 7056
@@ -6945,9 +7070,8 @@ static int raid5_run(struct mddev *mddev)
6945 mddev->to_remove = NULL; 7070 mddev->to_remove = NULL;
6946 else if (mddev->kobj.sd && 7071 else if (mddev->kobj.sd &&
6947 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7072 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
6948 printk(KERN_WARNING 7073 pr_warn("raid5: failed to create sysfs attributes for %s\n",
6949 "raid5: failed to create sysfs attributes for %s\n", 7074 mdname(mddev));
6950 mdname(mddev));
6951 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7075 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
6952 7076
6953 if (mddev->queue) { 7077 if (mddev->queue) {
@@ -6979,6 +7103,15 @@ static int raid5_run(struct mddev *mddev)
6979 stripe = (stripe | (stripe-1)) + 1; 7103 stripe = (stripe | (stripe-1)) + 1;
6980 mddev->queue->limits.discard_alignment = stripe; 7104 mddev->queue->limits.discard_alignment = stripe;
6981 mddev->queue->limits.discard_granularity = stripe; 7105 mddev->queue->limits.discard_granularity = stripe;
7106
7107 /*
7108 * We use 16-bit counter of active stripes in bi_phys_segments
7109 * (minus one for over-loaded initialization)
7110 */
7111 blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
7112 blk_queue_max_discard_sectors(mddev->queue,
7113 0xfffe * STRIPE_SECTORS);
7114
6982 /* 7115 /*
6983 * unaligned part of discard request will be ignored, so can't 7116 * unaligned part of discard request will be ignored, so can't
6984 * guarantee discard_zeroes_data 7117 * guarantee discard_zeroes_data
@@ -7035,9 +7168,10 @@ static int raid5_run(struct mddev *mddev)
7035 if (journal_dev) { 7168 if (journal_dev) {
7036 char b[BDEVNAME_SIZE]; 7169 char b[BDEVNAME_SIZE];
7037 7170
7038 printk(KERN_INFO"md/raid:%s: using device %s as journal\n", 7171 pr_debug("md/raid:%s: using device %s as journal\n",
7039 mdname(mddev), bdevname(journal_dev->bdev, b)); 7172 mdname(mddev), bdevname(journal_dev->bdev, b));
7040 r5l_init_log(conf, journal_dev); 7173 if (r5l_init_log(conf, journal_dev))
7174 goto abort;
7041 } 7175 }
7042 7176
7043 return 0; 7177 return 0;
@@ -7046,7 +7180,7 @@ abort:
7046 print_raid5_conf(conf); 7180 print_raid5_conf(conf);
7047 free_conf(conf); 7181 free_conf(conf);
7048 mddev->private = NULL; 7182 mddev->private = NULL;
7049 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7183 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7050 return -EIO; 7184 return -EIO;
7051} 7185}
7052 7186
@@ -7080,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf)
7080 int i; 7214 int i;
7081 struct disk_info *tmp; 7215 struct disk_info *tmp;
7082 7216
7083 printk(KERN_DEBUG "RAID conf printout:\n"); 7217 pr_debug("RAID conf printout:\n");
7084 if (!conf) { 7218 if (!conf) {
7085 printk("(conf==NULL)\n"); 7219 pr_debug("(conf==NULL)\n");
7086 return; 7220 return;
7087 } 7221 }
7088 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 7222 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7089 conf->raid_disks, 7223 conf->raid_disks,
7090 conf->raid_disks - conf->mddev->degraded); 7224 conf->raid_disks - conf->mddev->degraded);
7091 7225
@@ -7093,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf)
7093 char b[BDEVNAME_SIZE]; 7227 char b[BDEVNAME_SIZE];
7094 tmp = conf->disks + i; 7228 tmp = conf->disks + i;
7095 if (tmp->rdev) 7229 if (tmp->rdev)
7096 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 7230 pr_debug(" disk %d, o:%d, dev:%s\n",
7097 i, !test_bit(Faulty, &tmp->rdev->flags), 7231 i, !test_bit(Faulty, &tmp->rdev->flags),
7098 bdevname(tmp->rdev->bdev, b)); 7232 bdevname(tmp->rdev->bdev, b));
7099 } 7233 }
@@ -7241,8 +7375,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7241 * write requests running. We should be safe 7375 * write requests running. We should be safe
7242 */ 7376 */
7243 r5l_init_log(conf, rdev); 7377 r5l_init_log(conf, rdev);
7244 printk(KERN_INFO"md/raid:%s: using device %s as journal\n", 7378 pr_debug("md/raid:%s: using device %s as journal\n",
7245 mdname(mddev), bdevname(rdev->bdev, b)); 7379 mdname(mddev), bdevname(rdev->bdev, b));
7246 return 0; 7380 return 0;
7247 } 7381 }
7248 if (mddev->recovery_disabled == conf->recovery_disabled) 7382 if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7346,10 +7480,10 @@ static int check_stripe_cache(struct mddev *mddev)
7346 > conf->min_nr_stripes || 7480 > conf->min_nr_stripes ||
7347 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7481 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7348 > conf->min_nr_stripes) { 7482 > conf->min_nr_stripes) {
7349 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7483 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7350 mdname(mddev), 7484 mdname(mddev),
7351 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7485 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7352 / STRIPE_SIZE)*4); 7486 / STRIPE_SIZE)*4);
7353 return 0; 7487 return 0;
7354 } 7488 }
7355 return 1; 7489 return 1;
@@ -7430,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev)
7430 */ 7564 */
7431 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7565 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7432 < mddev->array_sectors) { 7566 < mddev->array_sectors) {
7433 printk(KERN_ERR "md/raid:%s: array size must be reduced " 7567 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7434 "before number of disks\n", mdname(mddev)); 7568 mdname(mddev));
7435 return -EINVAL; 7569 return -EINVAL;
7436 } 7570 }
7437 7571
@@ -7501,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev)
7501 } 7635 }
7502 mddev->raid_disks = conf->raid_disks; 7636 mddev->raid_disks = conf->raid_disks;
7503 mddev->reshape_position = conf->reshape_progress; 7637 mddev->reshape_position = conf->reshape_progress;
7504 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7638 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7505 7639
7506 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7640 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7507 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7641 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -7619,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
7619 /* '2' tells resync/reshape to pause so that all 7753 /* '2' tells resync/reshape to pause so that all
7620 * active stripes can drain 7754 * active stripes can drain
7621 */ 7755 */
7756 r5c_flush_cache(conf, INT_MAX);
7622 conf->quiesce = 2; 7757 conf->quiesce = 2;
7623 wait_event_cmd(conf->wait_for_quiescent, 7758 wait_event_cmd(conf->wait_for_quiescent,
7624 atomic_read(&conf->active_stripes) == 0 && 7759 atomic_read(&conf->active_stripes) == 0 &&
@@ -7649,8 +7784,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
7649 7784
7650 /* for raid0 takeover only one zone is supported */ 7785 /* for raid0 takeover only one zone is supported */
7651 if (raid0_conf->nr_strip_zones > 1) { 7786 if (raid0_conf->nr_strip_zones > 1) {
7652 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 7787 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
7653 mdname(mddev)); 7788 mdname(mddev));
7654 return ERR_PTR(-EINVAL); 7789 return ERR_PTR(-EINVAL);
7655 } 7790 }
7656 7791
@@ -7671,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
7671static void *raid5_takeover_raid1(struct mddev *mddev) 7806static void *raid5_takeover_raid1(struct mddev *mddev)
7672{ 7807{
7673 int chunksect; 7808 int chunksect;
7809 void *ret;
7674 7810
7675 if (mddev->raid_disks != 2 || 7811 if (mddev->raid_disks != 2 ||
7676 mddev->degraded > 1) 7812 mddev->degraded > 1)
@@ -7692,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
7692 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 7828 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
7693 mddev->new_chunk_sectors = chunksect; 7829 mddev->new_chunk_sectors = chunksect;
7694 7830
7695 return setup_conf(mddev); 7831 ret = setup_conf(mddev);
7832 if (!IS_ERR_VALUE(ret))
7833 clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
7834 return ret;
7696} 7835}
7697 7836
7698static void *raid5_takeover_raid6(struct mddev *mddev) 7837static void *raid5_takeover_raid6(struct mddev *mddev)
@@ -7762,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev)
7762 conf->chunk_sectors = new_chunk ; 7901 conf->chunk_sectors = new_chunk ;
7763 mddev->chunk_sectors = new_chunk; 7902 mddev->chunk_sectors = new_chunk;
7764 } 7903 }
7765 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7904 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7766 md_wakeup_thread(mddev->thread); 7905 md_wakeup_thread(mddev->thread);
7767 } 7906 }
7768 return check_reshape(mddev); 7907 return check_reshape(mddev);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 57ec49f0839e..ed8e1362ab36 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -226,6 +226,8 @@ struct stripe_head {
226 226
227 struct r5l_io_unit *log_io; 227 struct r5l_io_unit *log_io;
228 struct list_head log_list; 228 struct list_head log_list;
229 sector_t log_start; /* first meta block on the journal */
230 struct list_head r5c; /* for r5c_cache->stripe_in_journal */
229 /** 231 /**
230 * struct stripe_operations 232 * struct stripe_operations
231 * @target - STRIPE_OP_COMPUTE_BLK target 233 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -264,6 +266,7 @@ struct stripe_head_state {
264 int syncing, expanding, expanded, replacing; 266 int syncing, expanding, expanded, replacing;
265 int locked, uptodate, to_read, to_write, failed, written; 267 int locked, uptodate, to_read, to_write, failed, written;
266 int to_fill, compute, req_compute, non_overwrite; 268 int to_fill, compute, req_compute, non_overwrite;
269 int injournal, just_cached;
267 int failed_num[2]; 270 int failed_num[2];
268 int p_failed, q_failed; 271 int p_failed, q_failed;
269 int dec_preread_active; 272 int dec_preread_active;
@@ -273,6 +276,7 @@ struct stripe_head_state {
273 struct md_rdev *blocked_rdev; 276 struct md_rdev *blocked_rdev;
274 int handle_bad_blocks; 277 int handle_bad_blocks;
275 int log_failed; 278 int log_failed;
279 int waiting_extra_page;
276}; 280};
277 281
278/* Flags for struct r5dev.flags */ 282/* Flags for struct r5dev.flags */
@@ -313,6 +317,11 @@ enum r5dev_flags {
313 */ 317 */
314 R5_Discard, /* Discard the stripe */ 318 R5_Discard, /* Discard the stripe */
315 R5_SkipCopy, /* Don't copy data from bio to stripe cache */ 319 R5_SkipCopy, /* Don't copy data from bio to stripe cache */
320 R5_InJournal, /* data being written is in the journal device.
321 * if R5_InJournal is set for parity pd_idx, all the
322 * data and parity being written are in the journal
323 * device
324 */
316}; 325};
317 326
318/* 327/*
@@ -345,7 +354,30 @@ enum {
345 STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add 354 STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
346 * to batch yet. 355 * to batch yet.
347 */ 356 */
348 STRIPE_LOG_TRAPPED, /* trapped into log */ 357 STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
358 * this bit is used in two scenarios:
359 *
360 * 1. write-out phase
361 * set in first entry of r5l_write_stripe
362 * clear in second entry of r5l_write_stripe
363 * used to bypass logic in handle_stripe
364 *
365 * 2. caching phase
366 * set in r5c_try_caching_write()
367 * clear when journal write is done
368 * used to initiate r5c_cache_data()
369 * also used to bypass logic in handle_stripe
370 */
371 STRIPE_R5C_CACHING, /* the stripe is in caching phase
372 * see more detail in the raid5-cache.c
373 */
374 STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or
375 * in conf->r5c_partial_stripe_list)
376 */
377 STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or
378 * in conf->r5c_full_stripe_list)
379 */
380 STRIPE_R5C_PREFLUSH, /* need to flush journal device */
349}; 381};
350 382
351#define STRIPE_EXPAND_SYNC_FLAGS \ 383#define STRIPE_EXPAND_SYNC_FLAGS \
@@ -408,8 +440,86 @@ enum {
408 440
409struct disk_info { 441struct disk_info {
410 struct md_rdev *rdev, *replacement; 442 struct md_rdev *rdev, *replacement;
443 struct page *extra_page; /* extra page to use in prexor */
411}; 444};
412 445
446/*
447 * Stripe cache
448 */
449
450#define NR_STRIPES 256
451#define STRIPE_SIZE PAGE_SIZE
452#define STRIPE_SHIFT (PAGE_SHIFT - 9)
453#define STRIPE_SECTORS (STRIPE_SIZE>>9)
454#define IO_THRESHOLD 1
455#define BYPASS_THRESHOLD 1
456#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
457#define HASH_MASK (NR_HASH - 1)
458#define MAX_STRIPE_BATCH 8
459
460/* bio's attached to a stripe+device for I/O are linked together in bi_sector
461 * order without overlap. There may be several bio's per stripe+device, and
462 * a bio could span several devices.
463 * When walking this list for a particular stripe+device, we must never proceed
464 * beyond a bio that extends past this device, as the next bio might no longer
465 * be valid.
466 * This function is used to determine the 'next' bio in the list, given the
467 * sector of the current stripe+device
468 */
469static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
470{
471 int sectors = bio_sectors(bio);
472
473 if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
474 return bio->bi_next;
475 else
476 return NULL;
477}
478
479/*
480 * We maintain a biased count of active stripes in the bottom 16 bits of
481 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
482 */
483static inline int raid5_bi_processed_stripes(struct bio *bio)
484{
485 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
486
487 return (atomic_read(segments) >> 16) & 0xffff;
488}
489
490static inline int raid5_dec_bi_active_stripes(struct bio *bio)
491{
492 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
493
494 return atomic_sub_return(1, segments) & 0xffff;
495}
496
497static inline void raid5_inc_bi_active_stripes(struct bio *bio)
498{
499 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
500
501 atomic_inc(segments);
502}
503
504static inline void raid5_set_bi_processed_stripes(struct bio *bio,
505 unsigned int cnt)
506{
507 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
508 int old, new;
509
510 do {
511 old = atomic_read(segments);
512 new = (old & 0xffff) | (cnt << 16);
513 } while (atomic_cmpxchg(segments, old, new) != old);
514}
515
516static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
517{
518 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
519
520 atomic_set(segments, cnt);
521}
522
413/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. 523/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
414 * This is because we sometimes take all the spinlocks 524 * This is because we sometimes take all the spinlocks
415 * and creating that much locking depth can cause 525 * and creating that much locking depth can cause
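The helpers moved into the header above pack two 16-bit counters into bio->bi_phys_segments: the low half counts stripes still actively referencing the bio, the high half counts stripes already processed. That is also why the raid5_run() hunk earlier caps blk_queue_max_hw_sectors() and blk_queue_max_discard_sectors() at 0xfffe * STRIPE_SECTORS: with one value spent on the initialization bias, the active count must stay within 16 bits. A standalone userspace model of the packing (struct and function names are illustrative, not from the patch):

#include <stdatomic.h>

/* Userspace model of the bi_phys_segments packing:
 * low 16 bits = active-stripe count, high 16 bits = processed-stripe count. */
struct seg_model { atomic_uint v; };

static unsigned int model_processed(struct seg_model *s)
{
	return (atomic_load(&s->v) >> 16) & 0xffff;
}

static unsigned int model_dec_active(struct seg_model *s)
{
	/* like raid5_dec_bi_active_stripes(): return the decremented active count */
	return (atomic_fetch_sub(&s->v, 1) - 1) & 0xffff;
}

static void model_set_processed(struct seg_model *s, unsigned int cnt)
{
	unsigned int old = atomic_load(&s->v), new_v;

	do {	/* replace only the high half, preserving the active count */
		new_v = (old & 0xffff) | (cnt << 16);
	} while (!atomic_compare_exchange_weak(&s->v, &old, new_v));
}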
@@ -432,6 +542,30 @@ struct r5worker_group {
432 int stripes_cnt; 542 int stripes_cnt;
433}; 543};
434 544
545enum r5_cache_state {
546 R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked,
547 * waiting for 25% to be free
548 */
549 R5_ALLOC_MORE, /* It might help to allocate another
550 * stripe.
551 */
552 R5_DID_ALLOC, /* A stripe was allocated, don't allocate
553 * more until at least one has been
554 * released. This avoids flooding
555 * the cache.
556 */
557 R5C_LOG_TIGHT, /* log device space tight, need to
558 * prioritize stripes at last_checkpoint
559 */
560 R5C_LOG_CRITICAL, /* log device is running out of space,
561 * only process stripes that are already
562 * occupying the log
563 */
564 R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page
565 * for prexor
566 */
567};
568
435struct r5conf { 569struct r5conf {
436 struct hlist_head *stripe_hashtbl; 570 struct hlist_head *stripe_hashtbl;
437 /* only protect corresponding hash list and inactive_list */ 571 /* only protect corresponding hash list and inactive_list */
@@ -519,23 +653,18 @@ struct r5conf {
519 */ 653 */
520 atomic_t active_stripes; 654 atomic_t active_stripes;
521 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; 655 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
656
657 atomic_t r5c_cached_full_stripes;
658 struct list_head r5c_full_stripe_list;
659 atomic_t r5c_cached_partial_stripes;
660 struct list_head r5c_partial_stripe_list;
661
522 atomic_t empty_inactive_list_nr; 662 atomic_t empty_inactive_list_nr;
523 struct llist_head released_stripes; 663 struct llist_head released_stripes;
524 wait_queue_head_t wait_for_quiescent; 664 wait_queue_head_t wait_for_quiescent;
525 wait_queue_head_t wait_for_stripe; 665 wait_queue_head_t wait_for_stripe;
526 wait_queue_head_t wait_for_overlap; 666 wait_queue_head_t wait_for_overlap;
527 unsigned long cache_state; 667 unsigned long cache_state;
528#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,
529 * waiting for 25% to be free
530 */
531#define R5_ALLOC_MORE 2 /* It might help to allocate another
532 * stripe.
533 */
534#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate
535 * more until at least one has been
536 * released. This avoids flooding
537 * the cache.
538 */
539 struct shrinker shrinker; 668 struct shrinker shrinker;
540 int pool_size; /* number of disks in stripeheads in pool */ 669 int pool_size; /* number of disks in stripeheads in pool */
541 spinlock_t device_lock; 670 spinlock_t device_lock;
@@ -633,4 +762,23 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
633extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); 762extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
634extern void r5l_quiesce(struct r5l_log *log, int state); 763extern void r5l_quiesce(struct r5l_log *log, int state);
635extern bool r5l_log_disk_error(struct r5conf *conf); 764extern bool r5l_log_disk_error(struct r5conf *conf);
765extern bool r5c_is_writeback(struct r5l_log *log);
766extern int
767r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
768 struct stripe_head_state *s, int disks);
769extern void
770r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
771 struct stripe_head_state *s);
772extern void r5c_release_extra_page(struct stripe_head *sh);
773extern void r5c_use_extra_page(struct stripe_head *sh);
774extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
775extern void r5c_handle_cached_data_endio(struct r5conf *conf,
776 struct stripe_head *sh, int disks, struct bio_list *return_bi);
777extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
778 struct stripe_head_state *s);
779extern void r5c_make_stripe_write_out(struct stripe_head *sh);
780extern void r5c_flush_cache(struct r5conf *conf, int num);
781extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
782extern void r5c_check_cached_full_stripe(struct r5conf *conf);
783extern struct md_sysfs_entry r5c_journal_mode;
636#endif 784#endif
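raid5.h now declares the write-back cache ("r5c") entry points alongside the caching-phase stripe states above. The sketch below is only a schematic of how they are meant to fit together on the write path; the -EAGAIN convention for r5c_try_caching_write() and the fallback ordering are assumptions on my part, and the real wiring (in the handle_stripe() changes earlier in the raid5.c diff, not excerpted here) also consults s->injournal, which is omitted:

static void sketch_handle_writes(struct r5conf *conf, struct stripe_head *sh,
				 struct stripe_head_state *s, int disks)
{
	if (!r5c_is_writeback(conf->log)) {
		/* write-through or no journal: classic parity write-out */
		handle_stripe_dirtying(conf, sh, s, disks);
		return;
	}
	/* write-back: try to park the data in the journal first (caching phase) */
	if (r5c_try_caching_write(conf, sh, s, disks) == -EAGAIN)
		/* cache refused the stripe: fall back to the write-out phase */
		handle_stripe_dirtying(conf, sh, s, disks);
}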
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index c3e654c6d518..9930f3e9040f 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -84,6 +84,10 @@
84#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed 84#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed
85 * For clustered enviroments only. 85 * For clustered enviroments only.
86 */ 86 */
87#define MD_DISK_FAILFAST 10 /* Send REQ_FAILFAST if there are multiple
88 * devices available - and don't try to
89 * correct read errors.
90 */
87 91
88#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. 92#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config.
89 * read requests will only be sent here in 93 * read requests will only be sent here in
@@ -265,8 +269,9 @@ struct mdp_superblock_1 {
265 __le32 dev_number; /* permanent identifier of this device - not role in raid */ 269 __le32 dev_number; /* permanent identifier of this device - not role in raid */
266 __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ 270 __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
267 __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ 271 __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
268 __u8 devflags; /* per-device flags. Only one defined...*/ 272 __u8 devflags; /* per-device flags. Only two defined...*/
269#define WriteMostly1 1 /* mask for writemostly flag in above */ 273#define WriteMostly1 1 /* mask for writemostly flag in above */
274#define FailFast1 2 /* Should avoid retries and fixups and just fail */
270 /* Bad block log. If there are any bad blocks the feature flag is set. 275 /* Bad block log. If there are any bad blocks the feature flag is set.
271 * If offset and size are non-zero, that space is reserved and available 276 * If offset and size are non-zero, that space is reserved and available
272 */ 277 */
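md_p.h grows an on-disk failfast marker in two forms: a disk-state bit (MD_DISK_FAILFAST) and a FailFast1 bit in the v1.x per-device devflags byte. The md.c code that consumes it is elsewhere in this merge (presumably super_1_load()/super_1_sync(), not excerpted here); a minimal sketch of the load direction, with the in-memory WriteMostly/FailFast rdev flag names assumed from md.h and the function name mine:

static void sketch_load_devflags(const struct mdp_superblock_1 *sb,
				 struct md_rdev *rdev)
{
	if (sb->devflags & WriteMostly1)
		set_bit(WriteMostly, &rdev->flags);
	if (sb->devflags & FailFast1)
		set_bit(FailFast, &rdev->flags);
	else
		clear_bit(FailFast, &rdev->flags);
}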
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c
index 76734004358d..20bca3d44f67 100644
--- a/lib/raid6/avx2.c
+++ b/lib/raid6/avx2.c
@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
87 kernel_fpu_end(); 87 kernel_fpu_end();
88} 88}
89 89
90static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
91 size_t bytes, void **ptrs)
92{
93 u8 **dptr = (u8 **)ptrs;
94 u8 *p, *q;
95 int d, z, z0;
96
97 z0 = stop; /* P/Q right side optimization */
98 p = dptr[disks-2]; /* XOR parity */
99 q = dptr[disks-1]; /* RS syndrome */
100
101 kernel_fpu_begin();
102
103 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
104
105 for (d = 0 ; d < bytes ; d += 32) {
106 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
107 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
108 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
109 /* P/Q data pages */
110 for (z = z0-1 ; z >= start ; z--) {
111 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
112 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
113 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
114 asm volatile("vpand %ymm0,%ymm5,%ymm5");
115 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
116 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
117 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
118 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
119 }
120 /* P/Q left side optimization */
121 for (z = start-1 ; z >= 0 ; z--) {
122 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
123 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
124 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
125 asm volatile("vpand %ymm0,%ymm5,%ymm5");
126 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
127 }
128 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
129 /* Don't use movntdq for r/w memory area < cache line */
130 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
131 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
132 }
133
134 asm volatile("sfence" : : : "memory");
135 kernel_fpu_end();
136}
137
90const struct raid6_calls raid6_avx2x1 = { 138const struct raid6_calls raid6_avx2x1 = {
91 raid6_avx21_gen_syndrome, 139 raid6_avx21_gen_syndrome,
92 NULL, /* XOR not yet implemented */ 140 raid6_avx21_xor_syndrome,
93 raid6_have_avx2, 141 raid6_have_avx2,
94 "avx2x1", 142 "avx2x1",
95 1 /* Has cache hints */ 143 1 /* Has cache hints */
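The ymm5 = 0 / vpcmpgtb / vpaddb / vpand / vpxor sequence used throughout these new xor_syndrome routines is the branchless SIMD form of multiplying every byte by x in GF(2^8) with the RAID-6 polynomial 0x11d: the signed compare extracts each byte's top bit, the add doubles the byte, and the masked xor folds 0x1d back in wherever the top bit was set. Below is a byte-at-a-time C model of the same loop (helper names are mine; plain userspace code, no kernel_fpu_begin() needed). It folds the P/Q contribution of data disks start..stop into the existing P and Q buffers, which is what lets the raid6 read-modify-write path update parity incrementally.

#include <stddef.h>
#include <stdint.h>

/* multiply a GF(2^8) element by x, RAID-6 polynomial 0x11d:
 * the scalar twin of the vpcmpgtb/vpaddb/vpand/vpxor sequence */
static inline uint8_t gf_xtime(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0x00));
}

/* byte-at-a-time model of raid6_avx2*_xor_syndrome(): fold the P/Q
 * contribution of data disks [start, stop] into the existing P and Q */
static void xor_syndrome_scalar(int disks, int start, int stop,
				size_t bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p = dptr[disks - 2];		/* XOR parity */
	uint8_t *q = dptr[disks - 1];		/* RS syndrome */
	int z0 = stop;				/* P/Q right side optimization */

	for (size_t d = 0; d < bytes; d++) {
		uint8_t wp = p[d] ^ dptr[z0][d];	/* fold top disk into old P */
		uint8_t wq = dptr[z0][d];

		for (int z = z0 - 1; z >= start; z--) {	/* disks being rewritten */
			wq = gf_xtime(wq) ^ dptr[z][d];
			wp ^= dptr[z][d];
		}
		for (int z = start - 1; z >= 0; z--)	/* left side: Q only shifts */
			wq = gf_xtime(wq);

		p[d] = wp;
		q[d] ^= wq;		/* Q is updated by xor-ing in the delta */
	}
}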
@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
149 kernel_fpu_end(); 197 kernel_fpu_end();
150} 198}
151 199
200static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
201 size_t bytes, void **ptrs)
202{
203 u8 **dptr = (u8 **)ptrs;
204 u8 *p, *q;
205 int d, z, z0;
206
207 z0 = stop; /* P/Q right side optimization */
208 p = dptr[disks-2]; /* XOR parity */
209 q = dptr[disks-1]; /* RS syndrome */
210
211 kernel_fpu_begin();
212
213 asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
214
215 for (d = 0 ; d < bytes ; d += 64) {
216 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
217 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
218 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
219 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
220 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
221 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
222 /* P/Q data pages */
223 for (z = z0-1 ; z >= start ; z--) {
224 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
225 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
226 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
227 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
228 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
229 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
230 asm volatile("vpand %ymm0,%ymm5,%ymm5");
231 asm volatile("vpand %ymm0,%ymm7,%ymm7");
232 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
233 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
234 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
235 asm volatile("vmovdqa %0,%%ymm7"
236 :: "m" (dptr[z][d+32]));
237 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
238 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
239 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
240 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
241 }
242 /* P/Q left side optimization */
243 for (z = start-1 ; z >= 0 ; z--) {
244 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
245 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
246 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
247 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
248 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
249 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
250 asm volatile("vpand %ymm0,%ymm5,%ymm5");
251 asm volatile("vpand %ymm0,%ymm7,%ymm7");
252 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
253 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
254 }
255 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
256 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
257 /* Don't use movntdq for r/w memory area < cache line */
258 asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
259 asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
260 asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
261 asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
262 }
263
264 asm volatile("sfence" : : : "memory");
265 kernel_fpu_end();
266}
267
152const struct raid6_calls raid6_avx2x2 = { 268const struct raid6_calls raid6_avx2x2 = {
153 raid6_avx22_gen_syndrome, 269 raid6_avx22_gen_syndrome,
154 NULL, /* XOR not yet implemented */ 270 raid6_avx22_xor_syndrome,
155 raid6_have_avx2, 271 raid6_have_avx2,
156 "avx2x2", 272 "avx2x2",
157 1 /* Has cache hints */ 273 1 /* Has cache hints */
@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
242 kernel_fpu_end(); 358 kernel_fpu_end();
243} 359}
244 360
361static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
362 size_t bytes, void **ptrs)
363{
364 u8 **dptr = (u8 **)ptrs;
365 u8 *p, *q;
366 int d, z, z0;
367
368 z0 = stop; /* P/Q right side optimization */
369 p = dptr[disks-2]; /* XOR parity */
370 q = dptr[disks-1]; /* RS syndrome */
371
372 kernel_fpu_begin();
373
374 asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
375
376 for (d = 0 ; d < bytes ; d += 128) {
377 asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
378 asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
379 asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
380 asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
381 asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
382 asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
383 asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
384 asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
385 asm volatile("vpxor %ymm4,%ymm2,%ymm2");
386 asm volatile("vpxor %ymm6,%ymm3,%ymm3");
387 asm volatile("vpxor %ymm12,%ymm10,%ymm10");
388 asm volatile("vpxor %ymm14,%ymm11,%ymm11");
389 /* P/Q data pages */
390 for (z = z0-1 ; z >= start ; z--) {
391 asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
392 asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
393 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
394 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
395 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
396 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
397 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
398 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
399 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
400 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
401 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
402 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
403 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
404 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
405 asm volatile("vpand %ymm0,%ymm5,%ymm5");
406 asm volatile("vpand %ymm0,%ymm7,%ymm7");
407 asm volatile("vpand %ymm0,%ymm13,%ymm13");
408 asm volatile("vpand %ymm0,%ymm15,%ymm15");
409 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
410 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
411 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
412 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
413 asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
414 asm volatile("vmovdqa %0,%%ymm7"
415 :: "m" (dptr[z][d+32]));
416 asm volatile("vmovdqa %0,%%ymm13"
417 :: "m" (dptr[z][d+64]));
418 asm volatile("vmovdqa %0,%%ymm15"
419 :: "m" (dptr[z][d+96]));
420 asm volatile("vpxor %ymm5,%ymm2,%ymm2");
421 asm volatile("vpxor %ymm7,%ymm3,%ymm3");
422 asm volatile("vpxor %ymm13,%ymm10,%ymm10");
423 asm volatile("vpxor %ymm15,%ymm11,%ymm11");
424 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
425 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
426 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
427 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
428 }
429 asm volatile("prefetchnta %0" :: "m" (q[d]));
430 asm volatile("prefetchnta %0" :: "m" (q[d+64]));
431 /* P/Q left side optimization */
432 for (z = start-1 ; z >= 0 ; z--) {
433 asm volatile("vpxor %ymm5,%ymm5,%ymm5");
434 asm volatile("vpxor %ymm7,%ymm7,%ymm7");
435 asm volatile("vpxor %ymm13,%ymm13,%ymm13");
436 asm volatile("vpxor %ymm15,%ymm15,%ymm15");
437 asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
438 asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
439 asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
440 asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
441 asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
442 asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
443 asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
444 asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
445 asm volatile("vpand %ymm0,%ymm5,%ymm5");
446 asm volatile("vpand %ymm0,%ymm7,%ymm7");
447 asm volatile("vpand %ymm0,%ymm13,%ymm13");
448 asm volatile("vpand %ymm0,%ymm15,%ymm15");
449 asm volatile("vpxor %ymm5,%ymm4,%ymm4");
450 asm volatile("vpxor %ymm7,%ymm6,%ymm6");
451 asm volatile("vpxor %ymm13,%ymm12,%ymm12");
452 asm volatile("vpxor %ymm15,%ymm14,%ymm14");
453 }
454 asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
455 asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
456 asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
457 asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
458 asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
459 asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
460 asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
461 asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
462 asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
463 asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
464 asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
465 asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
466 }
467 asm volatile("sfence" : : : "memory");
468 kernel_fpu_end();
469}
470
245const struct raid6_calls raid6_avx2x4 = { 471const struct raid6_calls raid6_avx2x4 = {
246 raid6_avx24_gen_syndrome, 472 raid6_avx24_gen_syndrome,
247 NULL, /* XOR not yet implemented */ 473 raid6_avx24_xor_syndrome,
248 raid6_have_avx2, 474 raid6_have_avx2,
249 "avx2x4", 475 "avx2x4",
250 1 /* Has cache hints */ 476 1 /* Has cache hints */