author	NeilBrown <neilb@suse.com>	2015-10-13 16:09:52 -0400
committer	NeilBrown <neilb@suse.com>	2015-10-13 16:09:52 -0400
commit	c2a06c38d92d044a69a3eae0138ab95ff0788030 (patch)
tree	e193af1aaf9ea876dedf6db38ded53f8d7a7f9b4
parent	25cb62b76430a91cc6195f902e61c2cb84ade622 (diff)
parent	23b63f9fa82eed128b5c585cbfe10ced82d73e91 (diff)
Merge branch 'md-next' of git://github.com/goldwynr/linux into for-next
md-cluster: A better way for METADATA_UPDATED processing

The processing of the METADATA_UPDATED message is too simple and prone to errors. Besides, it does not update the internal data structures as required. This set of patches reads the superblock from one of the devices of the MD array and checks for changes against the in-memory data structures. If there is a change, it performs the necessary actions to keep the internal data structures as they are on the primary node. An example is a device turning faulty. The algorithm is:

1. The initiator node marks the device as faulty and updates the superblock.
2. The initiator node sends METADATA_UPDATED with an advisory device number to the rest of the nodes.
3. On receiving the METADATA_UPDATED message, the receiving node:
   3.1 Reads the superblock.
   3.2 Detects that a device has failed by comparing with the in-memory structures.
   3.3 Calls the necessary functions to record the failure and get the device out of the active array.
   3.4 Acknowledges the message.

The patch series also fixes adding a disk, which was impacted by these changes.

Patches can also be found at https://github.com/goldwynr/linux, branch md-next.

Changes since V2:
- Fix status synchronization after --add and --re-add operations
- Include Guoqing's patches on endian correctness, zeroing cmsg, etc.
- Restructure add_new_disk() and cancel()
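As an illustration of step 3 (not part of the patch), here is a minimal, self-contained C sketch of the receiver-side comparison described above. The role values 0xffff (spare) and 0xfffe (faulty) are the ones the series reads from the superblock's dev_roles[] table in check_sb_changes(); the type and function names in the sketch (toy_rdev, check_role_change) are made up for the example and are not the kernel structures.

    #include <stdio.h>
    #include <stdint.h>

    /* Role values as used by the on-disk dev_roles[] table in the patch below. */
    #define ROLE_SPARE  0xffff
    #define ROLE_FAULTY 0xfffe

    /* Simplified stand-in for struct md_rdev (illustrative only). */
    struct toy_rdev {
        int desc_nr;    /* index into the superblock's dev_roles[] */
        int raid_disk;  /* in-memory slot, -1 if not active        */
    };

    /*
     * Steps 3.2/3.3 of the algorithm: after re-reading the superblock,
     * compare the on-disk role of a device with the in-memory view and
     * report what the receiving node would have to do about it.
     */
    static void check_role_change(const uint16_t *dev_roles,
                                  const struct toy_rdev *rdev)
    {
        unsigned int role = dev_roles[rdev->desc_nr];

        if ((int)role == rdev->raid_disk)
            return;                          /* nothing changed                   */

        if (rdev->raid_disk == -1 && role != ROLE_SPARE)
            printf("dev %d: spare activated into slot %u\n",
                   rdev->desc_nr, role);     /* kernel: remove_and_add_spares()   */
        else if (role == ROLE_FAULTY)
            printf("dev %d: marked faulty on another node\n",
                   rdev->desc_nr);           /* kernel: md_error(), clear Blocked */
    }

    int main(void)
    {
        /* Superblock state after a METADATA_UPDATED message: device 1 failed. */
        uint16_t dev_roles[] = { 0, ROLE_FAULTY, 2 };
        struct toy_rdev rdev = { .desc_nr = 1, .raid_disk = 1 };

        check_role_change(dev_roles, &rdev);
        return 0;
    }

In the actual series, the receiving node re-reads the superblock through md_reload_sb()/read_rdev(), and the comparison and follow-up actions (md_error(), remove_and_add_spares()) live in check_sb_changes(); see the md.c hunks below.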
-rw-r--r--	drivers/md/bitmap.c	14
-rw-r--r--	drivers/md/bitmap.h	4
-rw-r--r--	drivers/md/md-cluster.c	198
-rw-r--r--	drivers/md/md-cluster.h	12
-rw-r--r--	drivers/md/md.c	359
-rw-r--r--	drivers/md/md.h	2
-rw-r--r--	drivers/md/raid1.c	33
-rw-r--r--	drivers/md/raid1.h	7
-rw-r--r--	drivers/md/raid10.c	2
-rw-r--r--	drivers/md/raid5.c	2
-rw-r--r--	include/uapi/linux/raid/md_p.h	2
11 files changed, 421 insertions, 214 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 48b5890c28e3..4f22e919787a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -613,12 +613,10 @@ re_read:
613 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; 613 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
614 write_behind = le32_to_cpu(sb->write_behind); 614 write_behind = le32_to_cpu(sb->write_behind);
615 sectors_reserved = le32_to_cpu(sb->sectors_reserved); 615 sectors_reserved = le32_to_cpu(sb->sectors_reserved);
616 /* XXX: This is a hack to ensure that we don't use clustering 616 /* Setup nodes/clustername only if bitmap version is
617 * in case: 617 * cluster-compatible
618 * - dm-raid is in use and
619 * - the nodes written in bitmap_sb is erroneous.
620 */ 618 */
621 if (!bitmap->mddev->sync_super) { 619 if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
622 nodes = le32_to_cpu(sb->nodes); 620 nodes = le32_to_cpu(sb->nodes);
623 strlcpy(bitmap->mddev->bitmap_info.cluster_name, 621 strlcpy(bitmap->mddev->bitmap_info.cluster_name,
624 sb->cluster_name, 64); 622 sb->cluster_name, 64);
@@ -628,7 +626,7 @@ re_read:
628 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) 626 if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
629 reason = "bad magic"; 627 reason = "bad magic";
630 else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || 628 else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
631 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) 629 le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
632 reason = "unrecognized superblock version"; 630 reason = "unrecognized superblock version";
633 else if (chunksize < 512) 631 else if (chunksize < 512)
634 reason = "bitmap chunksize too small"; 632 reason = "bitmap chunksize too small";
@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
1572} 1570}
1573EXPORT_SYMBOL(bitmap_close_sync); 1571EXPORT_SYMBOL(bitmap_close_sync);
1574 1572
1575void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) 1573void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
1576{ 1574{
1577 sector_t s = 0; 1575 sector_t s = 0;
1578 sector_t blocks; 1576 sector_t blocks;
@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1583 bitmap->last_end_sync = jiffies; 1581 bitmap->last_end_sync = jiffies;
1584 return; 1582 return;
1585 } 1583 }
1586 if (time_before(jiffies, (bitmap->last_end_sync 1584 if (!force && time_before(jiffies, (bitmap->last_end_sync
1587 + bitmap->mddev->bitmap_info.daemon_sleep))) 1585 + bitmap->mddev->bitmap_info.daemon_sleep)))
1588 return; 1586 return;
1589 wait_event(bitmap->mddev->recovery_wait, 1587 wait_event(bitmap->mddev->recovery_wait,
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index f1f4dd01090d..7d5c3a610ca5 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -9,8 +9,10 @@
9#define BITMAP_MAJOR_LO 3 9#define BITMAP_MAJOR_LO 3
10/* version 4 insists the bitmap is in little-endian order 10/* version 4 insists the bitmap is in little-endian order
11 * with version 3, it is host-endian which is non-portable 11 * with version 3, it is host-endian which is non-portable
12 * Version 5 is currently set only for clustered devices
12 */ 13 */
13#define BITMAP_MAJOR_HI 4 14#define BITMAP_MAJOR_HI 4
15#define BITMAP_MAJOR_CLUSTERED 5
14#define BITMAP_MAJOR_HOSTENDIAN 3 16#define BITMAP_MAJOR_HOSTENDIAN 3
15 17
16/* 18/*
@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
255int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); 257int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
256void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); 258void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
257void bitmap_close_sync(struct bitmap *bitmap); 259void bitmap_close_sync(struct bitmap *bitmap);
258void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); 260void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
259 261
260void bitmap_unplug(struct bitmap *bitmap); 262void bitmap_unplug(struct bitmap *bitmap);
261void bitmap_daemon_work(struct mddev *mddev); 263void bitmap_daemon_work(struct mddev *mddev);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 11e3bc9d2a4b..35ac2e8cb7f1 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -28,6 +28,7 @@ struct dlm_lock_resource {
28 struct completion completion; /* completion for synchronized locking */ 28 struct completion completion; /* completion for synchronized locking */
29 void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ 29 void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
30 struct mddev *mddev; /* pointing back to mddev. */ 30 struct mddev *mddev; /* pointing back to mddev. */
31 int mode;
31}; 32};
32 33
33struct suspend_info { 34struct suspend_info {
@@ -55,6 +56,7 @@ struct md_cluster_info {
55 struct completion completion; 56 struct completion completion;
56 struct mutex sb_mutex; 57 struct mutex sb_mutex;
57 struct dlm_lock_resource *bitmap_lockres; 58 struct dlm_lock_resource *bitmap_lockres;
59 struct dlm_lock_resource *resync_lockres;
58 struct list_head suspend_list; 60 struct list_head suspend_list;
59 spinlock_t suspend_lock; 61 spinlock_t suspend_lock;
60 struct md_thread *recovery_thread; 62 struct md_thread *recovery_thread;
@@ -106,6 +108,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
106 if (ret) 108 if (ret)
107 return ret; 109 return ret;
108 wait_for_completion(&res->completion); 110 wait_for_completion(&res->completion);
111 if (res->lksb.sb_status == 0)
112 res->mode = mode;
109 return res->lksb.sb_status; 113 return res->lksb.sb_status;
110} 114}
111 115
@@ -127,6 +131,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
127 init_completion(&res->completion); 131 init_completion(&res->completion);
128 res->ls = cinfo->lockspace; 132 res->ls = cinfo->lockspace;
129 res->mddev = mddev; 133 res->mddev = mddev;
134 res->mode = DLM_LOCK_IV;
130 namelen = strlen(name); 135 namelen = strlen(name);
131 res->name = kzalloc(namelen + 1, GFP_KERNEL); 136 res->name = kzalloc(namelen + 1, GFP_KERNEL);
132 if (!res->name) { 137 if (!res->name) {
@@ -358,29 +363,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
358 363
359 list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) 364 list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
360 if (slot == s->slot) { 365 if (slot == s->slot) {
361 pr_info("%s:%d Deleting suspend_info: %d\n",
362 __func__, __LINE__, slot);
363 list_del(&s->list); 366 list_del(&s->list);
364 kfree(s); 367 kfree(s);
365 break; 368 break;
366 } 369 }
367} 370}
368 371
369static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) 372static void remove_suspend_info(struct mddev *mddev, int slot)
370{ 373{
374 struct md_cluster_info *cinfo = mddev->cluster_info;
371 spin_lock_irq(&cinfo->suspend_lock); 375 spin_lock_irq(&cinfo->suspend_lock);
372 __remove_suspend_info(cinfo, slot); 376 __remove_suspend_info(cinfo, slot);
373 spin_unlock_irq(&cinfo->suspend_lock); 377 spin_unlock_irq(&cinfo->suspend_lock);
378 mddev->pers->quiesce(mddev, 2);
374} 379}
375 380
376 381
377static void process_suspend_info(struct md_cluster_info *cinfo, 382static void process_suspend_info(struct mddev *mddev,
378 int slot, sector_t lo, sector_t hi) 383 int slot, sector_t lo, sector_t hi)
379{ 384{
385 struct md_cluster_info *cinfo = mddev->cluster_info;
380 struct suspend_info *s; 386 struct suspend_info *s;
381 387
382 if (!hi) { 388 if (!hi) {
383 remove_suspend_info(cinfo, slot); 389 remove_suspend_info(mddev, slot);
390 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
391 md_wakeup_thread(mddev->thread);
384 return; 392 return;
385 } 393 }
386 s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); 394 s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
@@ -389,11 +397,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
389 s->slot = slot; 397 s->slot = slot;
390 s->lo = lo; 398 s->lo = lo;
391 s->hi = hi; 399 s->hi = hi;
400 mddev->pers->quiesce(mddev, 1);
401 mddev->pers->quiesce(mddev, 0);
392 spin_lock_irq(&cinfo->suspend_lock); 402 spin_lock_irq(&cinfo->suspend_lock);
393 /* Remove existing entry (if exists) before adding */ 403 /* Remove existing entry (if exists) before adding */
394 __remove_suspend_info(cinfo, slot); 404 __remove_suspend_info(cinfo, slot);
395 list_add(&s->list, &cinfo->suspend_list); 405 list_add(&s->list, &cinfo->suspend_list);
396 spin_unlock_irq(&cinfo->suspend_lock); 406 spin_unlock_irq(&cinfo->suspend_lock);
407 mddev->pers->quiesce(mddev, 2);
397} 408}
398 409
399static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) 410static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
@@ -407,7 +418,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
407 418
408 len = snprintf(disk_uuid, 64, "DEVICE_UUID="); 419 len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
409 sprintf(disk_uuid + len, "%pU", cmsg->uuid); 420 sprintf(disk_uuid + len, "%pU", cmsg->uuid);
410 snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); 421 snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
411 pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); 422 pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
412 init_completion(&cinfo->newdisk_completion); 423 init_completion(&cinfo->newdisk_completion);
413 set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); 424 set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
@@ -421,63 +432,57 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
421static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) 432static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
422{ 433{
423 struct md_cluster_info *cinfo = mddev->cluster_info; 434 struct md_cluster_info *cinfo = mddev->cluster_info;
424 435 md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
425 md_reload_sb(mddev);
426 dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); 436 dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
427} 437}
428 438
429static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) 439static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
430{ 440{
431 struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); 441 struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
442 le32_to_cpu(msg->raid_slot));
432 443
433 if (rdev) 444 if (rdev)
434 md_kick_rdev_from_array(rdev); 445 md_kick_rdev_from_array(rdev);
435 else 446 else
436 pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); 447 pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
448 __func__, __LINE__, le32_to_cpu(msg->raid_slot));
437} 449}
438 450
439static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) 451static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
440{ 452{
441 struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); 453 struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
454 le32_to_cpu(msg->raid_slot));
442 455
443 if (rdev && test_bit(Faulty, &rdev->flags)) 456 if (rdev && test_bit(Faulty, &rdev->flags))
444 clear_bit(Faulty, &rdev->flags); 457 clear_bit(Faulty, &rdev->flags);
445 else 458 else
446 pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); 459 pr_warn("%s: %d Could not find disk(%d) which is faulty",
460 __func__, __LINE__, le32_to_cpu(msg->raid_slot));
447} 461}
448 462
449static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) 463static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
450{ 464{
465 if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
466 "node %d received it's own msg\n", le32_to_cpu(msg->slot)))
467 return;
451 switch (msg->type) { 468 switch (msg->type) {
452 case METADATA_UPDATED: 469 case METADATA_UPDATED:
453 pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
454 __func__, __LINE__, msg->slot);
455 process_metadata_update(mddev, msg); 470 process_metadata_update(mddev, msg);
456 break; 471 break;
457 case RESYNCING: 472 case RESYNCING:
458 pr_info("%s: %d Received message: RESYNCING from %d\n", 473 process_suspend_info(mddev, msg->slot,
459 __func__, __LINE__, msg->slot);
460 process_suspend_info(mddev->cluster_info, msg->slot,
461 msg->low, msg->high); 474 msg->low, msg->high);
462 break; 475 break;
463 case NEWDISK: 476 case NEWDISK:
464 pr_info("%s: %d Received message: NEWDISK from %d\n",
465 __func__, __LINE__, msg->slot);
466 process_add_new_disk(mddev, msg); 477 process_add_new_disk(mddev, msg);
467 break; 478 break;
468 case REMOVE: 479 case REMOVE:
469 pr_info("%s: %d Received REMOVE from %d\n",
470 __func__, __LINE__, msg->slot);
471 process_remove_disk(mddev, msg); 480 process_remove_disk(mddev, msg);
472 break; 481 break;
473 case RE_ADD: 482 case RE_ADD:
474 pr_info("%s: %d Received RE_ADD from %d\n",
475 __func__, __LINE__, msg->slot);
476 process_readd_disk(mddev, msg); 483 process_readd_disk(mddev, msg);
477 break; 484 break;
478 case BITMAP_NEEDS_SYNC: 485 case BITMAP_NEEDS_SYNC:
479 pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
480 __func__, __LINE__, msg->slot);
481 __recover_slot(mddev, msg->slot); 486 __recover_slot(mddev, msg->slot);
482 break; 487 break;
483 default: 488 default:
@@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread)
528/* lock_comm() 533/* lock_comm()
529 * Takes the lock on the TOKEN lock resource so no other 534 * Takes the lock on the TOKEN lock resource so no other
530 * node can communicate while the operation is underway. 535 * node can communicate while the operation is underway.
536 * If called again, and the TOKEN lock is alread in EX mode
537 * return success. However, care must be taken that unlock_comm()
538 * is called only once.
531 */ 539 */
532static int lock_comm(struct md_cluster_info *cinfo) 540static int lock_comm(struct md_cluster_info *cinfo)
533{ 541{
534 int error; 542 int error;
535 543
544 if (cinfo->token_lockres->mode == DLM_LOCK_EX)
545 return 0;
546
536 error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); 547 error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
537 if (error) 548 if (error)
538 pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", 549 pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
@@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
542 553
543static void unlock_comm(struct md_cluster_info *cinfo) 554static void unlock_comm(struct md_cluster_info *cinfo)
544{ 555{
556 WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
545 dlm_unlock_sync(cinfo->token_lockres); 557 dlm_unlock_sync(cinfo->token_lockres);
546} 558}
547 559
@@ -753,6 +765,10 @@ static int join(struct mddev *mddev, int nodes)
753 goto err; 765 goto err;
754 } 766 }
755 767
768 cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
769 if (!cinfo->resync_lockres)
770 goto err;
771
756 ret = gather_all_resync_info(mddev, nodes); 772 ret = gather_all_resync_info(mddev, nodes);
757 if (ret) 773 if (ret)
758 goto err; 774 goto err;
@@ -763,6 +779,7 @@ err:
763 lockres_free(cinfo->token_lockres); 779 lockres_free(cinfo->token_lockres);
764 lockres_free(cinfo->ack_lockres); 780 lockres_free(cinfo->ack_lockres);
765 lockres_free(cinfo->no_new_dev_lockres); 781 lockres_free(cinfo->no_new_dev_lockres);
782 lockres_free(cinfo->resync_lockres);
766 lockres_free(cinfo->bitmap_lockres); 783 lockres_free(cinfo->bitmap_lockres);
767 if (cinfo->lockspace) 784 if (cinfo->lockspace)
768 dlm_release_lockspace(cinfo->lockspace, 2); 785 dlm_release_lockspace(cinfo->lockspace, 2);
@@ -771,12 +788,32 @@ err:
771 return ret; 788 return ret;
772} 789}
773 790
791static void resync_bitmap(struct mddev *mddev)
792{
793 struct md_cluster_info *cinfo = mddev->cluster_info;
794 struct cluster_msg cmsg = {0};
795 int err;
796
797 cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
798 err = sendmsg(cinfo, &cmsg);
799 if (err)
800 pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
801 __func__, __LINE__, err);
802}
803
774static int leave(struct mddev *mddev) 804static int leave(struct mddev *mddev)
775{ 805{
776 struct md_cluster_info *cinfo = mddev->cluster_info; 806 struct md_cluster_info *cinfo = mddev->cluster_info;
777 807
778 if (!cinfo) 808 if (!cinfo)
779 return 0; 809 return 0;
810
811 /* BITMAP_NEEDS_SYNC message should be sent when node
812 * is leaving the cluster with dirty bitmap, also we
813 * can only deliver it when dlm connection is available */
814 if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
815 resync_bitmap(mddev);
816
780 md_unregister_thread(&cinfo->recovery_thread); 817 md_unregister_thread(&cinfo->recovery_thread);
781 md_unregister_thread(&cinfo->recv_thread); 818 md_unregister_thread(&cinfo->recv_thread);
782 lockres_free(cinfo->message_lockres); 819 lockres_free(cinfo->message_lockres);
@@ -799,15 +836,6 @@ static int slot_number(struct mddev *mddev)
799 return cinfo->slot_number - 1; 836 return cinfo->slot_number - 1;
800} 837}
801 838
802static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
803{
804 struct md_cluster_info *cinfo = mddev->cluster_info;
805
806 add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
807 /* Re-acquire the lock to refresh LVB */
808 dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
809}
810
811static int metadata_update_start(struct mddev *mddev) 839static int metadata_update_start(struct mddev *mddev)
812{ 840{
813 return lock_comm(mddev->cluster_info); 841 return lock_comm(mddev->cluster_info);
@@ -817,59 +845,61 @@ static int metadata_update_finish(struct mddev *mddev)
817{ 845{
818 struct md_cluster_info *cinfo = mddev->cluster_info; 846 struct md_cluster_info *cinfo = mddev->cluster_info;
819 struct cluster_msg cmsg; 847 struct cluster_msg cmsg;
820 int ret; 848 struct md_rdev *rdev;
849 int ret = 0;
821 850
822 memset(&cmsg, 0, sizeof(cmsg)); 851 memset(&cmsg, 0, sizeof(cmsg));
823 cmsg.type = cpu_to_le32(METADATA_UPDATED); 852 cmsg.type = cpu_to_le32(METADATA_UPDATED);
824 ret = __sendmsg(cinfo, &cmsg); 853 cmsg.raid_slot = -1;
854 /* Pick up a good active device number to send.
855 */
856 rdev_for_each(rdev, mddev)
857 if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
858 cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
859 break;
860 }
861 if (cmsg.raid_slot >= 0)
862 ret = __sendmsg(cinfo, &cmsg);
863 else
864 pr_warn("md-cluster: No good device id found to send\n");
825 unlock_comm(cinfo); 865 unlock_comm(cinfo);
826 return ret; 866 return ret;
827} 867}
828 868
829static int metadata_update_cancel(struct mddev *mddev) 869static void metadata_update_cancel(struct mddev *mddev)
830{ 870{
831 struct md_cluster_info *cinfo = mddev->cluster_info; 871 struct md_cluster_info *cinfo = mddev->cluster_info;
872 unlock_comm(cinfo);
873}
832 874
833 return dlm_unlock_sync(cinfo->token_lockres); 875static int resync_start(struct mddev *mddev)
876{
877 struct md_cluster_info *cinfo = mddev->cluster_info;
878 cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
879 return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
834} 880}
835 881
836static int resync_send(struct mddev *mddev, enum msg_type type, 882static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
837 sector_t lo, sector_t hi)
838{ 883{
839 struct md_cluster_info *cinfo = mddev->cluster_info; 884 struct md_cluster_info *cinfo = mddev->cluster_info;
840 struct cluster_msg cmsg; 885 struct cluster_msg cmsg = {0};
841 int slot = cinfo->slot_number - 1;
842 886
843 pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, 887 add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
844 (unsigned long long)lo, 888 /* Re-acquire the lock to refresh LVB */
845 (unsigned long long)hi); 889 dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
846 resync_info_update(mddev, lo, hi); 890 cmsg.type = cpu_to_le32(RESYNCING);
847 cmsg.type = cpu_to_le32(type);
848 cmsg.slot = cpu_to_le32(slot);
849 cmsg.low = cpu_to_le64(lo); 891 cmsg.low = cpu_to_le64(lo);
850 cmsg.high = cpu_to_le64(hi); 892 cmsg.high = cpu_to_le64(hi);
851 return sendmsg(cinfo, &cmsg);
852}
853 893
854static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) 894 return sendmsg(cinfo, &cmsg);
855{
856 pr_info("%s:%d\n", __func__, __LINE__);
857 return resync_send(mddev, RESYNCING, lo, hi);
858} 895}
859 896
860static void resync_finish(struct mddev *mddev) 897static int resync_finish(struct mddev *mddev)
861{ 898{
862 struct md_cluster_info *cinfo = mddev->cluster_info; 899 struct md_cluster_info *cinfo = mddev->cluster_info;
863 struct cluster_msg cmsg; 900 cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
864 int slot = cinfo->slot_number - 1; 901 dlm_unlock_sync(cinfo->resync_lockres);
865 902 return resync_info_update(mddev, 0, 0);
866 pr_info("%s:%d\n", __func__, __LINE__);
867 resync_send(mddev, RESYNCING, 0, 0);
868 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
869 cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
870 cmsg.slot = cpu_to_le32(slot);
871 sendmsg(cinfo, &cmsg);
872 }
873} 903}
874 904
875static int area_resyncing(struct mddev *mddev, int direction, 905static int area_resyncing(struct mddev *mddev, int direction,
@@ -896,7 +926,11 @@ out:
896 return ret; 926 return ret;
897} 927}
898 928
899static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) 929/* add_new_disk() - initiates a disk add
930 * However, if this fails before writing md_update_sb(),
931 * add_new_disk_cancel() must be called to release token lock
932 */
933static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
900{ 934{
901 struct md_cluster_info *cinfo = mddev->cluster_info; 935 struct md_cluster_info *cinfo = mddev->cluster_info;
902 struct cluster_msg cmsg; 936 struct cluster_msg cmsg;
@@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
907 memset(&cmsg, 0, sizeof(cmsg)); 941 memset(&cmsg, 0, sizeof(cmsg));
908 cmsg.type = cpu_to_le32(NEWDISK); 942 cmsg.type = cpu_to_le32(NEWDISK);
909 memcpy(cmsg.uuid, uuid, 16); 943 memcpy(cmsg.uuid, uuid, 16);
910 cmsg.raid_slot = rdev->desc_nr; 944 cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
911 lock_comm(cinfo); 945 lock_comm(cinfo);
912 ret = __sendmsg(cinfo, &cmsg); 946 ret = __sendmsg(cinfo, &cmsg);
913 if (ret) 947 if (ret)
@@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
918 /* Some node does not "see" the device */ 952 /* Some node does not "see" the device */
919 if (ret == -EAGAIN) 953 if (ret == -EAGAIN)
920 ret = -ENOENT; 954 ret = -ENOENT;
955 if (ret)
956 unlock_comm(cinfo);
921 else 957 else
922 dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); 958 dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
923 return ret; 959 return ret;
924} 960}
925 961
926static int add_new_disk_finish(struct mddev *mddev) 962static void add_new_disk_cancel(struct mddev *mddev)
927{ 963{
928 struct cluster_msg cmsg;
929 struct md_cluster_info *cinfo = mddev->cluster_info; 964 struct md_cluster_info *cinfo = mddev->cluster_info;
930 int ret;
931 /* Write sb and inform others */
932 md_update_sb(mddev, 1);
933 cmsg.type = METADATA_UPDATED;
934 ret = __sendmsg(cinfo, &cmsg);
935 unlock_comm(cinfo); 965 unlock_comm(cinfo);
936 return ret;
937} 966}
938 967
939static int new_disk_ack(struct mddev *mddev, bool ack) 968static int new_disk_ack(struct mddev *mddev, bool ack)
@@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
953 982
954static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) 983static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
955{ 984{
956 struct cluster_msg cmsg; 985 struct cluster_msg cmsg = {0};
957 struct md_cluster_info *cinfo = mddev->cluster_info; 986 struct md_cluster_info *cinfo = mddev->cluster_info;
958 cmsg.type = REMOVE; 987 cmsg.type = cpu_to_le32(REMOVE);
959 cmsg.raid_slot = rdev->desc_nr; 988 cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
960 return __sendmsg(cinfo, &cmsg); 989 return __sendmsg(cinfo, &cmsg);
961} 990}
962 991
@@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
964{ 993{
965 int sn, err; 994 int sn, err;
966 sector_t lo, hi; 995 sector_t lo, hi;
967 struct cluster_msg cmsg; 996 struct cluster_msg cmsg = {0};
968 struct mddev *mddev = rdev->mddev; 997 struct mddev *mddev = rdev->mddev;
969 struct md_cluster_info *cinfo = mddev->cluster_info; 998 struct md_cluster_info *cinfo = mddev->cluster_info;
970 999
971 cmsg.type = RE_ADD; 1000 cmsg.type = cpu_to_le32(RE_ADD);
972 cmsg.raid_slot = rdev->desc_nr; 1001 cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
973 err = sendmsg(cinfo, &cmsg); 1002 err = sendmsg(cinfo, &cmsg);
974 if (err) 1003 if (err)
975 goto out; 1004 goto out;
@@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
993 .join = join, 1022 .join = join,
994 .leave = leave, 1023 .leave = leave,
995 .slot_number = slot_number, 1024 .slot_number = slot_number,
996 .resync_info_update = resync_info_update,
997 .resync_start = resync_start, 1025 .resync_start = resync_start,
998 .resync_finish = resync_finish, 1026 .resync_finish = resync_finish,
1027 .resync_info_update = resync_info_update,
999 .metadata_update_start = metadata_update_start, 1028 .metadata_update_start = metadata_update_start,
1000 .metadata_update_finish = metadata_update_finish, 1029 .metadata_update_finish = metadata_update_finish,
1001 .metadata_update_cancel = metadata_update_cancel, 1030 .metadata_update_cancel = metadata_update_cancel,
1002 .area_resyncing = area_resyncing, 1031 .area_resyncing = area_resyncing,
1003 .add_new_disk_start = add_new_disk_start, 1032 .add_new_disk = add_new_disk,
1004 .add_new_disk_finish = add_new_disk_finish, 1033 .add_new_disk_cancel = add_new_disk_cancel,
1005 .new_disk_ack = new_disk_ack, 1034 .new_disk_ack = new_disk_ack,
1006 .remove_disk = remove_disk, 1035 .remove_disk = remove_disk,
1007 .gather_bitmaps = gather_bitmaps, 1036 .gather_bitmaps = gather_bitmaps,
@@ -1022,5 +1051,6 @@ static void cluster_exit(void)
1022 1051
1023module_init(cluster_init); 1052module_init(cluster_init);
1024module_exit(cluster_exit); 1053module_exit(cluster_exit);
1054MODULE_AUTHOR("SUSE");
1025MODULE_LICENSE("GPL"); 1055MODULE_LICENSE("GPL");
1026MODULE_DESCRIPTION("Clustering support for MD"); 1056MODULE_DESCRIPTION("Clustering support for MD");
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 00defe2badbc..e75ea2613184 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -12,15 +12,15 @@ struct md_cluster_operations {
12 int (*join)(struct mddev *mddev, int nodes); 12 int (*join)(struct mddev *mddev, int nodes);
13 int (*leave)(struct mddev *mddev); 13 int (*leave)(struct mddev *mddev);
14 int (*slot_number)(struct mddev *mddev); 14 int (*slot_number)(struct mddev *mddev);
15 void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); 15 int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
16 int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
17 void (*resync_finish)(struct mddev *mddev);
18 int (*metadata_update_start)(struct mddev *mddev); 16 int (*metadata_update_start)(struct mddev *mddev);
19 int (*metadata_update_finish)(struct mddev *mddev); 17 int (*metadata_update_finish)(struct mddev *mddev);
20 int (*metadata_update_cancel)(struct mddev *mddev); 18 void (*metadata_update_cancel)(struct mddev *mddev);
19 int (*resync_start)(struct mddev *mddev);
20 int (*resync_finish)(struct mddev *mddev);
21 int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); 21 int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
22 int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); 22 int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
23 int (*add_new_disk_finish)(struct mddev *mddev); 23 void (*add_new_disk_cancel)(struct mddev *mddev);
24 int (*new_disk_ack)(struct mddev *mddev, bool ack); 24 int (*new_disk_ack)(struct mddev *mddev, bool ack);
25 int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); 25 int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
26 int (*gather_bitmaps)(struct md_rdev *rdev); 26 int (*gather_bitmaps)(struct md_rdev *rdev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c702de18207a..a71b36f0acb0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1735,6 +1735,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1735 } 1735 }
1736 } 1736 }
1737 1737
1738 if (mddev_is_clustered(mddev))
1739 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1740
1738 if (rdev->badblocks.count == 0) 1741 if (rdev->badblocks.count == 0)
1739 /* Nothing to do for bad blocks*/ ; 1742 /* Nothing to do for bad blocks*/ ;
1740 else if (sb->bblog_offset == 0) 1743 else if (sb->bblog_offset == 0)
@@ -2196,18 +2199,72 @@ static void sync_sbs(struct mddev *mddev, int nospares)
2196 } 2199 }
2197} 2200}
2198 2201
2202static bool does_sb_need_changing(struct mddev *mddev)
2203{
2204 struct md_rdev *rdev;
2205 struct mdp_superblock_1 *sb;
2206 int role;
2207
2208 /* Find a good rdev */
2209 rdev_for_each(rdev, mddev)
2210 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2211 break;
2212
2213 /* No good device found. */
2214 if (!rdev)
2215 return false;
2216
2217 sb = page_address(rdev->sb_page);
2218 /* Check if a device has become faulty or a spare become active */
2219 rdev_for_each(rdev, mddev) {
2220 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2221 /* Device activated? */
2222 if (role == 0xffff && rdev->raid_disk >=0 &&
2223 !test_bit(Faulty, &rdev->flags))
2224 return true;
2225 /* Device turned faulty? */
2226 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2227 return true;
2228 }
2229
2230 /* Check if any mddev parameters have changed */
2231 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2232 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2233 (mddev->recovery_cp != le64_to_cpu(sb->resync_offset)) ||
2234 (mddev->layout != le64_to_cpu(sb->layout)) ||
2235 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2236 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2237 return true;
2238
2239 return false;
2240}
2241
2199void md_update_sb(struct mddev *mddev, int force_change) 2242void md_update_sb(struct mddev *mddev, int force_change)
2200{ 2243{
2201 struct md_rdev *rdev; 2244 struct md_rdev *rdev;
2202 int sync_req; 2245 int sync_req;
2203 int nospares = 0; 2246 int nospares = 0;
2204 int any_badblocks_changed = 0; 2247 int any_badblocks_changed = 0;
2248 int ret = -1;
2205 2249
2206 if (mddev->ro) { 2250 if (mddev->ro) {
2207 if (force_change) 2251 if (force_change)
2208 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2252 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2209 return; 2253 return;
2210 } 2254 }
2255
2256 if (mddev_is_clustered(mddev)) {
2257 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2258 force_change = 1;
2259 ret = md_cluster_ops->metadata_update_start(mddev);
2260 /* Has someone else has updated the sb */
2261 if (!does_sb_need_changing(mddev)) {
2262 if (ret == 0)
2263 md_cluster_ops->metadata_update_cancel(mddev);
2264 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2265 return;
2266 }
2267 }
2211repeat: 2268repeat:
2212 /* First make sure individual recovery_offsets are correct */ 2269 /* First make sure individual recovery_offsets are correct */
2213 rdev_for_each(rdev, mddev) { 2270 rdev_for_each(rdev, mddev) {
@@ -2356,6 +2413,9 @@ repeat:
2356 clear_bit(BlockedBadBlocks, &rdev->flags); 2413 clear_bit(BlockedBadBlocks, &rdev->flags);
2357 wake_up(&rdev->blocked_wait); 2414 wake_up(&rdev->blocked_wait);
2358 } 2415 }
2416
2417 if (mddev_is_clustered(mddev) && ret == 0)
2418 md_cluster_ops->metadata_update_finish(mddev);
2359} 2419}
2360EXPORT_SYMBOL(md_update_sb); 2420EXPORT_SYMBOL(md_update_sb);
2361 2421
@@ -2490,17 +2550,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2490 err = -EBUSY; 2550 err = -EBUSY;
2491 else { 2551 else {
2492 struct mddev *mddev = rdev->mddev; 2552 struct mddev *mddev = rdev->mddev;
2493 if (mddev_is_clustered(mddev))
2494 md_cluster_ops->remove_disk(mddev, rdev);
2495 md_kick_rdev_from_array(rdev);
2496 if (mddev_is_clustered(mddev))
2497 md_cluster_ops->metadata_update_start(mddev);
2498 if (mddev->pers)
2499 md_update_sb(mddev, 1);
2500 md_new_event(mddev);
2501 if (mddev_is_clustered(mddev))
2502 md_cluster_ops->metadata_update_finish(mddev);
2503 err = 0; 2553 err = 0;
2554 if (mddev_is_clustered(mddev))
2555 err = md_cluster_ops->remove_disk(mddev, rdev);
2556
2557 if (err == 0) {
2558 md_kick_rdev_from_array(rdev);
2559 if (mddev->pers)
2560 md_update_sb(mddev, 1);
2561 md_new_event(mddev);
2562 }
2504 } 2563 }
2505 } else if (cmd_match(buf, "writemostly")) { 2564 } else if (cmd_match(buf, "writemostly")) {
2506 set_bit(WriteMostly, &rdev->flags); 2565 set_bit(WriteMostly, &rdev->flags);
@@ -2688,15 +2747,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2688 rdev->saved_raid_disk = -1; 2747 rdev->saved_raid_disk = -1;
2689 clear_bit(In_sync, &rdev->flags); 2748 clear_bit(In_sync, &rdev->flags);
2690 clear_bit(Bitmap_sync, &rdev->flags); 2749 clear_bit(Bitmap_sync, &rdev->flags);
2691 err = rdev->mddev->pers-> 2750 remove_and_add_spares(rdev->mddev, rdev);
2692 hot_add_disk(rdev->mddev, rdev); 2751 if (rdev->raid_disk == -1)
2693 if (err) { 2752 return -EBUSY;
2694 rdev->raid_disk = -1;
2695 return err;
2696 } else
2697 sysfs_notify_dirent_safe(rdev->sysfs_state);
2698 if (sysfs_link_rdev(rdev->mddev, rdev))
2699 /* failure here is OK */;
2700 /* don't wakeup anyone, leave that to userspace. */ 2753 /* don't wakeup anyone, leave that to userspace. */
2701 } else { 2754 } else {
2702 if (slot >= rdev->mddev->raid_disks && 2755 if (slot >= rdev->mddev->raid_disks &&
@@ -3198,14 +3251,6 @@ static void analyze_sbs(struct mddev *mddev)
3198 md_kick_rdev_from_array(rdev); 3251 md_kick_rdev_from_array(rdev);
3199 continue; 3252 continue;
3200 } 3253 }
3201 /* No device should have a Candidate flag
3202 * when reading devices
3203 */
3204 if (test_bit(Candidate, &rdev->flags)) {
3205 pr_info("md: kicking Cluster Candidate %s from array!\n",
3206 bdevname(rdev->bdev, b));
3207 md_kick_rdev_from_array(rdev);
3208 }
3209 } 3254 }
3210 if (mddev->level == LEVEL_MULTIPATH) { 3255 if (mddev->level == LEVEL_MULTIPATH) {
3211 rdev->desc_nr = i++; 3256 rdev->desc_nr = i++;
@@ -4066,12 +4111,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
4066 if (err) 4111 if (err)
4067 return err; 4112 return err;
4068 if (mddev->pers) { 4113 if (mddev->pers) {
4069 if (mddev_is_clustered(mddev))
4070 md_cluster_ops->metadata_update_start(mddev);
4071 err = update_size(mddev, sectors); 4114 err = update_size(mddev, sectors);
4072 md_update_sb(mddev, 1); 4115 md_update_sb(mddev, 1);
4073 if (mddev_is_clustered(mddev))
4074 md_cluster_ops->metadata_update_finish(mddev);
4075 } else { 4116 } else {
4076 if (mddev->dev_sectors == 0 || 4117 if (mddev->dev_sectors == 0 ||
4077 mddev->dev_sectors > sectors) 4118 mddev->dev_sectors > sectors)
@@ -5309,8 +5350,6 @@ static void md_clean(struct mddev *mddev)
5309 5350
5310static void __md_stop_writes(struct mddev *mddev) 5351static void __md_stop_writes(struct mddev *mddev)
5311{ 5352{
5312 if (mddev_is_clustered(mddev))
5313 md_cluster_ops->metadata_update_start(mddev);
5314 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5353 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5315 flush_workqueue(md_misc_wq); 5354 flush_workqueue(md_misc_wq);
5316 if (mddev->sync_thread) { 5355 if (mddev->sync_thread) {
@@ -5329,8 +5368,6 @@ static void __md_stop_writes(struct mddev *mddev)
5329 mddev->in_sync = 1; 5368 mddev->in_sync = 1;
5330 md_update_sb(mddev, 1); 5369 md_update_sb(mddev, 1);
5331 } 5370 }
5332 if (mddev_is_clustered(mddev))
5333 md_cluster_ops->metadata_update_finish(mddev);
5334} 5371}
5335 5372
5336void md_stop_writes(struct mddev *mddev) 5373void md_stop_writes(struct mddev *mddev)
@@ -5910,19 +5947,12 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
5910 * check whether the device shows up in other nodes 5947 * check whether the device shows up in other nodes
5911 */ 5948 */
5912 if (mddev_is_clustered(mddev)) { 5949 if (mddev_is_clustered(mddev)) {
5913 if (info->state & (1 << MD_DISK_CANDIDATE)) { 5950 if (info->state & (1 << MD_DISK_CANDIDATE))
5914 /* Through --cluster-confirm */
5915 set_bit(Candidate, &rdev->flags); 5951 set_bit(Candidate, &rdev->flags);
5916 err = md_cluster_ops->new_disk_ack(mddev, true); 5952 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
5917 if (err) {
5918 export_rdev(rdev);
5919 return err;
5920 }
5921 } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
5922 /* --add initiated by this node */ 5953 /* --add initiated by this node */
5923 err = md_cluster_ops->add_new_disk_start(mddev, rdev); 5954 err = md_cluster_ops->add_new_disk(mddev, rdev);
5924 if (err) { 5955 if (err) {
5925 md_cluster_ops->add_new_disk_finish(mddev);
5926 export_rdev(rdev); 5956 export_rdev(rdev);
5927 return err; 5957 return err;
5928 } 5958 }
@@ -5931,13 +5961,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
5931 5961
5932 rdev->raid_disk = -1; 5962 rdev->raid_disk = -1;
5933 err = bind_rdev_to_array(rdev, mddev); 5963 err = bind_rdev_to_array(rdev, mddev);
5964
5934 if (err) 5965 if (err)
5935 export_rdev(rdev); 5966 export_rdev(rdev);
5936 else 5967
5968 if (mddev_is_clustered(mddev)) {
5969 if (info->state & (1 << MD_DISK_CANDIDATE))
5970 md_cluster_ops->new_disk_ack(mddev, (err == 0));
5971 else {
5972 if (err)
5973 md_cluster_ops->add_new_disk_cancel(mddev);
5974 else
5975 err = add_bound_rdev(rdev);
5976 }
5977
5978 } else if (!err)
5937 err = add_bound_rdev(rdev); 5979 err = add_bound_rdev(rdev);
5938 if (mddev_is_clustered(mddev) && 5980
5939 (info->state & (1 << MD_DISK_CLUSTER_ADD)))
5940 md_cluster_ops->add_new_disk_finish(mddev);
5941 return err; 5981 return err;
5942 } 5982 }
5943 5983
@@ -5993,13 +6033,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
5993{ 6033{
5994 char b[BDEVNAME_SIZE]; 6034 char b[BDEVNAME_SIZE];
5995 struct md_rdev *rdev; 6035 struct md_rdev *rdev;
6036 int ret = -1;
5996 6037
5997 rdev = find_rdev(mddev, dev); 6038 rdev = find_rdev(mddev, dev);
5998 if (!rdev) 6039 if (!rdev)
5999 return -ENXIO; 6040 return -ENXIO;
6000 6041
6001 if (mddev_is_clustered(mddev)) 6042 if (mddev_is_clustered(mddev))
6002 md_cluster_ops->metadata_update_start(mddev); 6043 ret = md_cluster_ops->metadata_update_start(mddev);
6044
6045 if (rdev->raid_disk < 0)
6046 goto kick_rdev;
6003 6047
6004 clear_bit(Blocked, &rdev->flags); 6048 clear_bit(Blocked, &rdev->flags);
6005 remove_and_add_spares(mddev, rdev); 6049 remove_and_add_spares(mddev, rdev);
@@ -6007,20 +6051,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6007 if (rdev->raid_disk >= 0) 6051 if (rdev->raid_disk >= 0)
6008 goto busy; 6052 goto busy;
6009 6053
6010 if (mddev_is_clustered(mddev)) 6054kick_rdev:
6055 if (mddev_is_clustered(mddev) && ret == 0)
6011 md_cluster_ops->remove_disk(mddev, rdev); 6056 md_cluster_ops->remove_disk(mddev, rdev);
6012 6057
6013 md_kick_rdev_from_array(rdev); 6058 md_kick_rdev_from_array(rdev);
6014 md_update_sb(mddev, 1); 6059 md_update_sb(mddev, 1);
6015 md_new_event(mddev); 6060 md_new_event(mddev);
6016 6061
6017 if (mddev_is_clustered(mddev))
6018 md_cluster_ops->metadata_update_finish(mddev);
6019
6020 return 0; 6062 return 0;
6021busy: 6063busy:
6022 if (mddev_is_clustered(mddev)) 6064 if (mddev_is_clustered(mddev) && ret == 0)
6023 md_cluster_ops->metadata_update_cancel(mddev); 6065 md_cluster_ops->metadata_update_cancel(mddev);
6066
6024 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 6067 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
6025 bdevname(rdev->bdev,b), mdname(mddev)); 6068 bdevname(rdev->bdev,b), mdname(mddev));
6026 return -EBUSY; 6069 return -EBUSY;
@@ -6071,14 +6114,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
6071 goto abort_export; 6114 goto abort_export;
6072 } 6115 }
6073 6116
6074 if (mddev_is_clustered(mddev))
6075 md_cluster_ops->metadata_update_start(mddev);
6076 clear_bit(In_sync, &rdev->flags); 6117 clear_bit(In_sync, &rdev->flags);
6077 rdev->desc_nr = -1; 6118 rdev->desc_nr = -1;
6078 rdev->saved_raid_disk = -1; 6119 rdev->saved_raid_disk = -1;
6079 err = bind_rdev_to_array(rdev, mddev); 6120 err = bind_rdev_to_array(rdev, mddev);
6080 if (err) 6121 if (err)
6081 goto abort_clustered; 6122 goto abort_export;
6082 6123
6083 /* 6124 /*
6084 * The rest should better be atomic, we can have disk failures 6125 * The rest should better be atomic, we can have disk failures
@@ -6088,9 +6129,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
6088 rdev->raid_disk = -1; 6129 rdev->raid_disk = -1;
6089 6130
6090 md_update_sb(mddev, 1); 6131 md_update_sb(mddev, 1);
6091
6092 if (mddev_is_clustered(mddev))
6093 md_cluster_ops->metadata_update_finish(mddev);
6094 /* 6132 /*
6095 * Kick recovery, maybe this spare has to be added to the 6133 * Kick recovery, maybe this spare has to be added to the
6096 * array immediately. 6134 * array immediately.
@@ -6100,9 +6138,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
6100 md_new_event(mddev); 6138 md_new_event(mddev);
6101 return 0; 6139 return 0;
6102 6140
6103abort_clustered:
6104 if (mddev_is_clustered(mddev))
6105 md_cluster_ops->metadata_update_cancel(mddev);
6106abort_export: 6141abort_export:
6107 export_rdev(rdev); 6142 export_rdev(rdev);
6108 return err; 6143 return err;
@@ -6420,8 +6455,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6420 return rv; 6455 return rv;
6421 } 6456 }
6422 } 6457 }
6423 if (mddev_is_clustered(mddev))
6424 md_cluster_ops->metadata_update_start(mddev);
6425 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6458 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6426 rv = update_size(mddev, (sector_t)info->size * 2); 6459 rv = update_size(mddev, (sector_t)info->size * 2);
6427 6460
@@ -6479,12 +6512,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6479 } 6512 }
6480 } 6513 }
6481 md_update_sb(mddev, 1); 6514 md_update_sb(mddev, 1);
6482 if (mddev_is_clustered(mddev))
6483 md_cluster_ops->metadata_update_finish(mddev);
6484 return rv; 6515 return rv;
6485err: 6516err:
6486 if (mddev_is_clustered(mddev))
6487 md_cluster_ops->metadata_update_cancel(mddev);
6488 return rv; 6517 return rv;
6489} 6518}
6490 6519
@@ -7597,11 +7626,7 @@ int md_allow_write(struct mddev *mddev)
7597 mddev->safemode == 0) 7626 mddev->safemode == 0)
7598 mddev->safemode = 1; 7627 mddev->safemode = 1;
7599 spin_unlock(&mddev->lock); 7628 spin_unlock(&mddev->lock);
7600 if (mddev_is_clustered(mddev))
7601 md_cluster_ops->metadata_update_start(mddev);
7602 md_update_sb(mddev, 0); 7629 md_update_sb(mddev, 0);
7603 if (mddev_is_clustered(mddev))
7604 md_cluster_ops->metadata_update_finish(mddev);
7605 sysfs_notify_dirent_safe(mddev->sysfs_state); 7630 sysfs_notify_dirent_safe(mddev->sysfs_state);
7606 } else 7631 } else
7607 spin_unlock(&mddev->lock); 7632 spin_unlock(&mddev->lock);
@@ -7633,6 +7658,7 @@ void md_do_sync(struct md_thread *thread)
7633 struct md_rdev *rdev; 7658 struct md_rdev *rdev;
7634 char *desc, *action = NULL; 7659 char *desc, *action = NULL;
7635 struct blk_plug plug; 7660 struct blk_plug plug;
7661 bool cluster_resync_finished = false;
7636 7662
7637 /* just incase thread restarts... */ 7663 /* just incase thread restarts... */
7638 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7664 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7802,9 +7828,6 @@ void md_do_sync(struct md_thread *thread)
7802 md_new_event(mddev); 7828 md_new_event(mddev);
7803 update_time = jiffies; 7829 update_time = jiffies;
7804 7830
7805 if (mddev_is_clustered(mddev))
7806 md_cluster_ops->resync_start(mddev, j, max_sectors);
7807
7808 blk_start_plug(&plug); 7831 blk_start_plug(&plug);
7809 while (j < max_sectors) { 7832 while (j < max_sectors) {
7810 sector_t sectors; 7833 sector_t sectors;
@@ -7868,8 +7891,6 @@ void md_do_sync(struct md_thread *thread)
7868 j = max_sectors; 7891 j = max_sectors;
7869 if (j > 2) 7892 if (j > 2)
7870 mddev->curr_resync = j; 7893 mddev->curr_resync = j;
7871 if (mddev_is_clustered(mddev))
7872 md_cluster_ops->resync_info_update(mddev, j, max_sectors);
7873 mddev->curr_mark_cnt = io_sectors; 7894 mddev->curr_mark_cnt = io_sectors;
7874 if (last_check == 0) 7895 if (last_check == 0)
7875 /* this is the earliest that rebuild will be 7896 /* this is the earliest that rebuild will be
@@ -7940,7 +7961,11 @@ void md_do_sync(struct md_thread *thread)
7940 mddev->curr_resync_completed = mddev->curr_resync; 7961 mddev->curr_resync_completed = mddev->curr_resync;
7941 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7962 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7942 } 7963 }
7943 /* tell personality that we are finished */ 7964 /* tell personality and other nodes that we are finished */
7965 if (mddev_is_clustered(mddev)) {
7966 md_cluster_ops->resync_finish(mddev);
7967 cluster_resync_finished = true;
7968 }
7944 mddev->pers->sync_request(mddev, max_sectors, &skipped); 7969 mddev->pers->sync_request(mddev, max_sectors, &skipped);
7945 7970
7946 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 7971 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
@@ -7976,11 +8001,13 @@ void md_do_sync(struct md_thread *thread)
7976 } 8001 }
7977 } 8002 }
7978 skip: 8003 skip:
7979 if (mddev_is_clustered(mddev))
7980 md_cluster_ops->resync_finish(mddev);
7981
7982 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8004 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7983 8005
8006 if (mddev_is_clustered(mddev) &&
8007 test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8008 !cluster_resync_finished)
8009 md_cluster_ops->resync_finish(mddev);
8010
7984 spin_lock(&mddev->lock); 8011 spin_lock(&mddev->lock);
7985 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8012 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7986 /* We completed so min/max setting can be forgotten if used. */ 8013 /* We completed so min/max setting can be forgotten if used. */
@@ -8023,10 +8050,14 @@ static int remove_and_add_spares(struct mddev *mddev,
8023 if (removed && mddev->kobj.sd) 8050 if (removed && mddev->kobj.sd)
8024 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8051 sysfs_notify(&mddev->kobj, NULL, "degraded");
8025 8052
8026 if (this) 8053 if (this && removed)
8027 goto no_add; 8054 goto no_add;
8028 8055
8029 rdev_for_each(rdev, mddev) { 8056 rdev_for_each(rdev, mddev) {
8057 if (this && this != rdev)
8058 continue;
8059 if (test_bit(Candidate, &rdev->flags))
8060 continue;
8030 if (rdev->raid_disk >= 0 && 8061 if (rdev->raid_disk >= 0 &&
8031 !test_bit(In_sync, &rdev->flags) && 8062 !test_bit(In_sync, &rdev->flags) &&
8032 !test_bit(Faulty, &rdev->flags)) 8063 !test_bit(Faulty, &rdev->flags))
@@ -8060,14 +8091,25 @@ no_add:
8060static void md_start_sync(struct work_struct *ws) 8091static void md_start_sync(struct work_struct *ws)
8061{ 8092{
8062 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8093 struct mddev *mddev = container_of(ws, struct mddev, del_work);
8094 int ret = 0;
8095
8096 if (mddev_is_clustered(mddev)) {
8097 ret = md_cluster_ops->resync_start(mddev);
8098 if (ret) {
8099 mddev->sync_thread = NULL;
8100 goto out;
8101 }
8102 }
8063 8103
8064 mddev->sync_thread = md_register_thread(md_do_sync, 8104 mddev->sync_thread = md_register_thread(md_do_sync,
8065 mddev, 8105 mddev,
8066 "resync"); 8106 "resync");
8107out:
8067 if (!mddev->sync_thread) { 8108 if (!mddev->sync_thread) {
8068 printk(KERN_ERR "%s: could not start resync" 8109 if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
8069 " thread...\n", 8110 printk(KERN_ERR "%s: could not start resync"
8070 mdname(mddev)); 8111 " thread...\n",
8112 mdname(mddev));
8071 /* leave the spares where they are, it shouldn't hurt */ 8113 /* leave the spares where they are, it shouldn't hurt */
8072 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8114 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8073 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8115 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -8186,13 +8228,8 @@ void md_check_recovery(struct mddev *mddev)
8186 sysfs_notify_dirent_safe(mddev->sysfs_state); 8228 sysfs_notify_dirent_safe(mddev->sysfs_state);
8187 } 8229 }
8188 8230
8189 if (mddev->flags & MD_UPDATE_SB_FLAGS) { 8231 if (mddev->flags & MD_UPDATE_SB_FLAGS)
8190 if (mddev_is_clustered(mddev))
8191 md_cluster_ops->metadata_update_start(mddev);
8192 md_update_sb(mddev, 0); 8232 md_update_sb(mddev, 0);
8193 if (mddev_is_clustered(mddev))
8194 md_cluster_ops->metadata_update_finish(mddev);
8195 }
8196 8233
8197 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 8234 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
8198 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 8235 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -8290,8 +8327,6 @@ void md_reap_sync_thread(struct mddev *mddev)
8290 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8327 set_bit(MD_CHANGE_DEVS, &mddev->flags);
8291 } 8328 }
8292 } 8329 }
8293 if (mddev_is_clustered(mddev))
8294 md_cluster_ops->metadata_update_start(mddev);
8295 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8330 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8296 mddev->pers->finish_reshape) 8331 mddev->pers->finish_reshape)
8297 mddev->pers->finish_reshape(mddev); 8332 mddev->pers->finish_reshape(mddev);
@@ -8304,8 +8339,6 @@ void md_reap_sync_thread(struct mddev *mddev)
8304 rdev->saved_raid_disk = -1; 8339 rdev->saved_raid_disk = -1;
8305 8340
8306 md_update_sb(mddev, 1); 8341 md_update_sb(mddev, 1);
8307 if (mddev_is_clustered(mddev))
8308 md_cluster_ops->metadata_update_finish(mddev);
8309 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8342 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8310 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8343 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8311 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8344 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -8928,25 +8961,129 @@ err_wq:
8928 return ret; 8961 return ret;
8929} 8962}
8930 8963
8931void md_reload_sb(struct mddev *mddev) 8964static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
8932{ 8965{
8933 struct md_rdev *rdev, *tmp; 8966 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
8967 struct md_rdev *rdev2;
8968 int role, ret;
8969 char b[BDEVNAME_SIZE];
8934 8970
8935 rdev_for_each_safe(rdev, tmp, mddev) { 8971 /* Check for change of roles in the active devices */
8936 rdev->sb_loaded = 0; 8972 rdev_for_each(rdev2, mddev) {
8937 ClearPageUptodate(rdev->sb_page); 8973 if (test_bit(Faulty, &rdev2->flags))
8974 continue;
8975
8976 /* Check if the roles changed */
8977 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
8978
8979 if (test_bit(Candidate, &rdev2->flags)) {
8980 if (role == 0xfffe) {
8981 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
8982 md_kick_rdev_from_array(rdev2);
8983 continue;
8984 }
8985 else
8986 clear_bit(Candidate, &rdev2->flags);
8987 }
8988
8989 if (role != rdev2->raid_disk) {
8990 /* got activated */
8991 if (rdev2->raid_disk == -1 && role != 0xffff) {
8992 rdev2->saved_raid_disk = role;
8993 ret = remove_and_add_spares(mddev, rdev2);
8994 pr_info("Activated spare: %s\n",
8995 bdevname(rdev2->bdev,b));
8996 continue;
8997 }
8998 /* device faulty
8999 * We just want to do the minimum to mark the disk
9000 * as faulty. The recovery is performed by the
9001 * one who initiated the error.
9002 */
9003 if ((role == 0xfffe) || (role == 0xfffd)) {
9004 md_error(mddev, rdev2);
9005 clear_bit(Blocked, &rdev2->flags);
9006 }
9007 }
8938 } 9008 }
8939 mddev->raid_disks = 0; 9009
8940 analyze_sbs(mddev); 9010 /* recovery_cp changed */
8941 rdev_for_each_safe(rdev, tmp, mddev) { 9011 if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp)
8942 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9012 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
8943 /* since we don't write to faulty devices, we figure out if the 9013
8944 * disk is faulty by comparing events 9014 /* Finally set the event to be up to date */
8945 */ 9015 mddev->events = le64_to_cpu(sb->events);
8946 if (mddev->events > sb->events) 9016}
8947 set_bit(Faulty, &rdev->flags); 9017
9018static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9019{
9020 int err;
9021 struct page *swapout = rdev->sb_page;
9022 struct mdp_superblock_1 *sb;
9023
9024 /* Store the sb page of the rdev in the swapout temporary
9025 * variable in case we err in the future
9026 */
9027 rdev->sb_page = NULL;
9028 alloc_disk_sb(rdev);
9029 ClearPageUptodate(rdev->sb_page);
9030 rdev->sb_loaded = 0;
9031 err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
9032
9033 if (err < 0) {
9034 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9035 __func__, __LINE__, rdev->desc_nr, err);
9036 put_page(rdev->sb_page);
9037 rdev->sb_page = swapout;
9038 rdev->sb_loaded = 1;
9039 return err;
9040 }
9041
9042 sb = page_address(rdev->sb_page);
9043 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
9044 * is not set
9045 */
9046
9047 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9048 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9049
9050 /* The other node finished recovery, call spare_active to set
9051 * device In_sync and mddev->degraded
9052 */
9053 if (rdev->recovery_offset == MaxSector &&
9054 !test_bit(In_sync, &rdev->flags) &&
9055 mddev->pers->spare_active(mddev))
9056 sysfs_notify(&mddev->kobj, NULL, "degraded");
9057
9058 put_page(swapout);
9059 return 0;
9060}
9061
9062void md_reload_sb(struct mddev *mddev, int nr)
9063{
9064 struct md_rdev *rdev;
9065 int err;
9066
9067 /* Find the rdev */
9068 rdev_for_each_rcu(rdev, mddev) {
9069 if (rdev->desc_nr == nr)
9070 break;
9071 }
9072
9073 if (!rdev || rdev->desc_nr != nr) {
9074 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9075 return;
8948 } 9076 }
8949 9077
9078 err = read_rdev(mddev, rdev);
9079 if (err < 0)
9080 return;
9081
9082 check_sb_changes(mddev, rdev);
9083
9084 /* Read all rdev's to update recovery_offset */
9085 rdev_for_each_rcu(rdev, mddev)
9086 read_rdev(mddev, rdev);
8950} 9087}
8951EXPORT_SYMBOL(md_reload_sb); 9088EXPORT_SYMBOL(md_reload_sb);
8952 9089
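
The check_sb_changes() hunk above boils down to comparing each device's on-disk role against the in-memory raid_disk and reacting to the difference. A minimal userspace sketch of that decision flow, assuming only the 0xffff (spare) and 0xfffe (faulty) role markers that appear in the hunk; the enum, helper name and main() are illustrative and not kernel code:

/* Sketch of the role comparison performed by check_sb_changes();
 * purely illustrative, compiled nowhere near the kernel. */
#include <stdio.h>
#include <stdint.h>

#define ROLE_SPARE	0xffffu
#define ROLE_FAULTY	0xfffeu

enum action { KEEP, KICK_FROM_ARRAY, ACTIVATE_SPARE, MARK_FAULTY };

static enum action resolve_role(uint16_t sb_role, int mem_raid_disk,
				int is_candidate)
{
	/* A Candidate whose add failed on the initiator gets kicked out. */
	if (is_candidate && sb_role == ROLE_FAULTY)
		return KICK_FROM_ARRAY;
	/* Role unchanged: nothing to do. */
	if (sb_role == (uint16_t)mem_raid_disk)
		return KEEP;
	/* We still see a spare, but the initiator activated it. */
	if (mem_raid_disk == -1 && sb_role != ROLE_SPARE)
		return ACTIVATE_SPARE;
	/* The initiator marked the device faulty; record that locally. */
	if (sb_role == ROLE_FAULTY)
		return MARK_FAULTY;
	return KEEP;
}

int main(void)
{
	printf("%d\n", resolve_role(ROLE_FAULTY, 2, 0));	/* 3: MARK_FAULTY */
	printf("%d\n", resolve_role(1, -1, 0));			/* 2: ACTIVATE_SPARE */
	return 0;
}

The hunk additionally treats role 0xfffd like the faulty case and simply clears the Candidate bit when the add succeeded; both branches are omitted here for brevity.
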
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ab339571e57f..2ea00356bb23 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -658,7 +658,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
658 struct mddev *mddev); 658 struct mddev *mddev);
659 659
660extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); 660extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
661extern void md_reload_sb(struct mddev *mddev); 661extern void md_reload_sb(struct mddev *mddev, int raid_disk);
662extern void md_update_sb(struct mddev *mddev, int force); 662extern void md_update_sb(struct mddev *mddev, int force);
663extern void md_kick_rdev_from_array(struct md_rdev * rdev); 663extern void md_kick_rdev_from_array(struct md_rdev * rdev);
664struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); 664struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ddd8a5f572aa..ce2d797f8787 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
90#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 90#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
91#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) 91#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
92#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) 92#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
93#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
94#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
93#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) 95#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
94 96
95static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 97static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
@@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1590 if (rdev->raid_disk >= 0) 1592 if (rdev->raid_disk >= 0)
1591 first = last = rdev->raid_disk; 1593 first = last = rdev->raid_disk;
1592 1594
1595 /*
1596 * find the disk ... but prefer rdev->saved_raid_disk
1597 * if possible.
1598 */
1599 if (rdev->saved_raid_disk >= 0 &&
1600 rdev->saved_raid_disk >= first &&
1601 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1602 first = last = rdev->saved_raid_disk;
1603
1593 for (mirror = first; mirror <= last; mirror++) { 1604 for (mirror = first; mirror <= last; mirror++) {
1594 p = conf->mirrors+mirror; 1605 p = conf->mirrors+mirror;
1595 if (!p->rdev) { 1606 if (!p->rdev) {
@@ -2488,6 +2499,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2488 2499
2489 bitmap_close_sync(mddev->bitmap); 2500 bitmap_close_sync(mddev->bitmap);
2490 close_sync(conf); 2501 close_sync(conf);
2502
2503 if (mddev_is_clustered(mddev)) {
2504 conf->cluster_sync_low = 0;
2505 conf->cluster_sync_high = 0;
2506 }
2491 return 0; 2507 return 0;
2492 } 2508 }
2493 2509
@@ -2508,7 +2524,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2508 return sync_blocks; 2524 return sync_blocks;
2509 } 2525 }
2510 2526
 2511 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 2527 /* sector_nr is advanced below, so be conservative and compare
 2528 * cluster_sync_high against sector_nr plus twice RESYNC_SECTORS
 2529 */
2530
2531 bitmap_cond_end_sync(mddev->bitmap, sector_nr,
2532 mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
2512 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 2533 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
2513 2534
2514 raise_barrier(conf, sector_nr); 2535 raise_barrier(conf, sector_nr);
@@ -2699,6 +2720,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2699 bio_full: 2720 bio_full:
2700 r1_bio->sectors = nr_sectors; 2721 r1_bio->sectors = nr_sectors;
2701 2722
2723 if (mddev_is_clustered(mddev) &&
2724 conf->cluster_sync_high < sector_nr + nr_sectors) {
2725 conf->cluster_sync_low = mddev->curr_resync_completed;
2726 conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
2727 /* Send resync message */
2728 md_cluster_ops->resync_info_update(mddev,
2729 conf->cluster_sync_low,
2730 conf->cluster_sync_high);
2731 }
2732
2702 /* For a user-requested sync, we read all readable devices and do a 2733 /* For a user-requested sync, we read all readable devices and do a
2703 * compare 2734 * compare
2704 */ 2735 */
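
Taken together, the raid1.c hunks maintain a [cluster_sync_low, cluster_sync_high) window that trails the resync position and is broadcast via resync_info_update(); bitmap_cond_end_sync() is forced once sector_nr plus twice RESYNC_SECTORS would overrun the advertised high mark. A standalone sketch of the window arithmetic, assuming the usual 64KiB RESYNC_BLOCK_SIZE and RESYNC_DEPTH of 32 (those defines are not part of this excerpt; only the CLUSTER_RESYNC_WINDOW definitions are):

/* Standalone sketch of the clustered resync window arithmetic.
 * RESYNC_BLOCK_SIZE/RESYNC_DEPTH values are assumed defaults; only
 * CLUSTER_RESYNC_WINDOW(_SECTORS) come from the hunk above. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define RESYNC_BLOCK_SIZE		(64 * 1024)	/* assumed */
#define RESYNC_DEPTH			32		/* assumed */
#define RESYNC_SECTORS			(RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW			(RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define CLUSTER_RESYNC_WINDOW		(16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS	(CLUSTER_RESYNC_WINDOW >> 9)

int main(void)
{
	sector_t cluster_sync_low = 0, cluster_sync_high = 0;
	sector_t curr_resync_completed = 0;
	sector_t sector_nr, nr_sectors = RESYNC_SECTORS;

	for (sector_nr = 0; sector_nr < 4 * CLUSTER_RESYNC_WINDOW_SECTORS;
	     sector_nr += nr_sectors) {
		/* Mirrors the end-of-hunk check: once the window is full,
		 * advance it and (in the kernel) send resync_info_update(). */
		if (cluster_sync_high < sector_nr + nr_sectors) {
			cluster_sync_low = curr_resync_completed;
			cluster_sync_high = cluster_sync_low +
					    CLUSTER_RESYNC_WINDOW_SECTORS;
			printf("advertise window [%llu, %llu)\n",
			       (unsigned long long)cluster_sync_low,
			       (unsigned long long)cluster_sync_high);
		}
		curr_resync_completed = sector_nr + nr_sectors;
	}
	return 0;
}

With these assumed defaults the window advances in 32MiB (65536-sector) steps, so the other nodes only see a new low/high pair once per window rather than on every resync request.
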
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c52d7139c5d7..61c39b390cd8 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -111,6 +111,13 @@ struct r1conf {
111 * the new thread here until we fully activate the array. 111 * the new thread here until we fully activate the array.
112 */ 112 */
113 struct md_thread *thread; 113 struct md_thread *thread;
114
115 /* Keep track of cluster resync window to send to other
116 * nodes.
117 */
118 sector_t cluster_sync_low;
119 sector_t cluster_sync_high;
120
114}; 121};
115 122
116/* 123/*
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9f69dc526f8c..e0983c30e73b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3137,7 +3137,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3137 /* resync. Schedule a read for every block at this virt offset */ 3137 /* resync. Schedule a read for every block at this virt offset */
3138 int count = 0; 3138 int count = 0;
3139 3139
3140 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 3140 bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
3141 3141
3142 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 3142 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3143 &sync_blocks, mddev->degraded) && 3143 &sync_blocks, mddev->degraded) &&
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 49bb8d3ff9be..5b79770c4f08 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5613,7 +5613,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
5613 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 5613 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
5614 } 5614 }
5615 5615
5616 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 5616 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
5617 5617
5618 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 5618 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
5619 if (sh == NULL) { 5619 if (sh == NULL) {
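
The raid10.c and raid5.c changes are mechanical: bitmap_cond_end_sync() has grown a third argument and these non-clustered callers pass 0/false, leaving their behaviour unchanged. A rough stand-in for the widened calling convention, assuming the flag simply forces the sync window to be closed even when the helper's usual heuristic would skip it (the bitmap.c/bitmap.h hunks are outside this excerpt, so the body below is illustrative only):

/* Illustrative stand-in, not bitmap.c: 'force' overrides the normal
 * "don't bother yet" check, which is what the clustered raid1 caller
 * relies on; raid10/raid5 always pass false. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

static void cond_end_sync(sector_t sector, bool force)
{
	static sector_t last_closed;

	if (!force && sector < last_closed + 1024)
		return;				/* heuristic says: not yet */
	last_closed = sector;
	printf("sync window closed at %llu\n", (unsigned long long)sector);
}

int main(void)
{
	cond_end_sync(128, false);	/* skipped by the heuristic */
	cond_end_sync(128, true);	/* forced through */
	cond_end_sync(4096, false);	/* heuristic threshold reached */
	return 0;
}
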
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 2ae6131e69a5..867ee874fa80 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -302,6 +302,7 @@ struct mdp_superblock_1 {
302#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening 302#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
303 * is guided by bitmap. 303 * is guided by bitmap.
304 */ 304 */
305#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
305#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ 306#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
306 |MD_FEATURE_RECOVERY_OFFSET \ 307 |MD_FEATURE_RECOVERY_OFFSET \
307 |MD_FEATURE_RESHAPE_ACTIVE \ 308 |MD_FEATURE_RESHAPE_ACTIVE \
@@ -310,6 +311,7 @@ struct mdp_superblock_1 {
310 |MD_FEATURE_RESHAPE_BACKWARDS \ 311 |MD_FEATURE_RESHAPE_BACKWARDS \
311 |MD_FEATURE_NEW_OFFSET \ 312 |MD_FEATURE_NEW_OFFSET \
312 |MD_FEATURE_RECOVERY_BITMAP \ 313 |MD_FEATURE_RECOVERY_BITMAP \
314 |MD_FEATURE_CLUSTERED \
313 ) 315 )
314 316
315#endif 317#endif
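
Finally, MD_FEATURE_CLUSTERED claims a new superblock feature bit and is folded into MD_FEATURE_ALL, keeping it inside the set of bits treated as known. A hedged sketch of how a feature mask like this is typically used to screen a superblock's feature_map; the helper below is illustrative, deliberately uses only the two bit values visible in the hunk, and is not md.c's actual check:

/* Sketch of screening feature_map against a known-feature mask.
 * MD_FEATURE_KNOWN is a deliberately reduced stand-in for
 * MD_FEATURE_ALL. */
#include <stdio.h>
#include <stdint.h>

#define MD_FEATURE_RECOVERY_BITMAP	128
#define MD_FEATURE_CLUSTERED		256

#define MD_FEATURE_KNOWN	(MD_FEATURE_RECOVERY_BITMAP | \
				 MD_FEATURE_CLUSTERED)

static int has_unknown_features(uint32_t feature_map)
{
	return (feature_map & ~(uint32_t)MD_FEATURE_KNOWN) != 0;
}

int main(void)
{
	/* A clustered superblock passes once CLUSTERED is in the mask. */
	printf("%d\n", has_unknown_features(MD_FEATURE_CLUSTERED));		/* 0 */
	/* A bit outside the mask would be flagged as unsupported. */
	printf("%d\n", has_unknown_features(MD_FEATURE_CLUSTERED | 512));	/* 1 */
	return 0;
}
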