Diffstat (limited to 'drivers/md/md.c')
 drivers/md/md.c | 634
 1 file changed, 550 insertions(+), 84 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f19b874753a9..306268ec99ff 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -44,6 +44,7 @@
 #include <linux/suspend.h>
 #include <linux/poll.h>
 #include <linux/mutex.h>
+#include <linux/ctype.h>
 
 #include <linux/init.h>
 
@@ -72,6 +73,10 @@ static void autostart_arrays (int part);
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
+static void md_print_devices(void);
+
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
+
 /*
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -170,7 +175,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
 /* Alternate version that can be called from interrupts
  * when calling sysfs_notify isn't needed.
  */
-void md_new_event_inintr(mddev_t *mddev)
+static void md_new_event_inintr(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
@@ -732,6 +737,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	mdp_disk_t *desc;
 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+	__u64 ev1 = md_event(sb);
 
 	rdev->raid_disk = -1;
 	rdev->flags = 0;
@@ -748,7 +754,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
 		mddev->size = sb->size;
-		mddev->events = md_event(sb);
+		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
@@ -797,7 +803,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	} else if (mddev->pers == NULL) {
 		/* Insist on good event counter while assembling */
-		__u64 ev1 = md_event(sb);
 		++ev1;
 		if (ev1 < mddev->events)
 			return -EINVAL;
@@ -805,19 +810,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		/* if adding to array with a bitmap, then we can accept an
 		 * older device ... but not too old.
 		 */
-		__u64 ev1 = md_event(sb);
 		if (ev1 < mddev->bitmap->events_cleared)
 			return 0;
-	} else /* just a hot-add of a new device, leave raid_disk at -1 */
-		return 0;
+	} else {
+		if (ev1 < mddev->events)
+			/* just a hot-add of a new device, leave raid_disk at -1 */
+			return 0;
+	}
 
 	if (mddev->level != LEVEL_MULTIPATH) {
 		desc = sb->disks + rdev->desc_nr;
 
 		if (desc->state & (1<<MD_DISK_FAULTY))
 			set_bit(Faulty, &rdev->flags);
-		else if (desc->state & (1<<MD_DISK_SYNC) &&
-			 desc->raid_disk < mddev->raid_disks) {
+		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
+			    desc->raid_disk < mddev->raid_disks */) {
 			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
 		}
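
Note: the rewritten hot-add branch above only lets a device keep a role when its event count has kept up with the array; a stale device now stays at raid_disk == -1. A toy model of the three acceptance paths (plain C; the standalone function and its names are invented for illustration, the real checks live inside super_90_validate()):

	#include <stdio.h>

	typedef unsigned long long u64;
	enum verdict { REJECT = -1, HOT_ADD = 0, TAKE_ROLE = 1 };

	/* Toy model of the event-count checks in super_90_validate(). */
	static enum verdict check_events(u64 ev1, u64 array_events,
					 int assembling, int has_bitmap,
					 u64 events_cleared)
	{
		if (assembling)		/* insist on a good event counter */
			return (ev1 + 1 < array_events) ? REJECT : TAKE_ROLE;
		if (has_bitmap)		/* bitmap can catch an older device up */
			return (ev1 < events_cleared) ? HOT_ADD : TAKE_ROLE;
		return (ev1 < array_events) ? HOT_ADD : TAKE_ROLE; /* the new check */
	}

	int main(void)
	{
		printf("%d\n", check_events(40, 42, 0, 0, 0)); /* stale: plain hot-add */
		printf("%d\n", check_events(42, 42, 0, 0, 0)); /* current: takes its role */
		return 0;
	}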
@@ -1100,6 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+	__u64 ev1 = le64_to_cpu(sb->events);
 
 	rdev->raid_disk = -1;
 	rdev->flags = 0;
@@ -1115,7 +1123,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->size = le64_to_cpu(sb->size)/2;
-		mddev->events = le64_to_cpu(sb->events);
+		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = 1024 >> 9;
 
@@ -1149,7 +1157,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	} else if (mddev->pers == NULL) {
 		/* Insist of good event counter while assembling */
-		__u64 ev1 = le64_to_cpu(sb->events);
 		++ev1;
 		if (ev1 < mddev->events)
 			return -EINVAL;
@@ -1157,12 +1164,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		/* If adding to array with a bitmap, then we can accept an
 		 * older device, but not too old.
 		 */
-		__u64 ev1 = le64_to_cpu(sb->events);
 		if (ev1 < mddev->bitmap->events_cleared)
 			return 0;
-	} else /* just a hot-add of a new device, leave raid_disk at -1 */
-		return 0;
-
+	} else {
+		if (ev1 < mddev->events)
+			/* just a hot-add of a new device, leave raid_disk at -1 */
+			return 0;
+	}
 	if (mddev->level != LEVEL_MULTIPATH) {
 		int role;
 		rdev->desc_nr = le32_to_cpu(sb->dev_number);
@@ -1174,7 +1182,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			set_bit(Faulty, &rdev->flags);
 			break;
 		default:
-			set_bit(In_sync, &rdev->flags);
+			if ((le32_to_cpu(sb->feature_map) &
+			     MD_FEATURE_RECOVERY_OFFSET))
+				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+			else
+				set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = role;
 			break;
 		}
@@ -1198,6 +1210,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->feature_map = 0;
 	sb->pad0 = 0;
+	sb->recovery_offset = cpu_to_le64(0);
 	memset(sb->pad1, 0, sizeof(sb->pad1));
 	memset(sb->pad2, 0, sizeof(sb->pad2));
 	memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1218,6 +1231,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
 	}
+
+	if (rdev->raid_disk >= 0 &&
+	    !test_bit(In_sync, &rdev->flags) &&
+	    rdev->recovery_offset > 0) {
+		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1242,11 +1263,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
 		else if (test_bit(In_sync, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+		else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
 		else
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
 	}
 
-	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
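
Note: for reference, the dev_roles[] encoding this hunk extends: 0xfffe marks a failed device, 0xffff a spare, and anything else is the slot number. The new case records the real slot for a partially-recovered member instead of demoting it to spare. A small sketch (toy struct, not the kernel's):

	#include <stdint.h>
	#include <stdio.h>

	struct toy_rdev {
		int faulty, in_sync, raid_disk;
		uint64_t recovery_offset;
	};

	/* Mirrors the role selection in super_1_sync() above. */
	static uint16_t dev_role(const struct toy_rdev *r)
	{
		if (r->faulty)
			return 0xfffe;			/* failed */
		if (r->in_sync)
			return (uint16_t)r->raid_disk;	/* active member */
		if (r->raid_disk >= 0 && r->recovery_offset > 0)
			return (uint16_t)r->raid_disk;	/* recovering member (new) */
		return 0xffff;				/* spare */
	}

	int main(void)
	{
		struct toy_rdev recovering = { 0, 0, 2, 12345 };
		printf("role=0x%04x\n", dev_role(&recovering)); /* 0x0002, not 0xffff */
		return 0;
	}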
@@ -1507,7 +1529,7 @@ static void print_rdev(mdk_rdev_t *rdev)
 		printk(KERN_INFO "md: no rdev superblock!\n");
 }
 
-void md_print_devices(void)
+static void md_print_devices(void)
 {
 	struct list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev;
@@ -1536,15 +1558,30 @@ void md_print_devices(void)
 }
 
 
-static void sync_sbs(mddev_t * mddev)
+static void sync_sbs(mddev_t * mddev, int nospares)
 {
+	/* Update each superblock (in-memory image), but
+	 * if we are allowed to, skip spares which already
+	 * have the right event counter, or have one earlier
+	 * (which would mean they aren't being marked as dirty
+	 * with the rest of the array)
+	 */
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
-		super_types[mddev->major_version].
-			sync_super(mddev, rdev);
-		rdev->sb_loaded = 1;
+		if (rdev->sb_events == mddev->events ||
+		    (nospares &&
+		     rdev->raid_disk < 0 &&
+		     (rdev->sb_events&1)==0 &&
+		     rdev->sb_events+1 == mddev->events)) {
+			/* Don't update this superblock */
+			rdev->sb_loaded = 2;
+		} else {
+			super_types[mddev->major_version].
+				sync_super(mddev, rdev);
+			rdev->sb_loaded = 1;
+		}
 	}
 }
 
@@ -1554,12 +1591,42 @@ void md_update_sb(mddev_t * mddev)
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 	int sync_req;
+	int nospares = 0;
 
 repeat:
 	spin_lock_irq(&mddev->write_lock);
 	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
-	mddev->events ++;
+	if (mddev->sb_dirty == 3)
+		/* just a clean<->dirty transition, possibly leave spares alone,
+		 * though if events isn't the right even/odd, we will have to do
+		 * spares after all
+		 */
+		nospares = 1;
+
+	/* If this is just a dirty<->clean transition, and the array is clean
+	 * and 'events' is odd, we can roll back to the previous clean state */
+	if (mddev->sb_dirty == 3
+	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+	    && (mddev->events & 1))
+		mddev->events--;
+	else {
+		/* otherwise we have to go forward and ... */
+		mddev->events ++;
+		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
+			/* .. if the array isn't clean, insist on an odd 'events' */
+			if ((mddev->events&1)==0) {
+				mddev->events++;
+				nospares = 0;
+			}
+		} else {
+			/* otherwise insist on an even 'events' (for clean states) */
+			if ((mddev->events&1)) {
+				mddev->events++;
+				nospares = 0;
+			}
+		}
+	}
 
 	if (!mddev->events) {
 		/*
@@ -1571,7 +1638,7 @@ repeat:
 		mddev->events --;
 	}
 	mddev->sb_dirty = 2;
-	sync_sbs(mddev);
+	sync_sbs(mddev, nospares);
 
 	/*
 	 * do not write anything to disk if using
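
Note: the even/odd convention above is easiest to see in miniature: even event counts are reserved for clean states, odd for dirty, and a dirty-to-clean bounce that would only undo the previous bump is rolled back so spare superblocks written at the older count still match. A simplified model (plain C; it ignores the recovery_cp condition the real code also checks):

	#include <stdio.h>

	/* Simplified model of the event-count policy in md_update_sb(). */
	static unsigned long long next_events(unsigned long long events,
					      int clean_dirty_transition,
					      int now_clean)
	{
		if (clean_dirty_transition && now_clean && (events & 1))
			return events - 1; /* roll back; unwritten spares still match */
		events++;
		if (now_clean ? (events & 1) : !(events & 1))
			events++;	   /* force even for clean, odd for dirty */
		return events;
	}

	int main(void)
	{
		unsigned long long e = 100;	/* clean, even */
		e = next_events(e, 1, 0);	/* first write: dirty, 101 */
		e = next_events(e, 1, 1);	/* idle again: rolls back to 100 */
		printf("events=%llu\n", e);
		return 0;
	}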
@@ -1593,6 +1660,8 @@ repeat:
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		char b[BDEVNAME_SIZE];
 		dprintk(KERN_INFO "md: ");
+		if (rdev->sb_loaded != 1)
+			continue; /* no noise on spare devices */
 		if (test_bit(Faulty, &rdev->flags))
 			dprintk("(skipping faulty ");
 
@@ -1604,6 +1673,7 @@ repeat:
 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
 				bdevname(rdev->bdev,b),
 				(unsigned long long)rdev->sb_offset);
+			rdev->sb_events = mddev->events;
 
 		} else
 			dprintk(")\n");
@@ -1667,6 +1737,10 @@ state_show(mdk_rdev_t *rdev, char *page)
 		len += sprintf(page+len, "%sin_sync",sep);
 		sep = ",";
 	}
+	if (test_bit(WriteMostly, &rdev->flags)) {
+		len += sprintf(page+len, "%swrite_mostly",sep);
+		sep = ",";
+	}
 	if (!test_bit(Faulty, &rdev->flags) &&
 	    !test_bit(In_sync, &rdev->flags)) {
 		len += sprintf(page+len, "%sspare", sep);
@@ -1675,8 +1749,40 @@ state_show(mdk_rdev_t *rdev, char *page)
 	return len+sprintf(page+len, "\n");
 }
 
+static ssize_t
+state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	/* can write
+	 *  faulty  - simulates an error
+	 *  remove  - disconnects the device
+	 *  writemostly - sets write_mostly
+	 *  -writemostly - clears write_mostly
+	 */
+	int err = -EINVAL;
+	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+		md_error(rdev->mddev, rdev);
+		err = 0;
+	} else if (cmd_match(buf, "remove")) {
+		if (rdev->raid_disk >= 0)
+			err = -EBUSY;
+		else {
+			mddev_t *mddev = rdev->mddev;
+			kick_rdev_from_array(rdev);
+			md_update_sb(mddev);
+			md_new_event(mddev);
+			err = 0;
+		}
+	} else if (cmd_match(buf, "writemostly")) {
+		set_bit(WriteMostly, &rdev->flags);
+		err = 0;
+	} else if (cmd_match(buf, "-writemostly")) {
+		clear_bit(WriteMostly, &rdev->flags);
+		err = 0;
+	}
+	return err ? err : len;
+}
 static struct rdev_sysfs_entry
-rdev_state = __ATTR_RO(state);
+rdev_state = __ATTR(state, 0644, state_show, state_store);
 
 static ssize_t
 super_show(mdk_rdev_t *rdev, char *page)
@@ -1873,6 +1979,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
 	rdev->desc_nr = -1;
 	rdev->flags = 0;
 	rdev->data_offset = 0;
+	rdev->sb_events = 0;
 	atomic_set(&rdev->nr_pending, 0);
 	atomic_set(&rdev->read_errors, 0);
 	atomic_set(&rdev->corrected_errors, 0);
@@ -1978,6 +2085,54 @@ static void analyze_sbs(mddev_t * mddev)
 }
 
 static ssize_t
+safe_delay_show(mddev_t *mddev, char *page)
+{
+	int msec = (mddev->safemode_delay*1000)/HZ;
+	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+}
+static ssize_t
+safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
+{
+	int scale=1;
+	int dot=0;
+	int i;
+	unsigned long msec;
+	char buf[30];
+	char *e;
+	/* remove a period, and count digits after it */
+	if (len >= sizeof(buf))
+		return -EINVAL;
+	strlcpy(buf, cbuf, len);
+	buf[len] = 0;
+	for (i=0; i<len; i++) {
+		if (dot) {
+			if (isdigit(buf[i])) {
+				buf[i-1] = buf[i];
+				scale *= 10;
+			}
+			buf[i] = 0;
+		} else if (buf[i] == '.') {
+			dot=1;
+			buf[i] = 0;
+		}
+	}
+	msec = simple_strtoul(buf, &e, 10);
+	if (e == buf || (*e && *e != '\n'))
+		return -EINVAL;
+	msec = (msec * 1000) / scale;
+	if (msec == 0)
+		mddev->safemode_delay = 0;
+	else {
+		mddev->safemode_delay = (msec*HZ)/1000;
+		if (mddev->safemode_delay == 0)
+			mddev->safemode_delay = 1;
+	}
+	return len;
+}
+static struct md_sysfs_entry md_safe_delay =
+__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
+
+static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
 	struct mdk_personality *p = mddev->pers;
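
Note: safe_mode_delay accepts fractional seconds such as "0.200". A userspace re-implementation of the same parse (standard C only; the HZ value is an assumption for the example, and the kernel version shifts digits in place rather than copying them out, with the same result):

	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define HZ 250	/* assumed tick rate for the example */

	/* Parse "seconds[.millis]" into jiffies, like safe_delay_store(). */
	static long parse_safe_delay(const char *in)
	{
		char buf[30];
		int scale = 1, dot = 0;
		size_t i, o = 0, n = strlen(in);
		long msec, jiffies;

		if (n >= sizeof(buf))
			return -1;
		for (i = 0; i < n; i++) {	/* drop the '.', count digits after it */
			if (in[i] == '.')
				dot = 1;
			else if (dot && isdigit((unsigned char)in[i])) {
				buf[o++] = in[i];
				scale *= 10;
			} else if (!dot)
				buf[o++] = in[i];
		}
		buf[o] = 0;
		msec = strtol(buf, NULL, 10) * 1000 / scale;
		if (msec == 0)
			return 0;		/* zero disables the safemode timer */
		jiffies = msec * HZ / 1000;
		return jiffies ? jiffies : 1;	/* round up to at least one tick */
	}

	int main(void)
	{
		printf("%ld jiffies\n", parse_safe_delay("0.200")); /* 50 at HZ=250 */
		return 0;
	}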
@@ -2012,6 +2167,32 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_level =
 __ATTR(level, 0644, level_show, level_store);
 
+
+static ssize_t
+layout_show(mddev_t *mddev, char *page)
+{
+	/* just a number, not meaningful for all levels */
+	return sprintf(page, "%d\n", mddev->layout);
+}
+
+static ssize_t
+layout_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+	if (mddev->pers)
+		return -EBUSY;
+
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	mddev->layout = n;
+	return len;
+}
+static struct md_sysfs_entry md_layout =
+__ATTR(layout, 0655, layout_show, layout_store);
+
+
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
 {
@@ -2067,6 +2248,200 @@ static struct md_sysfs_entry md_chunk_size =
 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
 static ssize_t
+resync_start_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+}
+
+static ssize_t
+resync_start_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set resync_start if array is not yet active */
+	char *e;
+	unsigned long long n = simple_strtoull(buf, &e, 10);
+
+	if (mddev->pers)
+		return -EBUSY;
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	mddev->recovery_cp = n;
+	return len;
+}
+static struct md_sysfs_entry md_resync_start =
+__ATTR(resync_start, 0644, resync_start_show, resync_start_store);
+
+/*
+ * The array state can be:
+ *
+ * clear
+ *     No devices, no size, no level
+ *     Equivalent to STOP_ARRAY ioctl
+ * inactive
+ *     May have some settings, but array is not active
+ *        all IO results in error
+ *     When written, doesn't tear down array, but just stops it
+ * suspended (not supported yet)
+ *     All IO requests will block. The array can be reconfigured.
+ *     Writing this, if accepted, will block until array is quiescent
+ * readonly
+ *     no resync can happen.  no superblocks get written.
+ *     write requests fail
+ * read-auto
+ *     like readonly, but behaves like 'clean' on a write request.
+ *
+ * clean - no pending writes, but otherwise active.
+ *     When written to inactive array, starts without resync
+ *     If a write request arrives then
+ *       if metadata is known, mark 'dirty' and switch to 'active'.
+ *       if not known, block and switch to write-pending
+ *     If written to an active array that has pending writes, then fails.
+ * active
+ *     fully active: IO and resync can be happening.
+ *     When written to inactive array, starts with resync
+ *
+ * write-pending
+ *     clean, but writes are blocked waiting for 'active' to be written.
+ *
+ * active-idle
+ *     like active, but no writes have been seen for a while (100msec).
+ *
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
+		   write_pending, active_idle, bad_word};
+static char *array_states[] = {
+	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
+	"write-pending", "active-idle", NULL };
+
+static int match_word(const char *word, char **list)
+{
+	int n;
+	for (n=0; list[n]; n++)
+		if (cmd_match(word, list[n]))
+			break;
+	return n;
+}
+
+static ssize_t
+array_state_show(mddev_t *mddev, char *page)
+{
+	enum array_state st = inactive;
+
+	if (mddev->pers)
+		switch(mddev->ro) {
+		case 1:
+			st = readonly;
+			break;
+		case 2:
+			st = read_auto;
+			break;
+		case 0:
+			if (mddev->in_sync)
+				st = clean;
+			else if (mddev->safemode)
+				st = active_idle;
+			else
+				st = active;
+		}
+	else {
+		if (list_empty(&mddev->disks) &&
+		    mddev->raid_disks == 0 &&
+		    mddev->size == 0)
+			st = clear;
+		else
+			st = inactive;
+	}
+	return sprintf(page, "%s\n", array_states[st]);
+}
+
+static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_run(mddev_t * mddev);
+static int restart_array(mddev_t *mddev);
+
+static ssize_t
+array_state_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int err = -EINVAL;
+	enum array_state st = match_word(buf, array_states);
+	switch(st) {
+	case bad_word:
+		break;
+	case clear:
+		/* stopping an active array */
+		if (mddev->pers) {
+			if (atomic_read(&mddev->active) > 1)
+				return -EBUSY;
+			err = do_md_stop(mddev, 0);
+		}
+		break;
+	case inactive:
+		/* stopping an active array */
+		if (mddev->pers) {
+			if (atomic_read(&mddev->active) > 1)
+				return -EBUSY;
+			err = do_md_stop(mddev, 2);
+		}
+		break;
+	case suspended:
+		break; /* not supported yet */
+	case readonly:
+		if (mddev->pers)
+			err = do_md_stop(mddev, 1);
+		else {
+			mddev->ro = 1;
+			err = do_md_run(mddev);
+		}
+		break;
+	case read_auto:
+		/* stopping an active array */
+		if (mddev->pers) {
+			err = do_md_stop(mddev, 1);
+			if (err == 0)
+				mddev->ro = 2; /* FIXME mark devices writable */
+		} else {
+			mddev->ro = 2;
+			err = do_md_run(mddev);
+		}
+		break;
+	case clean:
+		if (mddev->pers) {
+			restart_array(mddev);
+			spin_lock_irq(&mddev->write_lock);
+			if (atomic_read(&mddev->writes_pending) == 0) {
+				mddev->in_sync = 1;
+				mddev->sb_dirty = 1;
+			}
+			spin_unlock_irq(&mddev->write_lock);
+		} else {
+			mddev->ro = 0;
+			mddev->recovery_cp = MaxSector;
+			err = do_md_run(mddev);
+		}
+		break;
+	case active:
+		if (mddev->pers) {
+			restart_array(mddev);
+			mddev->sb_dirty = 0;
+			wake_up(&mddev->sb_wait);
+			err = 0;
+		} else {
+			mddev->ro = 0;
+			err = do_md_run(mddev);
+		}
+		break;
+	case write_pending:
+	case active_idle:
+		/* these cannot be set */
+		break;
+	}
+	if (err)
+		return err;
+	else
+		return len;
+}
+static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
+
+static ssize_t
 null_show(mddev_t *mddev, char *page)
 {
 	return -EINVAL;
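
Note: these attributes are plain strings under sysfs, so the whole state machine above is driven by a write. A minimal userspace sketch (the array name md0 is an assumption for the example; md attributes live under /sys/block/<dev>/md/):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Assumed array name; valid words mirror array_states[] above. */
		const char *path = "/sys/block/md0/md/array_state";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "clean", strlen("clean")) < 0)
			perror("write");	/* e.g. fails for "write-pending" */
		close(fd);
		return 0;
	}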
@@ -2428,11 +2803,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
 
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
+	&md_layout.attr,
 	&md_raid_disks.attr,
 	&md_chunk_size.attr,
 	&md_size.attr,
+	&md_resync_start.attr,
 	&md_metadata.attr,
 	&md_new_device.attr,
+	&md_safe_delay.attr,
+	&md_array_state.attr,
 	NULL,
 };
 
@@ -2553,8 +2932,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	return NULL;
 }
 
-void md_wakeup_thread(mdk_thread_t *thread);
-
 static void md_safemode_timeout(unsigned long data)
 {
 	mddev_t *mddev = (mddev_t *) data;
@@ -2708,7 +3085,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->safemode = 0;
 	mddev->safemode_timer.function = md_safemode_timeout;
 	mddev->safemode_timer.data = (unsigned long) mddev;
-	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
 	mddev->in_sync = 1;
 
 	ITERATE_RDEV(mddev,rdev,tmp)
@@ -2736,6 +3113,36 @@ static int do_md_run(mddev_t * mddev)
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
 
+	/* If there is a partially-recovered drive we need to
+	 * start recovery here.  If we leave it to md_check_recovery,
+	 * it will remove the drives and not do the right thing
+	 */
+	if (mddev->degraded) {
+		struct list_head *rtmp;
+		int spares = 0;
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    !test_bit(Faulty, &rdev->flags))
+				/* complete an interrupted recovery */
+				spares++;
+		if (spares && mddev->pers->sync_request) {
+			mddev->recovery = 0;
+			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			mddev->sync_thread = md_register_thread(md_do_sync,
+								mddev,
+								"%s_resync");
+			if (!mddev->sync_thread) {
+				printk(KERN_ERR "%s: could not start resync"
+				       " thread...\n",
+				       mdname(mddev));
+				/* leave the spares where they are, it shouldn't hurt */
+				mddev->recovery = 0;
+			} else
+				md_wakeup_thread(mddev->sync_thread);
+		}
+	}
+
 	mddev->changed = 1;
 	md_new_event(mddev);
 	return 0;
@@ -2769,18 +3176,47 @@ static int restart_array(mddev_t *mddev)
 		 */
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
+		md_wakeup_thread(mddev->sync_thread);
 		err = 0;
-	} else {
-		printk(KERN_ERR "md: %s has no personality assigned.\n",
-			mdname(mddev));
+	} else
 		err = -EINVAL;
-	}
 
 out:
 	return err;
 }
 
-static int do_md_stop(mddev_t * mddev, int ro)
+/* similar to deny_write_access, but accounts for our holding a reference
+ * to the file ourselves */
+static int deny_bitmap_write_access(struct file * file)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	spin_lock(&inode->i_lock);
+	if (atomic_read(&inode->i_writecount) > 1) {
+		spin_unlock(&inode->i_lock);
+		return -ETXTBSY;
+	}
+	atomic_set(&inode->i_writecount, -1);
+	spin_unlock(&inode->i_lock);
+
+	return 0;
+}
+
+static void restore_bitmap_write_access(struct file *file)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	spin_lock(&inode->i_lock);
+	atomic_set(&inode->i_writecount, 1);
+	spin_unlock(&inode->i_lock);
+}
+
+/* mode:
+ *   0 - completely stop and dis-assemble array
+ *   1 - switch to readonly
+ *   2 - stop but do not disassemble array
+ */
+static int do_md_stop(mddev_t * mddev, int mode)
 {
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
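
Note: deny_bitmap_write_access() leans on the VFS i_writecount convention: a positive count is the number of writers, a negative one means writing is denied. Since md itself already holds one reference to the bitmap file, a count of exactly 1 is safe to flip. A toy model of the pair (a plain int instead of an atomic inside an inode):

	#include <stdio.h>

	/* Toy model of the i_writecount handshake used above. */
	static int deny_write(int *writecount)
	{
		if (*writecount > 1)	/* somebody besides us has it open for write */
			return -1;	/* the kernel returns -ETXTBSY */
		*writecount = -1;	/* negative: further writers are refused */
		return 0;
	}

	static void restore_write(int *writecount)
	{
		*writecount = 1;	/* back to "only our own reference" */
	}

	int main(void)
	{
		int wc = 1;		/* md's own reference to the bitmap file */
		printf("deny -> %d, wc=%d\n", deny_write(&wc), wc);
		restore_write(&wc);
		printf("restored wc=%d\n", wc);
		return 0;
	}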
@@ -2792,6 +3228,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 	}
 
 	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		md_unregister_thread(mddev->sync_thread);
 		mddev->sync_thread = NULL;
@@ -2801,12 +3238,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
 	invalidate_partition(disk, 0);
 
-	if (ro) {
+	switch(mode) {
+	case 1: /* readonly */
 		err  = -ENXIO;
 		if (mddev->ro==1)
 			goto out;
 		mddev->ro = 1;
-	} else {
+		break;
+	case 0: /* disassemble */
+	case 2: /* stop */
 		bitmap_flush(mddev);
 		md_super_wait(mddev);
 		if (mddev->ro)
@@ -2821,19 +3261,20 @@ static int do_md_stop(mddev_t * mddev, int ro)
 		if (mddev->ro)
 			mddev->ro = 0;
 	}
-	if (!mddev->in_sync) {
+	if (!mddev->in_sync || mddev->sb_dirty) {
 		/* mark array as shutdown cleanly */
 		mddev->in_sync = 1;
 		md_update_sb(mddev);
 	}
-	if (ro)
+	if (mode == 1)
 		set_disk_ro(disk, 1);
+	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
 
 	/*
 	 * Free resources if final stop
 	 */
-	if (!ro) {
+	if (mode == 0) {
 		mdk_rdev_t *rdev;
 		struct list_head *tmp;
 		struct gendisk *disk;
@@ -2841,7 +3282,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
 		bitmap_destroy(mddev);
 		if (mddev->bitmap_file) {
-			atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
+			restore_bitmap_write_access(mddev->bitmap_file);
 			fput(mddev->bitmap_file);
 			mddev->bitmap_file = NULL;
 		}
@@ -2857,11 +3298,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
 		export_array(mddev);
 
 		mddev->array_size = 0;
+		mddev->size = 0;
+		mddev->raid_disks = 0;
+		mddev->recovery_cp = 0;
+
 		disk = mddev->gendisk;
 		if (disk)
 			set_capacity(disk, 0);
 		mddev->changed = 1;
-	} else
+	} else if (mddev->pers)
 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
 			mdname(mddev));
 	err = 0;
@@ -3264,6 +3709,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
+		if (!err && !mddev->pers->hot_remove_disk) {
+			/* If there is hot_add_disk but no hot_remove_disk
+			 * then added disks for geometry changes,
+			 * and should be added immediately.
+			 */
+			super_types[mddev->major_version].
+				validate_super(mddev, rdev);
+			err = mddev->pers->hot_add_disk(mddev, rdev);
+			if (err)
+				unbind_rdev_from_array(rdev);
+		}
 		if (err)
 			export_rdev(rdev);
 
@@ -3434,23 +3890,6 @@ abort_export:
 	return err;
 }
 
-/* similar to deny_write_access, but accounts for our holding a reference
- * to the file ourselves */
-static int deny_bitmap_write_access(struct file * file)
-{
-	struct inode *inode = file->f_mapping->host;
-
-	spin_lock(&inode->i_lock);
-	if (atomic_read(&inode->i_writecount) > 1) {
-		spin_unlock(&inode->i_lock);
-		return -ETXTBSY;
-	}
-	atomic_set(&inode->i_writecount, -1);
-	spin_unlock(&inode->i_lock);
-
-	return 0;
-}
-
 static int set_bitmap_file(mddev_t *mddev, int fd)
 {
 	int err;
@@ -3491,12 +3930,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
 		mddev->pers->quiesce(mddev, 1);
 		if (fd >= 0)
 			err = bitmap_create(mddev);
-		if (fd < 0 || err)
+		if (fd < 0 || err) {
 			bitmap_destroy(mddev);
+			fd = -1; /* make sure to put the file */
+		}
 		mddev->pers->quiesce(mddev, 0);
-	} else if (fd < 0) {
-		if (mddev->bitmap_file)
+	}
+	if (fd < 0) {
+		if (mddev->bitmap_file) {
+			restore_bitmap_write_access(mddev->bitmap_file);
 			fput(mddev->bitmap_file);
+		}
 		mddev->bitmap_file = NULL;
 	}
 
@@ -3977,11 +4421,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			goto done_unlock;
 
 	default:
-		if (_IOC_TYPE(cmd) == MD_MAJOR)
-			printk(KERN_WARNING "md: %s(pid %d) used"
-				" obsolete MD ioctl, upgrade your"
-				" software to use new ictls.\n",
-				current->comm, current->pid);
 		err = -EINVAL;
 		goto abort_unlock;
 	}
@@ -4586,7 +5025,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync) {
 		mddev->in_sync = 0;
-		mddev->sb_dirty = 1;
+		mddev->sb_dirty = 3;
 		md_wakeup_thread(mddev->thread);
 	}
 	spin_unlock_irq(&mddev->write_lock);
@@ -4599,7 +5038,7 @@ void md_write_end(mddev_t *mddev)
 	if (atomic_dec_and_test(&mddev->writes_pending)) {
 		if (mddev->safemode == 2)
 			md_wakeup_thread(mddev->thread);
-		else
+		else if (mddev->safemode_delay)
 			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
 	}
 }
@@ -4620,10 +5059,14 @@ void md_do_sync(mddev_t *mddev)
 	struct list_head *tmp;
 	sector_t last_check;
 	int skipped = 0;
+	struct list_head *rtmp;
+	mdk_rdev_t *rdev;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
 		return;
+	if (mddev->ro) /* never try to sync a read-only array */
+		return;
 
 	/* we overload curr_resync somewhat here.
 	 * 0 == not engaged in resync at all
@@ -4682,17 +5125,30 @@ void md_do_sync(mddev_t *mddev)
 		}
 	} while (mddev->curr_resync < 2);
 
+	j = 0;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* resync follows the size requested by the personality,
 		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
+		/* we don't use the checkpoint if there's a bitmap */
+		if (!mddev->bitmap &&
+		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->recovery_cp;
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->size << 1;
-	else
+	else {
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
+		j = MaxSector;
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Faulty, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    rdev->recovery_offset < j)
+				j = rdev->recovery_offset;
+	}
 
 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
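
Note: with per-device checkpoints, recovery now resumes from the smallest recovery_offset among partially-recovered members instead of from sector 0. In isolation the start-point selection is just a guarded minimum (sketch; MAX_SECTOR stands in for the kernel's MaxSector):

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_SECTOR UINT64_MAX	/* stand-in for MaxSector */

	struct toy_rdev { int raid_disk, faulty, in_sync; uint64_t recovery_offset; };

	/* Mirrors the recovery branch of md_do_sync() above. */
	static uint64_t recovery_start(const struct toy_rdev *rdevs, int n)
	{
		uint64_t j = MAX_SECTOR;
		int i;

		for (i = 0; i < n; i++)
			if (rdevs[i].raid_disk >= 0 && !rdevs[i].faulty &&
			    !rdevs[i].in_sync && rdevs[i].recovery_offset < j)
				j = rdevs[i].recovery_offset;
		return j;	/* MAX_SECTOR when nothing needs recovery */
	}

	int main(void)
	{
		struct toy_rdev devs[] = {
			{ 0, 0, 1, 0 },		/* healthy member */
			{ 1, 0, 0, 4096 },	/* recovering, checkpointed at 4096 */
		};
		printf("resume at %llu\n",
		       (unsigned long long)recovery_start(devs, 2));
		return 0;
	}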
@@ -4702,12 +5158,7 @@ void md_do_sync(mddev_t *mddev)
 	       speed_max(mddev));
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
-	/* we don't use the checkpoint if there's a bitmap */
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
-	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
-		j = mddev->recovery_cp;
-	else
-		j = 0;
+
 	io_sectors = 0;
 	for (m = 0; m < SYNC_MARKS; m++) {
 		mark[m] = jiffies;
@@ -4828,15 +5279,28 @@ void md_do_sync(mddev_t *mddev)
 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-	    mddev->curr_resync > 2 &&
-	    mddev->curr_resync >= mddev->recovery_cp) {
-		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-			printk(KERN_INFO
-				"md: checkpointing recovery of %s.\n",
-				mdname(mddev));
-			mddev->recovery_cp = mddev->curr_resync;
-		} else
-			mddev->recovery_cp = MaxSector;
+	    mddev->curr_resync > 2) {
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+				if (mddev->curr_resync >= mddev->recovery_cp) {
+					printk(KERN_INFO
+					       "md: checkpointing recovery of %s.\n",
+					       mdname(mddev));
+					mddev->recovery_cp = mddev->curr_resync;
+				}
+			} else
+				mddev->recovery_cp = MaxSector;
+		} else {
+			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+				mddev->curr_resync = MaxSector;
+			ITERATE_RDEV(mddev,rdev,rtmp)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Faulty, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < mddev->curr_resync)
+					rdev->recovery_offset = mddev->curr_resync;
+			mddev->sb_dirty = 1;
+		}
 	}
 
  skip:
@@ -4908,7 +5372,7 @@ void md_check_recovery(mddev_t *mddev)
 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
 			mddev->in_sync = 1;
-			mddev->sb_dirty = 1;
+			mddev->sb_dirty = 3;
 		}
 		if (mddev->safemode == 1)
 			mddev->safemode = 0;
@@ -4957,6 +5421,8 @@ void md_check_recovery(mddev_t *mddev)
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
+		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+			goto unlock;
 		/* no recovery is running.
 		 * remove any failed drives, then
 		 * add spares if possible.
@@ -4979,6 +5445,7 @@ void md_check_recovery(mddev_t *mddev)
 			ITERATE_RDEV(mddev,rdev,rtmp)
 				if (rdev->raid_disk < 0
 				    && !test_bit(Faulty, &rdev->flags)) {
+					rdev->recovery_offset = 0;
 					if (mddev->pers->hot_add_disk(mddev,rdev)) {
 						char nm[20];
 						sprintf(nm, "rd%d", rdev->raid_disk);
@@ -5216,7 +5683,6 @@ EXPORT_SYMBOL(md_write_end);
 EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
-EXPORT_SYMBOL(md_print_devices);
 EXPORT_SYMBOL(md_check_recovery);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md");