Diffstat (limited to 'drivers/md/md.c')
 drivers/md/md.c | 893 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 711 insertions(+), 182 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8175a2a222da..1b76fb29fb70 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -42,6 +42,7 @@
 #include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/suspend.h>
+#include <linux/poll.h>
 
 #include <linux/init.h>
 
@@ -67,7 +68,7 @@
 static void autostart_arrays (int part);
 #endif
 
-static mdk_personality_t *pers[MAX_PERSONALITY];
+static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
 /*
@@ -80,10 +81,22 @@ static DEFINE_SPINLOCK(pers_lock);
  * idle IO detection.
  *
  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ * or /sys/block/mdX/md/sync_speed_{min,max}
  */
 
 static int sysctl_speed_limit_min = 1000;
 static int sysctl_speed_limit_max = 200000;
+static inline int speed_min(mddev_t *mddev)
+{
+	return mddev->sync_speed_min ?
+		mddev->sync_speed_min : sysctl_speed_limit_min;
+}
+
+static inline int speed_max(mddev_t *mddev)
+{
+	return mddev->sync_speed_max ?
+		mddev->sync_speed_max : sysctl_speed_limit_max;
+}
 
 static struct ctl_table_header *raid_table_header;
 
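
A per-array value of 0 in sync_speed_min/max means "inherit the system-wide
sysctl", so one array's rebuild can be tuned without touching
/proc/sys/dev/raid. A minimal userspace sketch of driving the new files
(the md0 path is an example):

    /* Sketch: set a per-array resync floor, then revert to the
     * system-wide default. Assumes an array exists at md0. */
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
        int fd = open(path, O_WRONLY);
        ssize_t n;
        if (fd < 0)
            return -1;
        n = write(fd, val, strlen(val));
        close(fd);
        return n < 0 ? -1 : 0;
    }

    int main(void)
    {
        /* 50000 KB/sec minimum for md0 only */
        write_str("/sys/block/md0/md/sync_speed_min", "50000");
        /* writing "system" stores 0, falling back to speed_limit_min */
        write_str("/sys/block/md0/md/sync_speed_min", "system");
        return 0;
    }
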
@@ -134,6 +147,24 @@ static struct block_device_operations md_fops;
 static int start_readonly;
 
 /*
+ * We have a system wide 'event count' that is incremented
+ * on any 'interesting' event, and readers of /proc/mdstat
+ * can use 'poll' or 'select' to find out when the event
+ * count increases.
+ *
+ * Events are:
+ *  start array, stop array, error, add device, remove device,
+ *  start build, activate spare
+ */
+static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
+static atomic_t md_event_count;
+static void md_new_event(mddev_t *mddev)
+{
+	atomic_inc(&md_event_count);
+	wake_up(&md_event_waiters);
+}
+
+/*
  * Enables to iterate over all existing md arrays
  * all_mddevs_lock protects this list.
  */
@@ -209,12 +240,10 @@ static mddev_t * mddev_find(dev_t unit)
 	}
 	spin_unlock(&all_mddevs_lock);
 
-	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
 		return NULL;
 
-	memset(new, 0, sizeof(*new));
-
 	new->unit = unit;
 	if (MAJOR(unit) == MD_MAJOR)
 		new->md_minor = MINOR(unit);
@@ -262,7 +291,7 @@ static inline void mddev_unlock(mddev_t * mddev)
 		md_wakeup_thread(mddev->thread);
 }
 
-mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 {
 	mdk_rdev_t * rdev;
 	struct list_head *tmp;
@@ -286,6 +315,18 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 	return NULL;
 }
 
+static struct mdk_personality *find_pers(int level, char *clevel)
+{
+	struct mdk_personality *pers;
+	list_for_each_entry(pers, &pers_list, list) {
+		if (level != LEVEL_NONE && pers->level == level)
+			return pers;
+		if (strcmp(pers->name, clevel)==0)
+			return pers;
+	}
+	return NULL;
+}
+
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
 	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -320,7 +361,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
 static void free_disk_sb(mdk_rdev_t * rdev)
 {
 	if (rdev->sb_page) {
-		page_cache_release(rdev->sb_page);
+		put_page(rdev->sb_page);
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
 		rdev->sb_offset = 0;
@@ -461,6 +502,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(sync_page_io);
 
 static int read_disk_sb(mdk_rdev_t * rdev, int size)
 {
@@ -665,6 +707,10 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	}
 	rdev->size = calc_dev_size(rdev, sb->chunk_size);
 
+	if (rdev->size < sb->size && sb->level > 1)
+		/* "this cannot possibly happen" ... */
+		ret = -EINVAL;
+
  abort:
 	return ret;
 }
@@ -688,6 +734,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = sb->ctime;
 		mddev->utime = sb->utime;
 		mddev->level = sb->level;
+		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
 		mddev->size = sb->size;
@@ -714,9 +761,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 	    mddev->bitmap_file == NULL) {
-		if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
+		if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+		    && mddev->level != 10) {
 			/* FIXME use a better test */
-			printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+			printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 			return -EINVAL;
 		}
 		mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -968,6 +1016,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	}
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
 	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
@@ -1006,6 +1055,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	rdev->size = le64_to_cpu(sb->data_size)/2;
 	if (le32_to_cpu(sb->chunksize))
 		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+
+	if (le32_to_cpu(sb->size) > rdev->size*2)
+		return -EINVAL;
 	return 0;
 }
 
@@ -1023,6 +1075,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
 		mddev->level = le32_to_cpu(sb->level);
+		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->size = le64_to_cpu(sb->size)/2;
@@ -1037,8 +1090,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
 	    mddev->bitmap_file == NULL ) {
-		if (mddev->level != 1) {
-			printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+		if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+		    && mddev->level != 10) {
+			printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 			return -EINVAL;
 		}
 		mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
@@ -1105,6 +1159,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	else
 		sb->resync_offset = cpu_to_le64(0);
 
+	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
+
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1187,6 +1243,14 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		MD_BUG();
 		return -EINVAL;
 	}
+	/* make sure rdev->size exceeds mddev->size */
+	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+		if (mddev->pers)
+			/* Cannot change size, so fail */
+			return -ENOSPC;
+		else
+			mddev->size = rdev->size;
+	}
 	same_pdev = match_dev_unit(mddev, rdev);
 	if (same_pdev)
 		printk(KERN_WARNING
@@ -1496,6 +1560,26 @@ repeat:
 
 }
 
+/* words written to sysfs files may, or may not, be \n terminated.
+ * We want to accept either case. For this we use cmd_match.
+ */
+static int cmd_match(const char *cmd, const char *str)
+{
+	/* See if cmd, written into a sysfs file, matches
+	 * str.  They must either be the same, or cmd can
+	 * have a trailing newline
+	 */
+	while (*cmd && *str && *cmd == *str) {
+		cmd++;
+		str++;
+	}
+	if (*cmd == '\n')
+		cmd++;
+	if (*str || *cmd)
+		return 0;
+	return 1;
+}
+
 struct rdev_sysfs_entry {
 	struct attribute attr;
 	ssize_t (*show)(mdk_rdev_t *, char *);
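
The helper's contract is easiest to see in isolation; a userspace copy of
the same logic (purely illustrative):

    /* Standalone demonstration of cmd_match() semantics. */
    #include <assert.h>

    static int cmd_match(const char *cmd, const char *str)
    {
        while (*cmd && *str && *cmd == *str) {
            cmd++;
            str++;
        }
        if (*cmd == '\n')
            cmd++;
        if (*str || *cmd)
            return 0;
        return 1;
    }

    int main(void)
    {
        assert(cmd_match("idle", "idle"));    /* exact match */
        assert(cmd_match("idle\n", "idle"));  /* trailing newline accepted */
        assert(!cmd_match("idle2", "idle"));  /* trailing junk rejected */
        assert(!cmd_match("idl", "idle"));    /* bare prefix rejected */
        return 0;
    }
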
@@ -1538,9 +1622,113 @@ super_show(mdk_rdev_t *rdev, char *page)
 }
 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
 
+static ssize_t
+errors_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
+}
+
+static ssize_t
+errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+	if (*buf && (*e == 0 || *e == '\n')) {
+		atomic_set(&rdev->corrected_errors, n);
+		return len;
+	}
+	return -EINVAL;
+}
+static struct rdev_sysfs_entry rdev_errors =
+__ATTR(errors, 0644, errors_show, errors_store);
+
+static ssize_t
+slot_show(mdk_rdev_t *rdev, char *page)
+{
+	if (rdev->raid_disk < 0)
+		return sprintf(page, "none\n");
+	else
+		return sprintf(page, "%d\n", rdev->raid_disk);
+}
+
+static ssize_t
+slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	int slot = simple_strtoul(buf, &e, 10);
+	if (strncmp(buf, "none", 4)==0)
+		slot = -1;
+	else if (e==buf || (*e && *e!= '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		/* Cannot set slot in active array (yet) */
+		return -EBUSY;
+	if (slot >= rdev->mddev->raid_disks)
+		return -ENOSPC;
+	rdev->raid_disk = slot;
+	/* assume it is working */
+	rdev->flags = 0;
+	set_bit(In_sync, &rdev->flags);
+	return len;
+}
+
+
+static struct rdev_sysfs_entry rdev_slot =
+__ATTR(slot, 0644, slot_show, slot_store);
+
+static ssize_t
+offset_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
+}
+
+static ssize_t
+offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long offset = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->data_offset = offset;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_offset =
+__ATTR(offset, 0644, offset_show, offset_store);
+
+static ssize_t
+rdev_size_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+}
+
+static ssize_t
+rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->size = size;
+	if (size < rdev->mddev->size || rdev->mddev->size == 0)
+		rdev->mddev->size = size;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_size =
+__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
+	&rdev_errors.attr,
+	&rdev_slot.attr,
+	&rdev_offset.attr,
+	&rdev_size.attr,
 	NULL,
 };
 static ssize_t
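
These attributes appear under each member device's kobject inside the
array's md directory. A read-side sketch (the md0/dev-sdb names are
examples of how component devices show up under /sys/block/mdX/md/):

    /* Sketch: read one member's corrected-error count via sysfs. */
    #include <stdio.h>

    int main(void)
    {
        char buf[64];
        FILE *f = fopen("/sys/block/md0/md/dev-sdb/errors", "r");
        if (!f)
            return 1;
        if (fgets(buf, sizeof(buf), f))
            printf("corrected errors: %s", buf);
        fclose(f);
        return 0;
    }
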
@@ -1598,12 +1786,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	mdk_rdev_t *rdev;
 	sector_t size;
 
-	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
 	if (!rdev) {
 		printk(KERN_ERR "md: could not alloc mem for new device!\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	memset(rdev, 0, sizeof(*rdev));
 
 	if ((err = alloc_disk_sb(rdev)))
 		goto abort_free;
@@ -1621,6 +1808,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	rdev->data_offset = 0;
 	atomic_set(&rdev->nr_pending, 0);
 	atomic_set(&rdev->read_errors, 0);
+	atomic_set(&rdev->corrected_errors, 0);
 
 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 	if (!size) {
@@ -1725,16 +1913,37 @@ static void analyze_sbs(mddev_t * mddev)
 static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
-	mdk_personality_t *p = mddev->pers;
-	if (p == NULL && mddev->raid_disks == 0)
-		return 0;
-	if (mddev->level >= 0)
-		return sprintf(page, "raid%d\n", mddev->level);
-	else
+	struct mdk_personality *p = mddev->pers;
+	if (p)
 		return sprintf(page, "%s\n", p->name);
+	else if (mddev->clevel[0])
+		return sprintf(page, "%s\n", mddev->clevel);
+	else if (mddev->level != LEVEL_NONE)
+		return sprintf(page, "%d\n", mddev->level);
+	else
+		return 0;
+}
+
+static ssize_t
+level_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int rv = len;
+	if (mddev->pers)
+		return -EBUSY;
+	if (len == 0)
+		return 0;
+	if (len >= sizeof(mddev->clevel))
+		return -ENOSPC;
+	strncpy(mddev->clevel, buf, len);
+	if (mddev->clevel[len-1] == '\n')
+		len--;
+	mddev->clevel[len] = 0;
+	mddev->level = LEVEL_NONE;
+	return rv;
 }
 
-static struct md_sysfs_entry md_level = __ATTR_RO(level);
+static struct md_sysfs_entry md_level =
+__ATTR(level, 0644, level_show, level_store);
 
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
@@ -1744,7 +1953,197 @@ raid_disks_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%d\n", mddev->raid_disks);
 }
 
-static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);
+static int update_raid_disks(mddev_t *mddev, int raid_disks);
+
+static ssize_t
+raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set raid_disks if array is not yet active */
+	char *e;
+	int rv = 0;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers)
+		rv = update_raid_disks(mddev, n);
+	else
+		mddev->raid_disks = n;
+	return rv ? rv : len;
+}
+static struct md_sysfs_entry md_raid_disks =
+__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
+
+static ssize_t
+chunk_size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d\n", mddev->chunk_size);
+}
+
+static ssize_t
+chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set chunk_size if array is not yet active */
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (mddev->pers)
+		return -EBUSY;
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	mddev->chunk_size = n;
+	return len;
+}
+static struct md_sysfs_entry md_chunk_size =
+__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+
+static ssize_t
+null_show(mddev_t *mddev, char *page)
+{
+	return -EINVAL;
+}
+
+static ssize_t
+new_dev_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* buf must be %d:%d\n? giving major and minor numbers */
+	/* The new device is added to the array.
+	 * If the array has a persistent superblock, we read the
+	 * superblock to initialise info and check validity.
+	 * Otherwise, only checking done is that in bind_rdev_to_array,
+	 * which mainly checks size.
+	 */
+	char *e;
+	int major = simple_strtoul(buf, &e, 10);
+	int minor;
+	dev_t dev;
+	mdk_rdev_t *rdev;
+	int err;
+
+	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
+		return -EINVAL;
+	minor = simple_strtoul(e+1, &e, 10);
+	if (*e && *e != '\n')
+		return -EINVAL;
+	dev = MKDEV(major, minor);
+	if (major != MAJOR(dev) ||
+	    minor != MINOR(dev))
+		return -EOVERFLOW;
+
+
+	if (mddev->persistent) {
+		rdev = md_import_device(dev, mddev->major_version,
+					mddev->minor_version);
+		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+						       mdk_rdev_t, same_set);
+			err = super_types[mddev->major_version]
+				.load_super(rdev, rdev0, mddev->minor_version);
+			if (err < 0)
+				goto out;
+		}
+	} else
+		rdev = md_import_device(dev, -1, -1);
+
+	if (IS_ERR(rdev))
+		return PTR_ERR(rdev);
+	err = bind_rdev_to_array(rdev, mddev);
+ out:
+	if (err)
+		export_rdev(rdev);
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_new_device =
+__ATTR(new_dev, 0200, null_show, new_dev_store);
+
+static ssize_t
+size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+}
+
+static int update_size(mddev_t *mddev, unsigned long size);
+
+static ssize_t
+size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* If array is inactive, we can reduce the component size, but
+	 * not increase it (except from 0).
+	 * If array is active, we can try an on-line resize
+	 */
+	char *e;
+	int err = 0;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (!*buf || *buf == '\n' ||
+	    (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers) {
+		err = update_size(mddev, size);
+		md_update_sb(mddev);
+	} else {
+		if (mddev->size == 0 ||
+		    mddev->size > size)
+			mddev->size = size;
+		else
+			err = -ENOSPC;
+	}
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_size =
+__ATTR(component_size, 0644, size_show, size_store);
+
+
+/* Metadata version.
+ * This is either 'none' for arrays with externally managed metadata,
+ * or N.M for internally known formats
+ */
+static ssize_t
+metadata_show(mddev_t *mddev, char *page)
+{
+	if (mddev->persistent)
+		return sprintf(page, "%d.%d\n",
+			       mddev->major_version, mddev->minor_version);
+	else
+		return sprintf(page, "none\n");
+}
+
+static ssize_t
+metadata_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int major, minor;
+	char *e;
+	if (!list_empty(&mddev->disks))
+		return -EBUSY;
+
+	if (cmd_match(buf, "none")) {
+		mddev->persistent = 0;
+		mddev->major_version = 0;
+		mddev->minor_version = 90;
+		return len;
+	}
+	major = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '.')
+		return -EINVAL;
+	buf = e+1;
+	minor = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '\n')
+		return -EINVAL;
+	if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
+	    super_types[major].name == NULL)
+		return -ENOENT;
+	mddev->major_version = major;
+	mddev->minor_version = minor;
+	mddev->persistent = 1;
+	return len;
+}
+
+static struct md_sysfs_entry md_metadata =
+__ATTR(metadata_version, 0644, metadata_show, metadata_store);
 
 static ssize_t
 action_show(mddev_t *mddev, char *page)
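
The new_dev attribute exposes device addition over sysfs, taking
"major:minor" text rather than an ioctl. A hedged userspace sketch (the
8:16 pair, i.e. /dev/sdb, is illustrative):

    /* Sketch: hand a component device to md0 through new_dev. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *dev = "8:16\n";  /* major:minor, newline optional */
        int fd = open("/sys/block/md0/md/new_dev", O_WRONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, dev, strlen(dev)) < 0)
            perror("write");
        close(fd);
        return 0;
    }
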
@@ -1771,31 +2170,27 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
 
-	if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) {
+	if (cmd_match(page, "idle")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			md_unregister_thread(mddev->sync_thread);
 			mddev->sync_thread = NULL;
 			mddev->recovery = 0;
 		}
-		return len;
-	}
-
-	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 ||
-	    strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0)
+	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	else {
-		if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0)
+		if (cmd_match(page, "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
-		else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0)
+		else if (!cmd_match(page, "repair"))
 			return -EINVAL;
 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return len;
 }
@@ -1814,15 +2209,107 @@ md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 static struct md_sysfs_entry
 md_mismatches = __ATTR_RO(mismatch_cnt);
 
+static ssize_t
+sync_min_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_min(mddev),
+		       mddev->sync_speed_min ? "local": "system");
+}
+
+static ssize_t
+sync_min_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int min;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_min = 0;
+		return len;
+	}
+	min = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || min <= 0)
+		return -EINVAL;
+	mddev->sync_speed_min = min;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_min =
+__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
+
+static ssize_t
+sync_max_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_max(mddev),
+		       mddev->sync_speed_max ? "local": "system");
+}
+
+static ssize_t
+sync_max_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int max;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_max = 0;
+		return len;
+	}
+	max = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || max <= 0)
+		return -EINVAL;
+	mddev->sync_speed_max = max;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_max =
+__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
+
+
+static ssize_t
+sync_speed_show(mddev_t *mddev, char *page)
+{
+	unsigned long resync, dt, db;
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt);
+	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+}
+
+static struct md_sysfs_entry
+md_sync_speed = __ATTR_RO(sync_speed);
+
+static ssize_t
+sync_completed_show(mddev_t *mddev, char *page)
+{
+	unsigned long max_blocks, resync;
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+		max_blocks = mddev->resync_max_sectors;
+	else
+		max_blocks = mddev->size << 1;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+}
+
+static struct md_sysfs_entry
+md_sync_completed = __ATTR_RO(sync_completed);
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
+	&md_chunk_size.attr,
+	&md_size.attr,
+	&md_metadata.attr,
+	&md_new_device.attr,
 	NULL,
 };
 
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
 	&md_mismatches.attr,
+	&md_sync_min.attr,
+	&md_sync_max.attr,
+	&md_sync_speed.attr,
+	&md_sync_completed.attr,
 	NULL,
 };
 static struct attribute_group md_redundancy_group = {
@@ -1937,14 +2424,16 @@ static void md_safemode_timeout(unsigned long data)
 	md_wakeup_thread(mddev->thread);
 }
 
+static int start_dirty_degraded;
 
 static int do_md_run(mddev_t * mddev)
 {
-	int pnum, err;
+	int err;
 	int chunk_size;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 	struct gendisk *disk;
+	struct mdk_personality *pers;
 	char b[BDEVNAME_SIZE];
 
 	if (list_empty(&mddev->disks))
@@ -1961,20 +2450,8 @@ static int do_md_run(mddev_t * mddev)
 	analyze_sbs(mddev);
 
 	chunk_size = mddev->chunk_size;
-	pnum = level_to_pers(mddev->level);
 
-	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
-		if (!chunk_size) {
-			/*
-			 * 'default chunksize' in the old md code used to
-			 * be PAGE_SIZE, baaad.
-			 * we abort here to be on the safe side. We don't
-			 * want to continue the bad practice.
-			 */
-			printk(KERN_ERR
-				"no chunksize specified, see 'man raidtab'\n");
-			return -EINVAL;
-		}
+	if (chunk_size) {
 		if (chunk_size > MAX_CHUNK_SIZE) {
 			printk(KERN_ERR "too big chunk_size: %d > %d\n",
 				chunk_size, MAX_CHUNK_SIZE);
@@ -2010,10 +2487,10 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 #ifdef CONFIG_KMOD
-	if (!pers[pnum])
-	{
-		request_module("md-personality-%d", pnum);
-	}
+	if (mddev->level != LEVEL_NONE)
+		request_module("md-level-%d", mddev->level);
+	else if (mddev->clevel[0])
+		request_module("md-%s", mddev->clevel);
 #endif
 
 	/*
@@ -2035,30 +2512,39 @@ static int do_md_run(mddev_t * mddev)
 		return -ENOMEM;
 
 	spin_lock(&pers_lock);
-	if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+	pers = find_pers(mddev->level, mddev->clevel);
+	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality %d is not loaded!\n",
-		       pnum);
+		if (mddev->level != LEVEL_NONE)
+			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+			       mddev->level);
+		else
+			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
+			       mddev->clevel);
 		return -EINVAL;
 	}
-
-	mddev->pers = pers[pnum];
+	mddev->pers = pers;
 	spin_unlock(&pers_lock);
+	mddev->level = pers->level;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
 	mddev->barriers_work = 1;
+	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
-	/* before we start the array running, initialise the bitmap */
-	err = bitmap_create(mddev);
-	if (err)
-		printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
-			mdname(mddev), err);
-	else
-		err = mddev->pers->run(mddev);
+	err = mddev->pers->run(mddev);
+	if (!err && mddev->pers->sync_request) {
+		err = bitmap_create(mddev);
+		if (err) {
+			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
+			       mdname(mddev), err);
+			mddev->pers->stop(mddev);
+		}
+	}
 	if (err) {
 		printk(KERN_ERR "md: pers->run() failed ...\n");
 		module_put(mddev->pers->owner);
@@ -2104,6 +2590,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->queue->make_request_fn = mddev->pers->make_request;
 
 	mddev->changed = 1;
+	md_new_event(mddev);
 	return 0;
 }
 
@@ -2231,6 +2718,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
 			mdname(mddev));
 		err = 0;
+		md_new_event(mddev);
 out:
 	return err;
 }
@@ -2668,12 +3156,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
 
-		err = bind_rdev_to_array(rdev, mddev);
-		if (err) {
-			export_rdev(rdev);
-			return err;
-		}
-
 		if (!mddev->persistent) {
 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
 			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -2681,8 +3163,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
 		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
 
-		if (!mddev->size || (mddev->size > rdev->size))
-			mddev->size = rdev->size;
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err) {
+			export_rdev(rdev);
+			return err;
+		}
 	}
 
 	return 0;
@@ -2705,6 +3190,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
 
 	kick_rdev_from_array(rdev);
 	md_update_sb(mddev);
+	md_new_event(mddev);
 
 	return 0;
 busy:
@@ -2753,15 +3239,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	size = calc_dev_size(rdev, mddev->chunk_size);
 	rdev->size = size;
 
-	if (size < mddev->size) {
-		printk(KERN_WARNING
-			"%s: disk size %llu blocks < array size %llu\n",
-			mdname(mddev), (unsigned long long)size,
-			(unsigned long long)mddev->size);
-		err = -ENOSPC;
-		goto abort_export;
-	}
-
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING
 			"md: can not hot-add faulty %s disk to %s!\n",
@@ -2771,7 +3248,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	}
 	clear_bit(In_sync, &rdev->flags);
 	rdev->desc_nr = -1;
-	bind_rdev_to_array(rdev, mddev);
+	err = bind_rdev_to_array(rdev, mddev);
+	if (err)
+		goto abort_export;
 
 	/*
 	 * The rest should better be atomic, we can have disk failures
@@ -2795,7 +3274,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	 */
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
-
+	md_new_event(mddev);
 	return 0;
 
 abort_unbind_export:
@@ -2942,6 +3421,81 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
+static int update_size(mddev_t *mddev, unsigned long size)
+{
+	mdk_rdev_t * rdev;
+	int rv;
+	struct list_head *tmp;
+
+	if (mddev->pers->resize == NULL)
+		return -EINVAL;
+	/* The "size" is the amount of each device that is used.
+	 * This can only make sense for arrays with redundancy.
+	 * linear and raid0 always use whatever space is available
+	 * We can only consider changing the size if no resync
+	 * or reconstruction is happening, and if the new size
+	 * is acceptable. It must fit before the sb_offset or,
+	 * if that is <data_offset, it must fit before the
+	 * size of each device.
+	 * If size is zero, we find the largest size that fits.
+	 */
+	if (mddev->sync_thread)
+		return -EBUSY;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		sector_t avail;
+		int fit = (size == 0);
+		if (rdev->sb_offset > rdev->data_offset)
+			avail = (rdev->sb_offset*2) - rdev->data_offset;
+		else
+			avail = get_capacity(rdev->bdev->bd_disk)
+				- rdev->data_offset;
+		if (fit && (size == 0 || size > avail/2))
+			size = avail/2;
+		if (avail < ((sector_t)size << 1))
+			return -ENOSPC;
+	}
+	rv = mddev->pers->resize(mddev, (sector_t)size *2);
+	if (!rv) {
+		struct block_device *bdev;
+
+		bdev = bdget_disk(mddev->gendisk, 0);
+		if (bdev) {
+			down(&bdev->bd_inode->i_sem);
+			i_size_write(bdev->bd_inode, mddev->array_size << 10);
+			up(&bdev->bd_inode->i_sem);
+			bdput(bdev);
+		}
+	}
+	return rv;
+}
+
+static int update_raid_disks(mddev_t *mddev, int raid_disks)
+{
+	int rv;
+	/* change the number of raid disks */
+	if (mddev->pers->reshape == NULL)
+		return -EINVAL;
+	if (raid_disks <= 0 ||
+	    raid_disks >= mddev->max_disks)
+		return -EINVAL;
+	if (mddev->sync_thread)
+		return -EBUSY;
+	rv = mddev->pers->reshape(mddev, raid_disks);
+	if (!rv) {
+		struct block_device *bdev;
+
+		bdev = bdget_disk(mddev->gendisk, 0);
+		if (bdev) {
+			down(&bdev->bd_inode->i_sem);
+			i_size_write(bdev->bd_inode, mddev->array_size << 10);
+			up(&bdev->bd_inode->i_sem);
+			bdput(bdev);
+		}
+	}
+	return rv;
+}
+
+
 /*
  * update_array_info is used to change the configuration of an
  * on-line array.
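
These helpers also back the on-line path through update_array_info(),
reached from userspace with the md ioctls. A hedged sketch of requesting
an on-line grow (mdadm-style; error handling trimmed):

    /* Sketch: ask an active array to grow each component to the largest
     * size that fits, via GET_ARRAY_INFO/SET_ARRAY_INFO. */
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/raid/md_u.h>

    int main(void)
    {
        mdu_array_info_t info;
        int fd = open("/dev/md0", O_RDONLY);
        if (fd < 0)
            return 1;
        if (ioctl(fd, GET_ARRAY_INFO, &info) == 0) {
            info.size = 0;  /* 0 => "largest size that fits", per update_size() */
            ioctl(fd, SET_ARRAY_INFO, &info);
        }
        close(fd);
        return 0;
    }
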
@@ -2990,71 +3544,12 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 	else
 		return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
-	if (mddev->size != info->size) {
-		mdk_rdev_t * rdev;
-		struct list_head *tmp;
-		if (mddev->pers->resize == NULL)
-			return -EINVAL;
-		/* The "size" is the amount of each device that is used.
-		 * This can only make sense for arrays with redundancy.
-		 * linear and raid0 always use whatever space is available
-		 * We can only consider changing the size if no resync
-		 * or reconstruction is happening, and if the new size
-		 * is acceptable. It must fit before the sb_offset or,
-		 * if that is <data_offset, it must fit before the
-		 * size of each device.
-		 * If size is zero, we find the largest size that fits.
-		 */
-		if (mddev->sync_thread)
-			return -EBUSY;
-		ITERATE_RDEV(mddev,rdev,tmp) {
-			sector_t avail;
-			int fit = (info->size == 0);
-			if (rdev->sb_offset > rdev->data_offset)
-				avail = (rdev->sb_offset*2) - rdev->data_offset;
-			else
-				avail = get_capacity(rdev->bdev->bd_disk)
-					- rdev->data_offset;
-			if (fit && (info->size == 0 || info->size > avail/2))
-				info->size = avail/2;
-			if (avail < ((sector_t)info->size << 1))
-				return -ENOSPC;
-		}
-		rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
-		if (!rv) {
-			struct block_device *bdev;
-
-			bdev = bdget_disk(mddev->gendisk, 0);
-			if (bdev) {
-				down(&bdev->bd_inode->i_sem);
-				i_size_write(bdev->bd_inode, mddev->array_size << 10);
-				up(&bdev->bd_inode->i_sem);
-				bdput(bdev);
-			}
-		}
-	}
-	if (mddev->raid_disks != info->raid_disks) {
-		/* change the number of raid disks */
-		if (mddev->pers->reshape == NULL)
-			return -EINVAL;
-		if (info->raid_disks <= 0 ||
-		    info->raid_disks >= mddev->max_disks)
-			return -EINVAL;
-		if (mddev->sync_thread)
-			return -EBUSY;
-		rv = mddev->pers->reshape(mddev, info->raid_disks);
-		if (!rv) {
-			struct block_device *bdev;
-
-			bdev = bdget_disk(mddev->gendisk, 0);
-			if (bdev) {
-				down(&bdev->bd_inode->i_sem);
-				i_size_write(bdev->bd_inode, mddev->array_size << 10);
-				up(&bdev->bd_inode->i_sem);
-				bdput(bdev);
-			}
-		}
-	}
+	if (mddev->size != info->size)
+		rv = update_size(mddev, info->size);
+
+	if (mddev->raid_disks != info->raid_disks)
+		rv = update_raid_disks(mddev, info->raid_disks);
+
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
 		if (mddev->pers->quiesce == NULL)
 			return -EINVAL;
@@ -3476,11 +3971,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 {
 	mdk_thread_t *thread;
 
-	thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
+	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
 	if (!thread)
 		return NULL;
 
-	memset(thread, 0, sizeof(mdk_thread_t));
 	init_waitqueue_head(&thread->wqueue);
 
 	thread->run = run;
@@ -3524,6 +4018,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	md_new_event(mddev);
 }
 
 /* seq_file implementation /proc/mdstat */
@@ -3664,24 +4159,29 @@ static void md_seq_stop(struct seq_file *seq, void *v)
 	mddev_put(mddev);
 }
 
+struct mdstat_info {
+	int event;
+};
+
 static int md_seq_show(struct seq_file *seq, void *v)
 {
 	mddev_t *mddev = v;
 	sector_t size;
 	struct list_head *tmp2;
 	mdk_rdev_t *rdev;
-	int i;
+	struct mdstat_info *mi = seq->private;
 	struct bitmap *bitmap;
 
 	if (v == (void*)1) {
+		struct mdk_personality *pers;
 		seq_printf(seq, "Personalities : ");
 		spin_lock(&pers_lock);
-		for (i = 0; i < MAX_PERSONALITY; i++)
-			if (pers[i])
-				seq_printf(seq, "[%s] ", pers[i]->name);
+		list_for_each_entry(pers, &pers_list, list)
+			seq_printf(seq, "[%s] ", pers->name);
 
 		spin_unlock(&pers_lock);
 		seq_printf(seq, "\n");
+		mi->event = atomic_read(&md_event_count);
 		return 0;
 	}
 	if (v == (void*)2) {
@@ -3790,47 +4290,68 @@ static struct seq_operations md_seq_ops = {
 static int md_seq_open(struct inode *inode, struct file *file)
 {
 	int error;
+	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
+	if (mi == NULL)
+		return -ENOMEM;
 
 	error = seq_open(file, &md_seq_ops);
+	if (error)
+		kfree(mi);
+	else {
+		struct seq_file *p = file->private_data;
+		p->private = mi;
+		mi->event = atomic_read(&md_event_count);
+	}
 	return error;
 }
 
+static int md_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct mdstat_info *mi = m->private;
+	m->private = NULL;
+	kfree(mi);
+	return seq_release(inode, file);
+}
+
+static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
+{
+	struct seq_file *m = filp->private_data;
+	struct mdstat_info *mi = m->private;
+	int mask;
+
+	poll_wait(filp, &md_event_waiters, wait);
+
+	/* always allow read */
+	mask = POLLIN | POLLRDNORM;
+
+	if (mi->event != atomic_read(&md_event_count))
+		mask |= POLLERR | POLLPRI;
+	return mask;
+}
+
 static struct file_operations md_seq_fops = {
 	.open           = md_seq_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
-	.release	= seq_release,
+	.release	= md_seq_release,
+	.poll		= mdstat_poll,
 };
 
-int register_md_personality(int pnum, mdk_personality_t *p)
+int register_md_personality(struct mdk_personality *p)
 {
-	if (pnum >= MAX_PERSONALITY) {
-		printk(KERN_ERR
-		       "md: tried to install personality %s as nr %d, but max is %lu\n",
-		       p->name, pnum, MAX_PERSONALITY-1);
-		return -EINVAL;
-	}
-
 	spin_lock(&pers_lock);
-	if (pers[pnum]) {
-		spin_unlock(&pers_lock);
-		return -EBUSY;
-	}
-
-	pers[pnum] = p;
-	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+	list_add_tail(&p->list, &pers_list);
+	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
 	spin_unlock(&pers_lock);
 	return 0;
 }
 
-int unregister_md_personality(int pnum)
+int unregister_md_personality(struct mdk_personality *p)
 {
-	if (pnum >= MAX_PERSONALITY)
-		return -EINVAL;
-
-	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
 	spin_lock(&pers_lock);
-	pers[pnum] = NULL;
+	list_del_init(&p->list);
 	spin_unlock(&pers_lock);
 	return 0;
 }
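
With md_new_event() feeding the wait queue, /proc/mdstat becomes an event
channel: a monitor can block in poll() instead of re-reading on a timer. A
sketch of the consumer side (re-reading after each wakeup is what latches
the new event count recorded at read time):

    /* Sketch: block until the md event count changes. mdstat reports
     * POLLERR|POLLPRI when events have occurred since the last read. */
    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[4096];
        int fd = open("/proc/mdstat", O_RDONLY);
        if (fd < 0)
            return 1;
        for (;;) {
            lseek(fd, 0, SEEK_SET);
            while (read(fd, buf, sizeof(buf)) > 0)
                ;  /* consume the current snapshot, latching the count */
            struct pollfd pfd = { .fd = fd, .events = POLLPRI };
            if (poll(&pfd, 1, -1) > 0)
                printf("md event\n");
        }
    }
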
@@ -4012,10 +4533,10 @@ static void md_do_sync(mddev_t *mddev)
 
 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-	       " %d KB/sec/disc.\n", sysctl_speed_limit_min);
+	       " %d KB/sec/disc.\n", speed_min(mddev));
 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
 	       "(but not more than %d KB/sec) for reconstruction.\n",
-	       sysctl_speed_limit_max);
+	       speed_max(mddev));
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
 	/* we don't use the checkpoint if there's a bitmap */
@@ -4056,7 +4577,7 @@ static void md_do_sync(mddev_t *mddev)
 
 		skipped = 0;
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
-						    currspeed < sysctl_speed_limit_min);
+						    currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
 			goto out;
@@ -4069,7 +4590,11 @@ static void md_do_sync(mddev_t *mddev)
 
 		j += sectors;
 		if (j>1) mddev->curr_resync = j;
-
+		if (last_check == 0)
+			/* this is the earliest that rebuild will be
+			 * visible in /proc/mdstat
+			 */
+			md_new_event(mddev);
 
 		if (last_check + window > io_sectors || j == max_sectors)
 			continue;
@@ -4117,8 +4642,8 @@ static void md_do_sync(mddev_t *mddev)
 	currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
 		/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
-	if (currspeed > sysctl_speed_limit_min) {
-		if ((currspeed > sysctl_speed_limit_max) ||
+	if (currspeed > speed_min(mddev)) {
+		if ((currspeed > speed_max(mddev)) ||
 		    !is_mddev_idle(mddev)) {
 			msleep(500);
 			goto repeat;
@@ -4255,6 +4780,7 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_new_event(mddev);
 			goto unlock;
 		}
 		/* Clear some bits that don't mean anything, but
@@ -4292,6 +4818,7 @@ void md_check_recovery(mddev_t *mddev)
 					sprintf(nm, "rd%d", rdev->raid_disk);
 					sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 					spares++;
+					md_new_event(mddev);
 				} else
 					break;
 			}
@@ -4324,9 +4851,9 @@ void md_check_recovery(mddev_t *mddev)
 					mdname(mddev));
 				/* leave the spares where they are, it shouldn't hurt */
 				mddev->recovery = 0;
-			} else {
+			} else
 				md_wakeup_thread(mddev->sync_thread);
-			}
+			md_new_event(mddev);
 		}
 	unlock:
 		mddev_unlock(mddev);
@@ -4503,12 +5030,14 @@ static int set_ro(const char *val, struct kernel_param *kp)
 	int num = simple_strtoul(val, &e, 10);
 	if (*val && (*e == '\0' || *e == '\n')) {
 		start_readonly = num;
-		return 0;;
+		return 0;
 	}
 	return -EINVAL;
 }
 
 module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
+module_param(start_dirty_degraded, int, 0644);
+
 
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);