author    Linus Torvalds <torvalds@linux-foundation.org>  2009-04-03 12:08:19 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2009-04-03 12:08:19 -0400
commit    223cdea4c4b5af5181b2da00ac85711d1e0c737c (patch)
tree      dfe7226c70ddabbf2e2e63924ba636345278e79c /drivers/md/md.c
parent    31e6e2dac575c9d21a6ec56ca52ae89086baa705 (diff)
parent    c8f517c444e4f9f55b5b5ca202b8404691a35805 (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (53 commits)
  md/raid5: revise rules for when to update metadata during reshape
  md/raid5: minor code cleanups in make_request.
  md: remove CONFIG_MD_RAID_RESHAPE config option.
  md/raid5: be more careful about write ordering when reshaping.
  md: don't display meaningless values in sysfs files resync_start and sync_speed
  md/raid5: allow layout and chunksize to be changed on active array.
  md/raid5: reshape using largest of old and new chunk size
  md/raid5: prepare for allowing reshape to change layout
  md/raid5: prepare for allowing reshape to change chunksize.
  md/raid5: clearly differentiate 'before' and 'after' stripes during reshape.
  Documentation/md.txt update
  md: allow number of drives in raid5 to be reduced
  md/raid5: change reshape-progress measurement to cope with reshaping backwards.
  md: add explicit method to signal the end of a reshape.
  md/raid5: enhance raid5_size to work correctly with negative delta_disks
  md/raid5: drop qd_idx from r6_state
  md/raid6: move raid6 data processing to raid6_pq.ko
  md: raid5 run(): Fix max_degraded for raid level 4.
  md: 'array_size' sysfs attribute
  md: centralize ->array_sectors modifications
  ...
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--	drivers/md/md.c	615
1 file changed, 467 insertions(+), 148 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a307f87eb90e..ed5727c089a9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,9 +33,9 @@
  */
 
 #include <linux/kthread.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
@@ -45,11 +45,10 @@
 #include <linux/reboot.h>
 #include <linux/file.h>
 #include <linux/delay.h>
-
-#define MAJOR_NR MD_MAJOR
-
-/* 63 partitions with the alternate major number (mdp) */
-#define MdpMinorShift 6
+#include <linux/raid/md_p.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
 		)
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-	bio_io_error(bio);
-	return 0;
+	mddev_t *mddev = q->queuedata;
+	int rv;
+	if (mddev == NULL || mddev->pers == NULL) {
+		bio_io_error(bio);
+		return 0;
+	}
+	rcu_read_lock();
+	if (mddev->suspended) {
+		DEFINE_WAIT(__wait);
+		for (;;) {
+			prepare_to_wait(&mddev->sb_wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!mddev->suspended)
+				break;
+			rcu_read_unlock();
+			schedule();
+			rcu_read_lock();
+		}
+		finish_wait(&mddev->sb_wait, &__wait);
+	}
+	atomic_inc(&mddev->active_io);
+	rcu_read_unlock();
+	rv = mddev->pers->make_request(q, bio);
+	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+		wake_up(&mddev->sb_wait);
+
+	return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+	BUG_ON(mddev->suspended);
+	mddev->suspended = 1;
+	synchronize_rcu();
+	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+	mddev->pers->quiesce(mddev, 1);
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
+	/* we now know that no code is executing in the personality module,
+	 * except possibly the tail end of a ->bi_end_io function, but that
+	 * is certain to complete before the module has a chance to get
+	 * unloaded
+	 */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+	mddev->suspended = 0;
+	wake_up(&mddev->sb_wait);
+	mddev->pers->quiesce(mddev, 0);
 }
 
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
 	atomic_set(&new->openers, 0);
+	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+static inline int mddev_is_locked(mddev_t *mddev)
+{
+	return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_trylock(mddev_t * mddev)
 {
 	return mutex_trylock(&mddev->reconfig_mutex);
@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
 		rdev->sb_start = 0;
-		rdev->size = 0;
+		rdev->sectors = 0;
 	}
 }
 
@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 		else
 			ret = 0;
 	}
-	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
 
-	if (rdev->size < sb->size && sb->level > 1)
+	if (rdev->sectors < sb->size * 2 && sb->level > 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;
 
@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->size = sb->size;
+		mddev->dev_sectors = sb->size * 2;
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->ctime = mddev->ctime;
 	sb->level = mddev->level;
-	sb->size = mddev->size;
+	sb->size = mddev->dev_sectors / 2;
 	sb->raid_disks = mddev->raid_disks;
 	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = 0;
@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 static unsigned long long
 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 			ret = 0;
 	}
 	if (minor_version)
-		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			le64_to_cpu(sb->data_offset);
 	else
-		rdev->size = rdev->sb_start / 2;
-	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		rdev->sectors = rdev->sb_start;
+	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
-	rdev->size = le64_to_cpu(sb->data_size)/2;
+	rdev->sectors = le64_to_cpu(sb->data_size);
 	if (le32_to_cpu(sb->chunksize))
-		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+		rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
 
-	if (le64_to_cpu(sb->size) > rdev->size*2)
+	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
 	return ret;
 }
@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
-		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->dev_sectors = le64_to_cpu(sb->size);
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = 1024 >> 9;
@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
 
 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
-	sb->size = cpu_to_le64(mddev->size<<1);
+	sb->size = cpu_to_le64(mddev->dev_sectors);
 
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 
 	if (rdev->raid_disk >= 0 &&
-	    !test_bit(In_sync, &rdev->flags) &&
-	    rdev->recovery_offset > 0) {
-		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
-		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	    !test_bit(In_sync, &rdev->flags)) {
+		if (mddev->curr_resync_completed > rdev->recovery_offset)
+			rdev->recovery_offset = mddev->curr_resync_completed;
+		if (rdev->recovery_offset > 0) {
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+			sb->recovery_offset =
+				cpu_to_le64(rdev->recovery_offset);
+		}
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
 	struct mdp_superblock_1 *sb;
 	sector_t max_sectors;
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		sector_t sb_start;
 		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
-		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
 		rdev->sb_start = sb_start;
@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 
 static LIST_HEAD(pending_raid_disks);
 
+static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	struct mdk_personality *pers = mddev->pers;
+	struct gendisk *disk = mddev->gendisk;
+	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
+	struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+
+	/* Data integrity passthrough not supported on RAID 4, 5 and 6 */
+	if (pers && pers->level >= 4 && pers->level <= 6)
+		return;
+
+	/* If rdev is integrity capable, register profile for mddev */
+	if (!bi_mddev && bi_rdev) {
+		if (blk_integrity_register(disk, bi_rdev))
+			printk(KERN_ERR "%s: %s Could not register integrity!\n",
+			       __func__, disk->disk_name);
+		else
+			printk(KERN_NOTICE "Enabling data integrity on %s\n",
+			       disk->disk_name);
+		return;
+	}
+
+	/* Check that mddev and rdev have matching profiles */
+	if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
+		printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
+		       disk->disk_name, rdev->bdev->bd_disk->disk_name);
+		printk(KERN_NOTICE "Disabling data integrity on %s\n",
+		       disk->disk_name);
+		blk_integrity_unregister(disk);
+	}
+}
+
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	char b[BDEVNAME_SIZE];
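md_integrity_check() wires md into the block layer's data-integrity (DIF/DIX) support: RAID 4/5/6 are excluded outright, the first integrity-capable member donates its profile to the array, and any later member whose profile does not match causes integrity to be switched off again. A stripped-down model of that decision logic, with integrity profiles reduced to plain ints (0 meaning "no profile") purely for illustration:

    #include <stdio.h>

    static int array_profile;   /* stands in for blk_get_integrity(disk) */

    static void integrity_check(int rdev_profile, int raid_level)
    {
            if (raid_level >= 4 && raid_level <= 6)
                    return;                          /* no passthrough on RAID 4/5/6 */
            if (!array_profile && rdev_profile) {
                    array_profile = rdev_profile;    /* blk_integrity_register() */
                    printf("enabling data integrity\n");
                    return;
            }
            if (array_profile && array_profile != rdev_profile) {
                    array_profile = 0;               /* blk_integrity_unregister() */
                    printf("integrity mismatch, disabling\n");
            }
    }

    int main(void)
    {
            integrity_check(1, 1);   /* first member of a raid1: enable */
            integrity_check(2, 1);   /* mismatched member: disable again */
            return 0;
    }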
@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	if (find_rdev(mddev, rdev->bdev->bd_dev))
 		return -EEXIST;
 
-	/* make sure rdev->size exceeds mddev->size */
-	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+	/* make sure rdev->sectors exceeds mddev->dev_sectors */
+	if (rdev->sectors && (mddev->dev_sectors == 0 ||
+			rdev->sectors < mddev->dev_sectors)) {
 		if (mddev->pers) {
 			/* Cannot change size, so fail
 			 * If mddev->level <= 0, then we don't care
@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			if (mddev->level > 0)
 				return -ENOSPC;
 		} else
-			mddev->size = rdev->size;
+			mddev->dev_sectors = rdev->sectors;
 	}
 
 	/* Verify rdev->desc_nr is unique.
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
+
+	md_integrity_check(rdev, mddev);
 	return 0;
 
  fail:
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
-		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
 		test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
 		rdev->desc_nr);
 	if (rdev->sb_loaded) {
@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
-	if (rdev->size && rdev->mddev->external)
+	if (rdev->sectors && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
 		 * can be sane */
 		return -EBUSY;
@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
 
 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 	return 1;
 }
 
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+	unsigned long long blocks;
+	sector_t new;
+
+	if (strict_strtoull(buf, 10, &blocks) < 0)
+		return -EINVAL;
+
+	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+		return -EINVAL; /* sector conversion overflow */
+
+	new = blocks * 2;
+	if (new != blocks * 2)
+		return -EINVAL; /* unsigned long long to sector_t overflow */
+
+	*sectors = new;
+	return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	unsigned long long size;
-	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
+	sector_t oldsectors = rdev->sectors;
+	sector_t sectors;
 
-	if (strict_strtoull(buf, 10, &size) < 0)
+	if (strict_blocks_to_sectors(buf, &sectors) < 0)
 		return -EINVAL;
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
-			size = super_types[my_mddev->major_version].
-				rdev_size_change(rdev, size * 2);
-			if (!size)
+			sectors = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, sectors);
+			if (!sectors)
 				return -EBUSY;
-		} else if (!size) {
-			size = (rdev->bdev->bd_inode->i_size >> 10);
-			size -= rdev->data_offset/2;
-		}
+		} else if (!sectors)
+			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+				rdev->data_offset;
 	}
-	if (size < my_mddev->size)
+	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
 
-	rdev->size = size;
-	if (size > oldsize && my_mddev->external) {
+	rdev->sectors = sectors;
+	if (sectors > oldsectors && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
-		 * a deadlock.  We have already changed rdev->size, and if
+		 * a deadlock.  We have already changed rdev->sectors, and if
 		 * we have to change it back, we will have the lock again.
 		 */
 		mddev_t *mddev;
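strict_blocks_to_sectors(), added above, guards the two ways a 1K-block count from sysfs can go wrong when doubled into 512-byte sectors: the multiplication can overflow 64 bits (the top-bit test), and the result can be truncated by the assignment to sector_t, which can be narrower than unsigned long long on 32-bit kernels of this era. A stand-alone version of the same checks; sector_t is typedef'd locally just for the demonstration:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned long sector_t;   /* assumption: stands in for the kernel type */

    static int blocks_to_sectors(const char *buf, sector_t *sectors)
    {
            char *end;
            unsigned long long blocks = strtoull(buf, &end, 10);
            sector_t new;

            if (end == buf)
                    return -EINVAL;
            if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
                    return -EINVAL;    /* blocks * 2 would overflow 64 bits */

            new = blocks * 2;          /* possible narrowing happens here */
            if (new != blocks * 2)
                    return -EINVAL;    /* doubled value did not fit in sector_t */

            *sectors = new;
            return 0;
    }

    int main(void)
    {
            sector_t s = 0;
            int rc = blocks_to_sectors("1024", &s);
            printf("1024 blocks -> rc=%d sectors=%lu\n", rc, s);   /* rc=0, 2048 */
            rc = blocks_to_sectors("9223372036854775808", &s);     /* 2^63 blocks */
            printf("2^63 blocks -> rc=%d\n", rc);                  /* rc=-22 */
            return 0;
    }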
@@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (test_bit(AllReserved, &rdev2->flags) ||
 			    (rdev->bdev == rdev2->bdev &&
 			     rdev != rdev2 &&
-			     overlaps(rdev->data_offset, rdev->size * 2,
+			     overlaps(rdev->data_offset, rdev->sectors,
 				      rdev2->data_offset,
-				      rdev2->size * 2))) {
+				      rdev2->sectors))) {
 				overlap = 1;
 				break;
 			}
@@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (overlap) {
 			/* Someone else could have slipped in a size
 			 * change here, but doing so is just silly.
-			 * We put oldsize back because we *know* it is
+			 * We put oldsectors back because we *know* it is
 			 * safe, and trust userspace not to race with
 			 * itself
 			 */
-			rdev->size = oldsize;
+			rdev->sectors = oldsectors;
 			return -EBUSY;
 		}
 	}
@@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page)
 static ssize_t
 level_store(mddev_t *mddev, const char *buf, size_t len)
 {
+	char level[16];
 	ssize_t rv = len;
-	if (mddev->pers)
+	struct mdk_personality *pers;
+	void *priv;
+
+	if (mddev->pers == NULL) {
+		if (len == 0)
+			return 0;
+		if (len >= sizeof(mddev->clevel))
+			return -ENOSPC;
+		strncpy(mddev->clevel, buf, len);
+		if (mddev->clevel[len-1] == '\n')
+			len--;
+		mddev->clevel[len] = 0;
+		mddev->level = LEVEL_NONE;
+		return rv;
+	}
+
+	/* request to change the personality.  Need to ensure:
+	 *  - array is not engaged in resync/recovery/reshape
+	 *  - old personality can be suspended
+	 *  - new personality will access other array.
+	 */
+
+	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
 		return -EBUSY;
-	if (len == 0)
-		return 0;
-	if (len >= sizeof(mddev->clevel))
-		return -ENOSPC;
-	strncpy(mddev->clevel, buf, len);
-	if (mddev->clevel[len-1] == '\n')
+
+	if (!mddev->pers->quiesce) {
+		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+		       mdname(mddev), mddev->pers->name);
+		return -EINVAL;
+	}
+
+	/* Now find the new personality */
+	if (len == 0 || len >= sizeof(level))
+		return -EINVAL;
+	strncpy(level, buf, len);
+	if (level[len-1] == '\n')
 		len--;
-	mddev->clevel[len] = 0;
-	mddev->level = LEVEL_NONE;
+	level[len] = 0;
+
+	request_module("md-%s", level);
+	spin_lock(&pers_lock);
+	pers = find_pers(LEVEL_NONE, level);
+	if (!pers || !try_module_get(pers->owner)) {
+		spin_unlock(&pers_lock);
+		printk(KERN_WARNING "md: personality %s not loaded\n", level);
+		return -EINVAL;
+	}
+	spin_unlock(&pers_lock);
+
+	if (pers == mddev->pers) {
+		/* Nothing to do! */
+		module_put(pers->owner);
+		return rv;
+	}
+	if (!pers->takeover) {
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+		       mdname(mddev), level);
+		return -EINVAL;
+	}
+
+	/* ->takeover must set new_* and/or delta_disks
+	 * if it succeeds, and may set them when it fails.
+	 */
+	priv = pers->takeover(mddev);
+	if (IS_ERR(priv)) {
+		mddev->new_level = mddev->level;
+		mddev->new_layout = mddev->layout;
+		mddev->new_chunk = mddev->chunk_size;
+		mddev->raid_disks -= mddev->delta_disks;
+		mddev->delta_disks = 0;
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s would not accept array\n",
+		       mdname(mddev), level);
+		return PTR_ERR(priv);
+	}
+
+	/* Looks like we have a winner */
+	mddev_suspend(mddev);
+	mddev->pers->stop(mddev);
+	module_put(mddev->pers->owner);
+	mddev->pers = pers;
+	mddev->private = priv;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+	mddev->level = mddev->new_level;
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_size = mddev->new_chunk;
+	mddev->delta_disks = 0;
+	pers->run(mddev);
+	mddev_resume(mddev);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
 	return rv;
 }
 
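The rewritten level_store() turns the sysfs level attribute (e.g. writing "raid5" into md/level on an active array) into an online conversion: it loads and pins the target personality module, asks it to ->takeover() the array while the old personality is still intact, and only once that succeeds does it suspend, stop and swap. A compressed sketch of that probe-then-swap ordering and its unwind path, with struct and function names invented for illustration:

    #include <stdio.h>

    struct personality {
            const char *name;
            void *(*takeover)(void);   /* build private state for the new level */
            void (*stop)(void);
            int  (*run)(void *priv);
    };

    static void *raid5_takeover(void) { static int priv; return &priv; }
    static void  raid0_stop(void)     { puts("raid0 stopped"); }
    static int   raid5_run(void *p)   { (void)p; puts("raid5 running"); return 0; }

    /* Order matters: probe the new personality first; the old array is only
     * torn down once ->takeover has produced a valid private context.  This
     * is why the failure path in level_store() merely resets new_level,
     * new_layout, new_chunk and delta_disks, then returns. */
    static int change_level(struct personality *old, struct personality *new)
    {
            void *priv = new->takeover();      /* 1: may fail, nothing changed */
            if (!priv) {
                    fprintf(stderr, "%s would not accept array\n", new->name);
                    return -1;                 /* unwind: old personality intact */
            }
            old->stop();                       /* 2: point of no return */
            return new->run(priv);             /* 3: restart under the new level */
    }

    int main(void)
    {
            struct personality raid0 = { "raid0", 0, raid0_stop, 0 };
            struct personality raid5 = { "raid5", raid5_takeover, 0, raid5_run };
            return change_level(&raid0, &raid5);
    }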
@@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;
 
-	if (mddev->pers)
-		return -EBUSY;
-	if (mddev->reshape_position != MaxSector)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
+			return -EBUSY;
+		err = mddev->pers->reconfig(mddev, n, -1);
+		if (err)
+			return err;
+	} else {
 		mddev->new_layout = n;
-	else
+		if (mddev->reshape_position == MaxSector)
 			mddev->layout = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_layout =
@@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page)
 static ssize_t
 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 {
-	/* can only set chunk_size if array is not yet active */
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);
 
 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;
 
-	if (mddev->pers)
-		return -EBUSY;
-	else if (mddev->reshape_position != MaxSector)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
+			return -EBUSY;
+		err = mddev->pers->reconfig(mddev, -1, n);
+		if (err)
+			return err;
+	} else {
 		mddev->new_chunk = n;
-	else
+		if (mddev->reshape_position == MaxSector)
 			mddev->chunk_size = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_chunk_size =
@@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 static ssize_t
 resync_start_show(mddev_t *mddev, char *page)
 {
+	if (mddev->recovery_cp == MaxSector)
+		return sprintf(page, "none\n");
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
 }
 
@@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page)
 	else {
 		if (list_empty(&mddev->disks) &&
 		    mddev->raid_disks == 0 &&
-		    mddev->size == 0)
+		    mddev->dev_sectors == 0)
 			st = clear;
 		else
 			st = inactive;
@@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 static ssize_t
 size_show(mddev_t *mddev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+	return sprintf(page, "%llu\n",
+		(unsigned long long)mddev->dev_sectors / 2);
 }
 
 static int update_size(mddev_t *mddev, sector_t num_sectors);
@@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 	 * not increase it (except from 0).
 	 * If array is active, we can try an on-line resize
 	 */
-	char *e;
-	int err = 0;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
-	if (!*buf || *buf == '\n' ||
-	    (*e && *e != '\n'))
-		return -EINVAL;
+	sector_t sectors;
+	int err = strict_blocks_to_sectors(buf, &sectors);
 
+	if (err < 0)
+		return err;
 	if (mddev->pers) {
-		err = update_size(mddev, size * 2);
+		err = update_size(mddev, sectors);
 		md_update_sb(mddev, 1);
 	} else {
-		if (mddev->size == 0 ||
-		    mddev->size > size)
-			mddev->size = size;
+		if (mddev->dev_sectors == 0 ||
+		    mddev->dev_sectors > sectors)
+			mddev->dev_sectors = sectors;
 		else
 			err = -ENOSPC;
 	}
@@ -3251,6 +3466,8 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
+	if (mddev->curr_resync == 0)
+		return sprintf(page, "none\n");
 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
 	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-	unsigned long max_blocks, resync;
+	unsigned long max_sectors, resync;
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-		max_blocks = mddev->resync_max_sectors;
+		max_sectors = mddev->resync_max_sectors;
 	else
-		max_blocks = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;
 
 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
-	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
 }
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
        reshape_position_store);
 
+static ssize_t
+array_size_show(mddev_t *mddev, char *page)
+{
+	if (mddev->external_size)
+		return sprintf(page, "%llu\n",
+			       (unsigned long long)mddev->array_sectors/2);
+	else
+		return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	sector_t sectors;
+
+	if (strncmp(buf, "default", 7) == 0) {
+		if (mddev->pers)
+			sectors = mddev->pers->size(mddev, 0, 0);
+		else
+			sectors = mddev->array_sectors;
+
+		mddev->external_size = 0;
+	} else {
+		if (strict_blocks_to_sectors(buf, &sectors) < 0)
+			return -EINVAL;
+		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+			return -EINVAL;
+
+		mddev->external_size = 1;
+	}
+
+	mddev->array_sectors = sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
+	if (mddev->pers) {
+		struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
+
+		if (bdev) {
+			mutex_lock(&bdev->bd_inode->i_mutex);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
+			bdput(bdev);
+		}
+	}
+
+	return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+       array_size_store);
 
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_safe_delay.attr,
 	&md_array_state.attr,
 	&md_reshape_position.attr,
+	&md_array_size.attr,
 	NULL,
 };
 
@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
 		mddev_put(mddev);
 		return -ENOMEM;
 	}
+	mddev->queue->queuedata = mddev;
+
 	/* Can be unlocked because the queue is new: no concurrency */
 	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
 
-	blk_queue_make_request(mddev->queue, md_fail_request);
+	blk_queue_make_request(mddev->queue, md_make_request);
 
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
@@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev)
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (test_bit(Faulty, &rdev->flags))
 				continue;
-			if (rdev->size < chunk_size / 1024) {
+			if (rdev->sectors < chunk_size / 512) {
 				printk(KERN_WARNING
 					"md: Dev %s smaller than chunk_size:"
-					" %lluk < %dk\n",
+					" %llu < %d\n",
 					bdevname(rdev->bdev,b),
-					(unsigned long long)rdev->size,
-					chunk_size / 1024);
+					(unsigned long long)rdev->sectors,
+					chunk_size / 512);
 				return -EINVAL;
 			}
 		}
@@ -3761,11 +4032,11 @@
 
 	/* perform some consistency tests on the device.
 	 * We don't want the data to overlap the metadata,
-	 * Internal Bitmap issues has handled elsewhere.
+	 * Internal Bitmap issues have been handled elsewhere.
 	 */
 	if (rdev->data_offset < rdev->sb_start) {
-		if (mddev->size &&
-		    rdev->data_offset + mddev->size*2
+		if (mddev->dev_sectors &&
+		    rdev->data_offset + mddev->dev_sectors
 		    > rdev->sb_start) {
 			printk("md: %s: data overlaps metadata\n",
 			       mdname(mddev));
@@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev)
 	}
 	mddev->pers = pers;
 	spin_unlock(&pers_lock);
-	mddev->level = pers->level;
+	if (mddev->level != pers->level) {
+		mddev->level = pers->level;
+		mddev->new_level = pers->level;
+	}
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
+	if (pers->level >= 4 && pers->level <= 6)
+		/* Cannot support integrity (yet) */
+		blk_integrity_unregister(mddev->gendisk);
+
 	if (mddev->reshape_position != MaxSector &&
 	    pers->start_reshape == NULL) {
 		/* This personality cannot handle reshaping... */
@@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 	mddev->recovery = 0;
-	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+	/* may be over-ridden by personality */
+	mddev->resync_max_sectors = mddev->dev_sectors;
+
 	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
@@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev)
 	err = mddev->pers->run(mddev);
 	if (err)
 		printk(KERN_ERR "md: pers->run() failed ...\n");
-	else if (mddev->pers->sync_request) {
+	else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
+			  " but 'external_size' not in effect?\n", __func__);
+		printk(KERN_ERR
+		       "md: invalid array_size %llu > default size %llu\n",
+		       (unsigned long long)mddev->array_sectors / 2,
+		       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+		err = -EINVAL;
+		mddev->pers->stop(mddev);
+	}
+	if (err == 0 && mddev->pers->sync_request) {
 		err = bitmap_create(mddev);
 		if (err) {
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev)
 
 	set_capacity(disk, mddev->array_sectors);
 
-	/* If we call blk_queue_make_request here, it will
-	 * re-initialise max_sectors etc which may have been
-	 * refined inside -> run.  So just set the bits we need to set.
-	 * Most initialisation happended when we called
-	 * blk_queue_make_request(..., md_fail_request)
-	 * earlier.
-	 */
-	mddev->queue->queuedata = mddev;
-	mddev->queue->make_request_fn = mddev->pers->make_request;
-
 	/* If there is a partially-recovered drive we need to
 	 * start recovery here.  If we leave it to md_check_recovery,
 	 * it will remove the drives and not do the right thing
@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			md_super_wait(mddev);
 			if (mddev->ro)
 				set_disk_ro(disk, 0);
-			blk_queue_make_request(mddev->queue, md_fail_request);
+
 			mddev->pers->stop(mddev);
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		export_array(mddev);
 
 		mddev->array_sectors = 0;
-		mddev->size = 0;
+		mddev->external_size = 0;
+		mddev->dev_sectors = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
 		mddev->resync_min = 0;
@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
 			mdname(mddev));
 		err = 0;
+		blk_integrity_unregister(disk);
 		md_new_event(mddev);
 		sysfs_notify_dirent(mddev->sysfs_state);
 out:
@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	info.patch_version = MD_PATCHLEVEL_VERSION;
 	info.ctime = mddev->ctime;
 	info.level = mddev->level;
-	info.size = mddev->size;
-	if (info.size != mddev->size) /* overflow */
+	info.size = mddev->dev_sectors / 2;
+	if (info.size != mddev->dev_sectors / 2) /* overflow */
 		info.size = -1;
 	info.nr_disks = nr;
 	info.raid_disks = mddev->raid_disks;
@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
+		else
+			clear_bit(WriteMostly, &rdev->flags);
 
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else
 			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+		rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	else
 		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING
@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 
 	mddev->level = info->level;
 	mddev->clevel[0] = 0;
-	mddev->size = info->size;
+	mddev->dev_sectors = 2 * (sector_t)info->size;
 	mddev->raid_disks = info->raid_disks;
 	/* don't set md_minor, it is determined by which /dev/md* was
 	 * openned
@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
+void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
+{
+	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+
+	if (mddev->external_size)
+		return;
+
+	mddev->array_sectors = array_sectors;
+}
+EXPORT_SYMBOL(md_set_array_sectors);
+
 static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
 	mdk_rdev_t *rdev;
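md_set_array_sectors() centralises ->array_sectors updates: a personality may propose its computed default size at any time, but once userspace has pinned an explicit size through the new array_size attribute (external_size != 0), the proposal is ignored. A user-space sketch of that policy, with an assert standing in for the WARN on an unlocked mddev:

    #include <assert.h>
    #include <stdio.h>

    struct mddev {
            int locked;                        /* is reconfig_mutex held? */
            int external_size;                 /* set by array_size_store() */
            unsigned long long array_sectors;
    };

    static void md_set_array_sectors(struct mddev *m, unsigned long long sectors)
    {
            assert(m->locked);                 /* WARN(!mddev_is_locked(...)) */
            if (m->external_size)
                    return;                    /* userspace override wins */
            m->array_sectors = sectors;
    }

    int main(void)
    {
            struct mddev m = { .locked = 1, .external_size = 0, .array_sectors = 0 };
            md_set_array_sectors(&m, 2048);    /* default size applies */
            m.external_size = 1;               /* echo <blocks> > md/array_size */
            md_set_array_sectors(&m, 4096);    /* ignored: size is pinned */
            printf("%llu\n", m.array_sectors); /* prints 2048 */
            return 0;
    }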
@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 	 */
 		return -EBUSY;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		sector_t avail;
-		avail = rdev->size * 2;
+		sector_t avail = rdev->sectors;
 
 		if (fit && (num_sectors == 0 || num_sectors > avail))
 			num_sectors = avail;
@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		)
 		return -EINVAL;
 	/* Check there is only one change */
-	if (info->size >= 0 && mddev->size != info->size) cnt++;
-	if (mddev->raid_disks != info->raid_disks) cnt++;
-	if (mddev->layout != info->layout) cnt++;
-	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
-	if (cnt == 0) return 0;
-	if (cnt > 1) return -EINVAL;
+	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
+		cnt++;
+	if (mddev->raid_disks != info->raid_disks)
+		cnt++;
+	if (mddev->layout != info->layout)
+		cnt++;
+	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
+		cnt++;
+	if (cnt == 0)
+		return 0;
+	if (cnt > 1)
+		return -EINVAL;
 
 	if (mddev->layout != info->layout) {
 		/* Change layout
@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		else
 			return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
-	if (info->size >= 0 && mddev->size != info->size)
+	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
 		rv = update_size(mddev, (sector_t)info->size * 2);
 
 	if (mddev->raid_disks != info->raid_disks)
@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 
 void md_unregister_thread(mdk_thread_t *thread)
 {
+	if (!thread)
+		return;
 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 
 	kthread_stop(thread->tsk);
@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_blocks = mddev->resync_max_sectors >> 1;
 	else
-		max_blocks = mddev->size;
+		max_blocks = mddev->dev_sectors / 2;
 
 	/*
 	 * Should not happen.
@@ -5537,7 +5839,7 @@ struct mdstat_info {
 static int md_seq_show(struct seq_file *seq, void *v)
 {
 	mddev_t *mddev = v;
-	sector_t size;
+	sector_t sectors;
 	mdk_rdev_t *rdev;
 	struct mdstat_info *mi = seq->private;
 	struct bitmap *bitmap;
@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			seq_printf(seq, " %s", mddev->pers->name);
 	}
 
-	size = 0;
+	sectors = 0;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		char b[BDEVNAME_SIZE];
 		seq_printf(seq, " %s[%d]",
@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			continue;
 		} else if (rdev->raid_disk < 0)
 			seq_printf(seq, "(S)"); /* spare */
-		size += rdev->size;
+		sectors += rdev->sectors;
 	}
 
 	if (!list_empty(&mddev->disks)) {
@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 				mddev->array_sectors / 2);
 		else
 			seq_printf(seq, "\n %llu blocks",
-				(unsigned long long)size);
+				(unsigned long long)sectors / 2);
 	}
 	if (mddev->persistent) {
 		if (mddev->major_version != 0 ||
@@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p)
 	return 0;
 }
 
-static int is_mddev_idle(mddev_t *mddev)
+static int is_mddev_idle(mddev_t *mddev, int init)
 {
 	mdk_rdev_t * rdev;
 	int idle;
-	long curr_events;
+	int curr_events;
 
 	idle = 1;
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-		curr_events = part_stat_read(&disk->part0, sectors[0]) +
-				part_stat_read(&disk->part0, sectors[1]) -
+		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+			      (int)part_stat_read(&disk->part0, sectors[1]) -
 				atomic_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
 		 * disk_stats is counted when it completes.
@@ -5757,7 +6059,7 @@
		 * always make curr_events less than last_events.
		 *
		 */
-		if (curr_events - rdev->last_events > 4096) {
+		if (init || curr_events - rdev->last_events > 64) {
 			rdev->last_events = curr_events;
 			idle = 0;
 		}
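Two related changes land in is_mddev_idle(): an init pass that only primes last_events, and curr_events narrowed from long to int, apparently so the comparison against the atomic (int-sized) sync_io counter stays wrap-safe under plain signed subtraction, the same idiom as time_after(). A small demonstration of why the signed 32-bit difference survives counter wrap-around, using the hunk's threshold of 64:

    #include <stdio.h>

    static int io_advanced(unsigned int curr, unsigned int last, int thresh)
    {
            return (int)(curr - last) > thresh;   /* wrap-safe signed difference */
    }

    int main(void)
    {
            unsigned int last = 0xffffffe0u;      /* just before 32-bit wrap */
            unsigned int curr = 0x00000050u;      /* after wrap: 112 events later */
            printf("%d\n", io_advanced(curr, last, 64));   /* prints 1 */
            return 0;
    }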
@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev)
 			j = mddev->recovery_cp;
 
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-		max_sectors = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;
 	else {
 		/* recovery follows the physical size of devices */
-		max_sectors = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;
 		j = MaxSector;
 		list_for_each_entry(rdev, &mddev->disks, same_set)
 			if (rdev->raid_disk >= 0 &&
@@ -6000,7 +6302,7 @@
 		"(but not more than %d KB/sec) for %s.\n",
 		speed_max(mddev), desc);
 
-	is_mddev_idle(mddev); /* this also initializes IO event counters */
+	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
 
 	io_sectors = 0;
 	for (m = 0; m < SYNC_MARKS; m++) {
@@ -6040,6 +6342,18 @@
 		}
 		if (kthread_should_stop())
 			goto interrupted;
+
+		if (mddev->curr_resync > mddev->curr_resync_completed &&
+		    (mddev->curr_resync - mddev->curr_resync_completed)
+		    > (max_sectors >> 4)) {
+			/* time to update curr_resync_completed */
+			blk_unplug(mddev->queue);
+			wait_event(mddev->recovery_wait,
+				   atomic_read(&mddev->recovery_active) == 0);
+			mddev->curr_resync_completed =
+				mddev->curr_resync;
+			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+		}
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
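The new block above checkpoints resync progress: curr_resync_completed is advanced, and the superblock marked for an update, only after a further 1/16th of max_sectors has been synced, so restart points get persisted without touching the superblock on every window. The same throttle in isolation (the 100 MiB figure is arbitrary, chosen divisible by 100 for the demo):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long max_sectors = 100ULL << 20;   /* arbitrary total */
            unsigned long long completed = 0, curr;

            /* simulate progress reports arriving every 1% of the sync */
            for (curr = 0; curr <= max_sectors; curr += max_sectors / 100)
                    if (curr > completed &&
                        curr - completed > (max_sectors >> 4)) {
                            completed = curr;   /* checkpoint + superblock update */
                            printf("checkpoint at %llu%%\n",
                                   curr * 100 / max_sectors);
                    }
            return 0;   /* prints 7%, 14%, 21%, ... - one update per ~1/16th */
    }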
@@ -6102,7 +6416,7 @@
 
 		if (currspeed > speed_min(mddev)) {
 			if ((currspeed > speed_max(mddev)) ||
-					!is_mddev_idle(mddev)) {
+					!is_mddev_idle(mddev, 0)) {
 				msleep(500);
 				goto repeat;
 			}
@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev)
 	mdk_rdev_t *rdev;
 	int spares = 0;
 
+	mddev->curr_resync_completed = 0;
+
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Blocked, &rdev->flags) &&
@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev)
 					sysfs_notify(&mddev->kobj, NULL,
 						     "degraded");
 			}
+			if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+			    mddev->pers->finish_reshape)
+				mddev->pers->finish_reshape(mddev);
 			md_update_sb(mddev, 1);
 
 			/* if array is no-longer degraded, then any saved_raid_disk
@@ -6470,13 +6789,13 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-	if (register_blkdev(MAJOR_NR, "md"))
+	if (register_blkdev(MD_MAJOR, "md"))
 		return -1;
 	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
-		unregister_blkdev(MAJOR_NR, "md");
+		unregister_blkdev(MD_MAJOR, "md");
 		return -1;
 	}
-	blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+	blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
 			    md_probe, NULL, NULL);
 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
 			    md_probe, NULL, NULL);
@@ -6562,10 +6881,10 @@ static __exit void md_exit(void)
 	mddev_t *mddev;
 	struct list_head *tmp;
 
-	blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
 
-	unregister_blkdev(MAJOR_NR,"md");
+	unregister_blkdev(MD_MAJOR,"md");
 	unregister_blkdev(mdp_major, "mdp");
 	unregister_reboot_notifier(&md_notifier);
 	unregister_sysctl_table(raid_table_header);