about summary refs log tree commit diff stats
path: root/drivers
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-05-21 18:49:14 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-05-21 18:49:14 -0400
commit 6e4513972a5ad28517477d21f301a02ac7a0df76 (patch)
tree c88abe8297ea3971f722e0787842fb790d14767a /drivers
parent d79df0b1eda0099a22cbcece01ce5e7d222450de (diff)
parent 19fdb9eefb21b72edbc365b838502780c392bad6 (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (45 commits)
  md: don't insist on valid event count for spare devices.
  md: simplify updating of event count to sometimes avoid updating spares.
  md/raid6: Fix raid-6 read-error correction in degraded state
  md: restore ability of spare drives to spin down.
  md: Fix read balancing in RAID1 and RAID10 on drives > 2TB
  md/linear: standardise all printk messages
  md/raid0: tidy up printk messages.
  md/raid10: tidy up printk messages.
  md/raid1: improve printk messages
  md/raid5: improve consistency of error messages.
  md: remove EXPERIMENTAL designation from RAID10
  md: allow integers to be passed to md/level
  md: notify mdstat waiters of level change
  md/raid4: permit raid0 takeover
  md/raid1: delay reads that could overtake behind-writes.
  md/raid1: fix confusing 'redirect sector' message.
  md: don't unregister the thread in mddev_suspend
  md: factor out init code for an mddev
  md: pass mddev to make_request functions rather than request_queue
  md: call md_stop_writes from md_stop
  ...
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/md/Kconfig | 4
-rw-r--r-- drivers/md/bitmap.c | 41
-rw-r--r-- drivers/md/bitmap.h | 2
-rw-r--r-- drivers/md/faulty.c | 9
-rw-r--r-- drivers/md/linear.c | 36
-rw-r--r-- drivers/md/md.c | 537
-rw-r--r-- drivers/md/md.h | 16
-rw-r--r-- drivers/md/multipath.c | 13
-rw-r--r-- drivers/md/raid0.c | 251
-rw-r--r-- drivers/md/raid0.h | 3
-rw-r--r-- drivers/md/raid1.c | 114
-rw-r--r-- drivers/md/raid10.c | 300
-rw-r--r-- drivers/md/raid10.h | 12
-rw-r--r-- drivers/md/raid5.c | 231
14 files changed, 987 insertions, 582 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index acb3a4e404ff..4a6feac8c94a 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -100,8 +100,8 @@ config MD_RAID1
100 If unsure, say Y. 100 If unsure, say Y.
101 101
102config MD_RAID10 102config MD_RAID10
103 tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" 103 tristate "RAID-10 (mirrored striping) mode"
104 depends on BLK_DEV_MD && EXPERIMENTAL 104 depends on BLK_DEV_MD
105 ---help--- 105 ---help---
106 RAID-10 provides a combination of striping (RAID-0) and 106 RAID-10 provides a combination of striping (RAID-0) and
107 mirroring (RAID-1) with easier configuration and more flexible 107 mirroring (RAID-1) with easier configuration and more flexible
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f084249295d9..23d1d54b12a4 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -505,7 +505,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
505 return; 505 return;
506 } 506 }
507 spin_unlock_irqrestore(&bitmap->lock, flags); 507 spin_unlock_irqrestore(&bitmap->lock, flags);
508 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 508 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
509 sb->events = cpu_to_le64(bitmap->mddev->events); 509 sb->events = cpu_to_le64(bitmap->mddev->events);
510 if (bitmap->mddev->events < bitmap->events_cleared) { 510 if (bitmap->mddev->events < bitmap->events_cleared) {
511 /* rocking back to read-only */ 511 /* rocking back to read-only */
@@ -526,7 +526,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
526 526
527 if (!bitmap || !bitmap->sb_page) 527 if (!bitmap || !bitmap->sb_page)
528 return; 528 return;
529 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 529 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
530 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 530 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
531 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 531 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
532 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 532 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -575,7 +575,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
575 return err; 575 return err;
576 } 576 }
577 577
578 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 578 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
579 579
580 chunksize = le32_to_cpu(sb->chunksize); 580 chunksize = le32_to_cpu(sb->chunksize);
581 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; 581 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -661,7 +661,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
661 return 0; 661 return 0;
662 } 662 }
663 spin_unlock_irqrestore(&bitmap->lock, flags); 663 spin_unlock_irqrestore(&bitmap->lock, flags);
664 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 664 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
665 old = le32_to_cpu(sb->state) & bits; 665 old = le32_to_cpu(sb->state) & bits;
666 switch (op) { 666 switch (op) {
667 case MASK_SET: sb->state |= cpu_to_le32(bits); 667 case MASK_SET: sb->state |= cpu_to_le32(bits);
@@ -1292,9 +1292,14 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1292 if (!bitmap) return 0; 1292 if (!bitmap) return 0;
1293 1293
1294 if (behind) { 1294 if (behind) {
1295 int bw;
1295 atomic_inc(&bitmap->behind_writes); 1296 atomic_inc(&bitmap->behind_writes);
1297 bw = atomic_read(&bitmap->behind_writes);
1298 if (bw > bitmap->behind_writes_used)
1299 bitmap->behind_writes_used = bw;
1300
1296 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", 1301 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
1297 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); 1302 bw, bitmap->max_write_behind);
1298 } 1303 }
1299 1304
1300 while (sectors) { 1305 while (sectors) {
@@ -1351,7 +1356,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1351{ 1356{
1352 if (!bitmap) return; 1357 if (!bitmap) return;
1353 if (behind) { 1358 if (behind) {
1354 atomic_dec(&bitmap->behind_writes); 1359 if (atomic_dec_and_test(&bitmap->behind_writes))
1360 wake_up(&bitmap->behind_wait);
1355 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", 1361 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
1356 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); 1362 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1357 } 1363 }
@@ -1675,6 +1681,7 @@ int bitmap_create(mddev_t *mddev)
1675 atomic_set(&bitmap->pending_writes, 0); 1681 atomic_set(&bitmap->pending_writes, 0);
1676 init_waitqueue_head(&bitmap->write_wait); 1682 init_waitqueue_head(&bitmap->write_wait);
1677 init_waitqueue_head(&bitmap->overflow_wait); 1683 init_waitqueue_head(&bitmap->overflow_wait);
1684 init_waitqueue_head(&bitmap->behind_wait);
1678 1685
1679 bitmap->mddev = mddev; 1686 bitmap->mddev = mddev;
1680 1687
@@ -2006,6 +2013,27 @@ static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
2006static struct md_sysfs_entry bitmap_can_clear = 2013static struct md_sysfs_entry bitmap_can_clear =
2007__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); 2014__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2008 2015
2016static ssize_t
2017behind_writes_used_show(mddev_t *mddev, char *page)
2018{
2019 if (mddev->bitmap == NULL)
2020 return sprintf(page, "0\n");
2021 return sprintf(page, "%lu\n",
2022 mddev->bitmap->behind_writes_used);
2023}
2024
2025static ssize_t
2026behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len)
2027{
2028 if (mddev->bitmap)
2029 mddev->bitmap->behind_writes_used = 0;
2030 return len;
2031}
2032
2033static struct md_sysfs_entry max_backlog_used =
2034__ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2035 behind_writes_used_show, behind_writes_used_reset);
2036
2009static struct attribute *md_bitmap_attrs[] = { 2037static struct attribute *md_bitmap_attrs[] = {
2010 &bitmap_location.attr, 2038 &bitmap_location.attr,
2011 &bitmap_timeout.attr, 2039 &bitmap_timeout.attr,
@@ -2013,6 +2041,7 @@ static struct attribute *md_bitmap_attrs[] = {
2013 &bitmap_chunksize.attr, 2041 &bitmap_chunksize.attr,
2014 &bitmap_metadata.attr, 2042 &bitmap_metadata.attr,
2015 &bitmap_can_clear.attr, 2043 &bitmap_can_clear.attr,
2044 &max_backlog_used.attr,
2016 NULL 2045 NULL
2017}; 2046};
2018struct attribute_group md_bitmap_group = { 2047struct attribute_group md_bitmap_group = {
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index cb821d76d1b4..3797dea4723a 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -227,6 +227,7 @@ struct bitmap {
227 int allclean; 227 int allclean;
228 228
229 atomic_t behind_writes; 229 atomic_t behind_writes;
230 unsigned long behind_writes_used; /* highest actual value at runtime */
230 231
231 /* 232 /*
232 * the bitmap daemon - periodically wakes up and sweeps the bitmap 233 * the bitmap daemon - periodically wakes up and sweeps the bitmap
@@ -239,6 +240,7 @@ struct bitmap {
239 atomic_t pending_writes; /* pending writes to the bitmap file */ 240 atomic_t pending_writes; /* pending writes to the bitmap file */
240 wait_queue_head_t write_wait; 241 wait_queue_head_t write_wait;
241 wait_queue_head_t overflow_wait; 242 wait_queue_head_t overflow_wait;
243 wait_queue_head_t behind_wait;
242 244
243 struct sysfs_dirent *sysfs_can_clear; 245 struct sysfs_dirent *sysfs_can_clear;
244}; 246};
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 8e3850b98cca..1a8987884614 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -169,10 +169,9 @@ static void add_sector(conf_t *conf, sector_t start, int mode)
169 conf->nfaults = n+1; 169 conf->nfaults = n+1;
170} 170}
171 171
172static int make_request(struct request_queue *q, struct bio *bio) 172static int make_request(mddev_t *mddev, struct bio *bio)
173{ 173{
174 mddev_t *mddev = q->queuedata; 174 conf_t *conf = mddev->private;
175 conf_t *conf = (conf_t*)mddev->private;
176 int failit = 0; 175 int failit = 0;
177 176
178 if (bio_data_dir(bio) == WRITE) { 177 if (bio_data_dir(bio) == WRITE) {
@@ -225,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio)
225 224
226static void status(struct seq_file *seq, mddev_t *mddev) 225static void status(struct seq_file *seq, mddev_t *mddev)
227{ 226{
228 conf_t *conf = (conf_t*)mddev->private; 227 conf_t *conf = mddev->private;
229 int n; 228 int n;
230 229
231 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) 230 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
@@ -328,7 +327,7 @@ static int run(mddev_t *mddev)
328 327
329static int stop(mddev_t *mddev) 328static int stop(mddev_t *mddev)
330{ 329{
331 conf_t *conf = (conf_t *)mddev->private; 330 conf_t *conf = mddev->private;
332 331
333 kfree(conf); 332 kfree(conf);
334 mddev->private = NULL; 333 mddev->private = NULL;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 09437e958235..7e0e057db9a7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -159,7 +159,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
159 sector_t sectors; 159 sector_t sectors;
160 160
161 if (j < 0 || j >= raid_disks || disk->rdev) { 161 if (j < 0 || j >= raid_disks || disk->rdev) {
162 printk("linear: disk numbering problem. Aborting!\n"); 162 printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
163 mdname(mddev));
163 goto out; 164 goto out;
164 } 165 }
165 166
@@ -187,7 +188,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
187 188
188 } 189 }
189 if (cnt != raid_disks) { 190 if (cnt != raid_disks) {
190 printk("linear: not enough drives present. Aborting!\n"); 191 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
192 mdname(mddev));
191 goto out; 193 goto out;
192 } 194 }
193 195
@@ -282,29 +284,21 @@ static int linear_stop (mddev_t *mddev)
282 rcu_barrier(); 284 rcu_barrier();
283 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 285 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
284 kfree(conf); 286 kfree(conf);
287 mddev->private = NULL;
285 288
286 return 0; 289 return 0;
287} 290}
288 291
289static int linear_make_request (struct request_queue *q, struct bio *bio) 292static int linear_make_request (mddev_t *mddev, struct bio *bio)
290{ 293{
291 const int rw = bio_data_dir(bio);
292 mddev_t *mddev = q->queuedata;
293 dev_info_t *tmp_dev; 294 dev_info_t *tmp_dev;
294 sector_t start_sector; 295 sector_t start_sector;
295 int cpu;
296 296
297 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 297 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
298 md_barrier_request(mddev, bio); 298 md_barrier_request(mddev, bio);
299 return 0; 299 return 0;
300 } 300 }
301 301
302 cpu = part_stat_lock();
303 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
304 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
305 bio_sectors(bio));
306 part_stat_unlock();
307
308 rcu_read_lock(); 302 rcu_read_lock();
309 tmp_dev = which_dev(mddev, bio->bi_sector); 303 tmp_dev = which_dev(mddev, bio->bi_sector);
310 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; 304 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
@@ -314,12 +308,14 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
314 || (bio->bi_sector < start_sector))) { 308 || (bio->bi_sector < start_sector))) {
315 char b[BDEVNAME_SIZE]; 309 char b[BDEVNAME_SIZE];
316 310
317 printk("linear_make_request: Sector %llu out of bounds on " 311 printk(KERN_ERR
318 "dev %s: %llu sectors, offset %llu\n", 312 "md/linear:%s: make_request: Sector %llu out of bounds on "
319 (unsigned long long)bio->bi_sector, 313 "dev %s: %llu sectors, offset %llu\n",
320 bdevname(tmp_dev->rdev->bdev, b), 314 mdname(mddev),
321 (unsigned long long)tmp_dev->rdev->sectors, 315 (unsigned long long)bio->bi_sector,
322 (unsigned long long)start_sector); 316 bdevname(tmp_dev->rdev->bdev, b),
317 (unsigned long long)tmp_dev->rdev->sectors,
318 (unsigned long long)start_sector);
323 rcu_read_unlock(); 319 rcu_read_unlock();
324 bio_io_error(bio); 320 bio_io_error(bio);
325 return 0; 321 return 0;
@@ -336,9 +332,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
336 332
337 bp = bio_split(bio, end_sector - bio->bi_sector); 333 bp = bio_split(bio, end_sector - bio->bi_sector);
338 334
339 if (linear_make_request(q, &bp->bio1)) 335 if (linear_make_request(mddev, &bp->bio1))
340 generic_make_request(&bp->bio1); 336 generic_make_request(&bp->bio1);
341 if (linear_make_request(q, &bp->bio2)) 337 if (linear_make_request(mddev, &bp->bio2))
342 generic_make_request(&bp->bio2); 338 generic_make_request(&bp->bio2);
343 bio_pair_release(bp); 339 bio_pair_release(bp);
344 return 0; 340 return 0;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a9fd491796ac..46b3a044eadf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,8 +215,11 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
215 */ 215 */
216static int md_make_request(struct request_queue *q, struct bio *bio) 216static int md_make_request(struct request_queue *q, struct bio *bio)
217{ 217{
218 const int rw = bio_data_dir(bio);
218 mddev_t *mddev = q->queuedata; 219 mddev_t *mddev = q->queuedata;
219 int rv; 220 int rv;
221 int cpu;
222
220 if (mddev == NULL || mddev->pers == NULL) { 223 if (mddev == NULL || mddev->pers == NULL) {
221 bio_io_error(bio); 224 bio_io_error(bio);
222 return 0; 225 return 0;
@@ -237,13 +240,27 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
237 } 240 }
238 atomic_inc(&mddev->active_io); 241 atomic_inc(&mddev->active_io);
239 rcu_read_unlock(); 242 rcu_read_unlock();
240 rv = mddev->pers->make_request(q, bio); 243
244 rv = mddev->pers->make_request(mddev, bio);
245
246 cpu = part_stat_lock();
247 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
248 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
249 bio_sectors(bio));
250 part_stat_unlock();
251
241 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 252 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
242 wake_up(&mddev->sb_wait); 253 wake_up(&mddev->sb_wait);
243 254
244 return rv; 255 return rv;
245} 256}
246 257
258/* mddev_suspend makes sure no new requests are submitted
259 * to the device, and that any requests that have been submitted
260 * are completely handled.
261 * Once ->stop is called and completes, the module will be completely
262 * unused.
263 */
247static void mddev_suspend(mddev_t *mddev) 264static void mddev_suspend(mddev_t *mddev)
248{ 265{
249 BUG_ON(mddev->suspended); 266 BUG_ON(mddev->suspended);
@@ -251,13 +268,6 @@ static void mddev_suspend(mddev_t *mddev)
251 synchronize_rcu(); 268 synchronize_rcu();
252 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 269 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
253 mddev->pers->quiesce(mddev, 1); 270 mddev->pers->quiesce(mddev, 1);
254 md_unregister_thread(mddev->thread);
255 mddev->thread = NULL;
256 /* we now know that no code is executing in the personality module,
257 * except possibly the tail end of a ->bi_end_io function, but that
258 * is certain to complete before the module has a chance to get
259 * unloaded
260 */
261} 271}
262 272
263static void mddev_resume(mddev_t *mddev) 273static void mddev_resume(mddev_t *mddev)
@@ -344,7 +354,7 @@ static void md_submit_barrier(struct work_struct *ws)
344 bio_endio(bio, 0); 354 bio_endio(bio, 0);
345 else { 355 else {
346 bio->bi_rw &= ~(1<<BIO_RW_BARRIER); 356 bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
347 if (mddev->pers->make_request(mddev->queue, bio)) 357 if (mddev->pers->make_request(mddev, bio))
348 generic_make_request(bio); 358 generic_make_request(bio);
349 mddev->barrier = POST_REQUEST_BARRIER; 359 mddev->barrier = POST_REQUEST_BARRIER;
350 submit_barriers(mddev); 360 submit_barriers(mddev);
@@ -406,6 +416,27 @@ static void mddev_put(mddev_t *mddev)
406 spin_unlock(&all_mddevs_lock); 416 spin_unlock(&all_mddevs_lock);
407} 417}
408 418
419static void mddev_init(mddev_t *mddev)
420{
421 mutex_init(&mddev->open_mutex);
422 mutex_init(&mddev->reconfig_mutex);
423 mutex_init(&mddev->bitmap_info.mutex);
424 INIT_LIST_HEAD(&mddev->disks);
425 INIT_LIST_HEAD(&mddev->all_mddevs);
426 init_timer(&mddev->safemode_timer);
427 atomic_set(&mddev->active, 1);
428 atomic_set(&mddev->openers, 0);
429 atomic_set(&mddev->active_io, 0);
430 spin_lock_init(&mddev->write_lock);
431 atomic_set(&mddev->flush_pending, 0);
432 init_waitqueue_head(&mddev->sb_wait);
433 init_waitqueue_head(&mddev->recovery_wait);
434 mddev->reshape_position = MaxSector;
435 mddev->resync_min = 0;
436 mddev->resync_max = MaxSector;
437 mddev->level = LEVEL_NONE;
438}
439
409static mddev_t * mddev_find(dev_t unit) 440static mddev_t * mddev_find(dev_t unit)
410{ 441{
411 mddev_t *mddev, *new = NULL; 442 mddev_t *mddev, *new = NULL;
@@ -472,23 +503,7 @@ static mddev_t * mddev_find(dev_t unit)
472 else 503 else
473 new->md_minor = MINOR(unit) >> MdpMinorShift; 504 new->md_minor = MINOR(unit) >> MdpMinorShift;
474 505
475 mutex_init(&new->open_mutex); 506 mddev_init(new);
476 mutex_init(&new->reconfig_mutex);
477 mutex_init(&new->bitmap_info.mutex);
478 INIT_LIST_HEAD(&new->disks);
479 INIT_LIST_HEAD(&new->all_mddevs);
480 init_timer(&new->safemode_timer);
481 atomic_set(&new->active, 1);
482 atomic_set(&new->openers, 0);
483 atomic_set(&new->active_io, 0);
484 spin_lock_init(&new->write_lock);
485 atomic_set(&new->flush_pending, 0);
486 init_waitqueue_head(&new->sb_wait);
487 init_waitqueue_head(&new->recovery_wait);
488 new->reshape_position = MaxSector;
489 new->resync_min = 0;
490 new->resync_max = MaxSector;
491 new->level = LEVEL_NONE;
492 507
493 goto retry; 508 goto retry;
494} 509}
@@ -508,9 +523,36 @@ static inline int mddev_trylock(mddev_t * mddev)
508 return mutex_trylock(&mddev->reconfig_mutex); 523 return mutex_trylock(&mddev->reconfig_mutex);
509} 524}
510 525
511static inline void mddev_unlock(mddev_t * mddev) 526static struct attribute_group md_redundancy_group;
527
528static void mddev_unlock(mddev_t * mddev)
512{ 529{
513 mutex_unlock(&mddev->reconfig_mutex); 530 if (mddev->to_remove) {
531 /* These cannot be removed under reconfig_mutex as
532 * an access to the files will try to take reconfig_mutex
533 * while holding the file unremovable, which leads to
534 * a deadlock.
535 * So hold open_mutex instead - we are allowed to take
536 * it while holding reconfig_mutex, and md_run can
537 * use it to wait for the remove to complete.
538 */
539 struct attribute_group *to_remove = mddev->to_remove;
540 mddev->to_remove = NULL;
541 mutex_lock(&mddev->open_mutex);
542 mutex_unlock(&mddev->reconfig_mutex);
543
544 if (to_remove != &md_redundancy_group)
545 sysfs_remove_group(&mddev->kobj, to_remove);
546 if (mddev->pers == NULL ||
547 mddev->pers->sync_request == NULL) {
548 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
549 if (mddev->sysfs_action)
550 sysfs_put(mddev->sysfs_action);
551 mddev->sysfs_action = NULL;
552 }
553 mutex_unlock(&mddev->open_mutex);
554 } else
555 mutex_unlock(&mddev->reconfig_mutex);
514 556
515 md_wakeup_thread(mddev->thread); 557 md_wakeup_thread(mddev->thread);
516} 558}
@@ -1029,10 +1071,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1029 mddev->bitmap_info.default_offset; 1071 mddev->bitmap_info.default_offset;
1030 1072
1031 } else if (mddev->pers == NULL) { 1073 } else if (mddev->pers == NULL) {
1032 /* Insist on good event counter while assembling */ 1074 /* Insist on good event counter while assembling, except
1075 * for spares (which don't need an event count) */
1033 ++ev1; 1076 ++ev1;
1034 if (ev1 < mddev->events) 1077 if (sb->disks[rdev->desc_nr].state & (
1035 return -EINVAL; 1078 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1079 if (ev1 < mddev->events)
1080 return -EINVAL;
1036 } else if (mddev->bitmap) { 1081 } else if (mddev->bitmap) {
1037 /* if adding to array with a bitmap, then we can accept an 1082 /* if adding to array with a bitmap, then we can accept an
1038 * older device ... but not too old. 1083 * older device ... but not too old.
@@ -1428,10 +1473,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1428 } 1473 }
1429 1474
1430 } else if (mddev->pers == NULL) { 1475 } else if (mddev->pers == NULL) {
1431 /* Insist of good event counter while assembling */ 1476 /* Insist of good event counter while assembling, except for
1477 * spares (which don't need an event count) */
1432 ++ev1; 1478 ++ev1;
1433 if (ev1 < mddev->events) 1479 if (rdev->desc_nr >= 0 &&
1434 return -EINVAL; 1480 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1481 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1482 if (ev1 < mddev->events)
1483 return -EINVAL;
1435 } else if (mddev->bitmap) { 1484 } else if (mddev->bitmap) {
1436 /* If adding to array with a bitmap, then we can accept an 1485 /* If adding to array with a bitmap, then we can accept an
1437 * older device, but not too old. 1486 * older device, but not too old.
@@ -2047,7 +2096,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
2047 if (rdev->sb_events == mddev->events || 2096 if (rdev->sb_events == mddev->events ||
2048 (nospares && 2097 (nospares &&
2049 rdev->raid_disk < 0 && 2098 rdev->raid_disk < 0 &&
2050 (rdev->sb_events&1)==0 &&
2051 rdev->sb_events+1 == mddev->events)) { 2099 rdev->sb_events+1 == mddev->events)) {
2052 /* Don't update this superblock */ 2100 /* Don't update this superblock */
2053 rdev->sb_loaded = 2; 2101 rdev->sb_loaded = 2;
@@ -2100,28 +2148,14 @@ repeat:
2100 * and 'events' is odd, we can roll back to the previous clean state */ 2148 * and 'events' is odd, we can roll back to the previous clean state */
2101 if (nospares 2149 if (nospares
2102 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2150 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2103 && (mddev->events & 1) 2151 && mddev->can_decrease_events
2104 && mddev->events != 1) 2152 && mddev->events != 1) {
2105 mddev->events--; 2153 mddev->events--;
2106 else { 2154 mddev->can_decrease_events = 0;
2155 } else {
2107 /* otherwise we have to go forward and ... */ 2156 /* otherwise we have to go forward and ... */
2108 mddev->events ++; 2157 mddev->events ++;
2109 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 2158 mddev->can_decrease_events = nospares;
2110 /* .. if the array isn't clean, an 'even' event must also go
2111 * to spares. */
2112 if ((mddev->events&1)==0) {
2113 nospares = 0;
2114 sync_req = 2; /* force a second update to get the
2115 * even/odd in sync */
2116 }
2117 } else {
2118 /* otherwise an 'odd' event must go to spares */
2119 if ((mddev->events&1)) {
2120 nospares = 0;
2121 sync_req = 2; /* force a second update to get the
2122 * even/odd in sync */
2123 }
2124 }
2125 } 2159 }
2126 2160
2127 if (!mddev->events) { 2161 if (!mddev->events) {
@@ -2365,6 +2399,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2365 return err; 2399 return err;
2366 sprintf(nm, "rd%d", rdev->raid_disk); 2400 sprintf(nm, "rd%d", rdev->raid_disk);
2367 sysfs_remove_link(&rdev->mddev->kobj, nm); 2401 sysfs_remove_link(&rdev->mddev->kobj, nm);
2402 rdev->raid_disk = -1;
2368 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2403 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2369 md_wakeup_thread(rdev->mddev->thread); 2404 md_wakeup_thread(rdev->mddev->thread);
2370 } else if (rdev->mddev->pers) { 2405 } else if (rdev->mddev->pers) {
@@ -2780,8 +2815,9 @@ static void analyze_sbs(mddev_t * mddev)
2780 2815
2781 i = 0; 2816 i = 0;
2782 rdev_for_each(rdev, tmp, mddev) { 2817 rdev_for_each(rdev, tmp, mddev) {
2783 if (rdev->desc_nr >= mddev->max_disks || 2818 if (mddev->max_disks &&
2784 i > mddev->max_disks) { 2819 (rdev->desc_nr >= mddev->max_disks ||
2820 i > mddev->max_disks)) {
2785 printk(KERN_WARNING 2821 printk(KERN_WARNING
2786 "md: %s: %s: only %d devices permitted\n", 2822 "md: %s: %s: only %d devices permitted\n",
2787 mdname(mddev), bdevname(rdev->bdev, b), 2823 mdname(mddev), bdevname(rdev->bdev, b),
@@ -2897,9 +2933,10 @@ level_show(mddev_t *mddev, char *page)
2897static ssize_t 2933static ssize_t
2898level_store(mddev_t *mddev, const char *buf, size_t len) 2934level_store(mddev_t *mddev, const char *buf, size_t len)
2899{ 2935{
2900 char level[16]; 2936 char clevel[16];
2901 ssize_t rv = len; 2937 ssize_t rv = len;
2902 struct mdk_personality *pers; 2938 struct mdk_personality *pers;
2939 long level;
2903 void *priv; 2940 void *priv;
2904 mdk_rdev_t *rdev; 2941 mdk_rdev_t *rdev;
2905 2942
@@ -2932,19 +2969,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2932 } 2969 }
2933 2970
2934 /* Now find the new personality */ 2971 /* Now find the new personality */
2935 if (len == 0 || len >= sizeof(level)) 2972 if (len == 0 || len >= sizeof(clevel))
2936 return -EINVAL; 2973 return -EINVAL;
2937 strncpy(level, buf, len); 2974 strncpy(clevel, buf, len);
2938 if (level[len-1] == '\n') 2975 if (clevel[len-1] == '\n')
2939 len--; 2976 len--;
2940 level[len] = 0; 2977 clevel[len] = 0;
2978 if (strict_strtol(clevel, 10, &level))
2979 level = LEVEL_NONE;
2941 2980
2942 request_module("md-%s", level); 2981 if (request_module("md-%s", clevel) != 0)
2982 request_module("md-level-%s", clevel);
2943 spin_lock(&pers_lock); 2983 spin_lock(&pers_lock);
2944 pers = find_pers(LEVEL_NONE, level); 2984 pers = find_pers(level, clevel);
2945 if (!pers || !try_module_get(pers->owner)) { 2985 if (!pers || !try_module_get(pers->owner)) {
2946 spin_unlock(&pers_lock); 2986 spin_unlock(&pers_lock);
2947 printk(KERN_WARNING "md: personality %s not loaded\n", level); 2987 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
2948 return -EINVAL; 2988 return -EINVAL;
2949 } 2989 }
2950 spin_unlock(&pers_lock); 2990 spin_unlock(&pers_lock);
@@ -2957,7 +2997,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2957 if (!pers->takeover) { 2997 if (!pers->takeover) {
2958 module_put(pers->owner); 2998 module_put(pers->owner);
2959 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2999 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2960 mdname(mddev), level); 3000 mdname(mddev), clevel);
2961 return -EINVAL; 3001 return -EINVAL;
2962 } 3002 }
2963 3003
@@ -2973,13 +3013,44 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2973 mddev->delta_disks = 0; 3013 mddev->delta_disks = 0;
2974 module_put(pers->owner); 3014 module_put(pers->owner);
2975 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3015 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2976 mdname(mddev), level); 3016 mdname(mddev), clevel);
2977 return PTR_ERR(priv); 3017 return PTR_ERR(priv);
2978 } 3018 }
2979 3019
2980 /* Looks like we have a winner */ 3020 /* Looks like we have a winner */
2981 mddev_suspend(mddev); 3021 mddev_suspend(mddev);
2982 mddev->pers->stop(mddev); 3022 mddev->pers->stop(mddev);
3023
3024 if (mddev->pers->sync_request == NULL &&
3025 pers->sync_request != NULL) {
3026 /* need to add the md_redundancy_group */
3027 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3028 printk(KERN_WARNING
3029 "md: cannot register extra attributes for %s\n",
3030 mdname(mddev));
3031 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
3032 }
3033 if (mddev->pers->sync_request != NULL &&
3034 pers->sync_request == NULL) {
3035 /* need to remove the md_redundancy_group */
3036 if (mddev->to_remove == NULL)
3037 mddev->to_remove = &md_redundancy_group;
3038 }
3039
3040 if (mddev->pers->sync_request == NULL &&
3041 mddev->external) {
3042 /* We are converting from a no-redundancy array
3043 * to a redundancy array and metadata is managed
3044 * externally so we need to be sure that writes
3045 * won't block due to a need to transition
3046 * clean->dirty
3047 * until external management is started.
3048 */
3049 mddev->in_sync = 0;
3050 mddev->safemode_delay = 0;
3051 mddev->safemode = 0;
3052 }
3053
2983 module_put(mddev->pers->owner); 3054 module_put(mddev->pers->owner);
2984 /* Invalidate devices that are now superfluous */ 3055 /* Invalidate devices that are now superfluous */
2985 list_for_each_entry(rdev, &mddev->disks, same_set) 3056 list_for_each_entry(rdev, &mddev->disks, same_set)
@@ -2994,11 +3065,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2994 mddev->layout = mddev->new_layout; 3065 mddev->layout = mddev->new_layout;
2995 mddev->chunk_sectors = mddev->new_chunk_sectors; 3066 mddev->chunk_sectors = mddev->new_chunk_sectors;
2996 mddev->delta_disks = 0; 3067 mddev->delta_disks = 0;
3068 if (mddev->pers->sync_request == NULL) {
3069 /* this is now an array without redundancy, so
3070 * it must always be in_sync
3071 */
3072 mddev->in_sync = 1;
3073 del_timer_sync(&mddev->safemode_timer);
3074 }
2997 pers->run(mddev); 3075 pers->run(mddev);
2998 mddev_resume(mddev); 3076 mddev_resume(mddev);
2999 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3077 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3000 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3078 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3001 md_wakeup_thread(mddev->thread); 3079 md_wakeup_thread(mddev->thread);
3080 sysfs_notify(&mddev->kobj, NULL, "level");
3081 md_new_event(mddev);
3002 return rv; 3082 return rv;
3003} 3083}
3004 3084
@@ -3237,6 +3317,7 @@ array_state_show(mddev_t *mddev, char *page)
3237} 3317}
3238 3318
3239static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3319static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3320static int md_set_readonly(mddev_t * mddev, int is_open);
3240static int do_md_run(mddev_t * mddev); 3321static int do_md_run(mddev_t * mddev);
3241static int restart_array(mddev_t *mddev); 3322static int restart_array(mddev_t *mddev);
3242 3323
@@ -3267,7 +3348,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
3267 break; /* not supported yet */ 3348 break; /* not supported yet */
3268 case readonly: 3349 case readonly:
3269 if (mddev->pers) 3350 if (mddev->pers)
3270 err = do_md_stop(mddev, 1, 0); 3351 err = md_set_readonly(mddev, 0);
3271 else { 3352 else {
3272 mddev->ro = 1; 3353 mddev->ro = 1;
3273 set_disk_ro(mddev->gendisk, 1); 3354 set_disk_ro(mddev->gendisk, 1);
@@ -3277,7 +3358,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
3277 case read_auto: 3358 case read_auto:
3278 if (mddev->pers) { 3359 if (mddev->pers) {
3279 if (mddev->ro == 0) 3360 if (mddev->ro == 0)
3280 err = do_md_stop(mddev, 1, 0); 3361 err = md_set_readonly(mddev, 0);
3281 else if (mddev->ro == 1) 3362 else if (mddev->ro == 1)
3282 err = restart_array(mddev); 3363 err = restart_array(mddev);
3283 if (err == 0) { 3364 if (err == 0) {
@@ -4082,15 +4163,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
4082{ 4163{
4083 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4164 mddev_t *mddev = container_of(ws, mddev_t, del_work);
4084 4165
4085 if (mddev->private) {
4086 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
4087 if (mddev->private != (void*)1)
4088 sysfs_remove_group(&mddev->kobj, mddev->private);
4089 if (mddev->sysfs_action)
4090 sysfs_put(mddev->sysfs_action);
4091 mddev->sysfs_action = NULL;
4092 mddev->private = NULL;
4093 }
4094 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4166 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4095 kobject_del(&mddev->kobj); 4167 kobject_del(&mddev->kobj);
4096 kobject_put(&mddev->kobj); 4168 kobject_put(&mddev->kobj);
@@ -4234,11 +4306,10 @@ static void md_safemode_timeout(unsigned long data)
4234 4306
4235static int start_dirty_degraded; 4307static int start_dirty_degraded;
4236 4308
4237static int do_md_run(mddev_t * mddev) 4309static int md_run(mddev_t *mddev)
4238{ 4310{
4239 int err; 4311 int err;
4240 mdk_rdev_t *rdev; 4312 mdk_rdev_t *rdev;
4241 struct gendisk *disk;
4242 struct mdk_personality *pers; 4313 struct mdk_personality *pers;
4243 4314
4244 if (list_empty(&mddev->disks)) 4315 if (list_empty(&mddev->disks))
@@ -4248,6 +4319,13 @@ static int do_md_run(mddev_t * mddev)
4248 if (mddev->pers) 4319 if (mddev->pers)
4249 return -EBUSY; 4320 return -EBUSY;
4250 4321
4322 /* These two calls synchronise us with the
4323 * sysfs_remove_group calls in mddev_unlock,
4324 * so they must have completed.
4325 */
4326 mutex_lock(&mddev->open_mutex);
4327 mutex_unlock(&mddev->open_mutex);
4328
4251 /* 4329 /*
4252 * Analyze all RAID superblock(s) 4330 * Analyze all RAID superblock(s)
4253 */ 4331 */
@@ -4296,8 +4374,6 @@ static int do_md_run(mddev_t * mddev)
4296 sysfs_notify_dirent(rdev->sysfs_state); 4374 sysfs_notify_dirent(rdev->sysfs_state);
4297 } 4375 }
4298 4376
4299 disk = mddev->gendisk;
4300
4301 spin_lock(&pers_lock); 4377 spin_lock(&pers_lock);
4302 pers = find_pers(mddev->level, mddev->clevel); 4378 pers = find_pers(mddev->level, mddev->clevel);
4303 if (!pers || !try_module_get(pers->owner)) { 4379 if (!pers || !try_module_get(pers->owner)) {
@@ -4425,22 +4501,32 @@ static int do_md_run(mddev_t * mddev)
4425 if (mddev->flags) 4501 if (mddev->flags)
4426 md_update_sb(mddev, 0); 4502 md_update_sb(mddev, 0);
4427 4503
4428 set_capacity(disk, mddev->array_sectors);
4429
4430 md_wakeup_thread(mddev->thread); 4504 md_wakeup_thread(mddev->thread);
4431 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4505 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4432 4506
4433 revalidate_disk(mddev->gendisk);
4434 mddev->changed = 1;
4435 md_new_event(mddev); 4507 md_new_event(mddev);
4436 sysfs_notify_dirent(mddev->sysfs_state); 4508 sysfs_notify_dirent(mddev->sysfs_state);
4437 if (mddev->sysfs_action) 4509 if (mddev->sysfs_action)
4438 sysfs_notify_dirent(mddev->sysfs_action); 4510 sysfs_notify_dirent(mddev->sysfs_action);
4439 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4511 sysfs_notify(&mddev->kobj, NULL, "degraded");
4440 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4441 return 0; 4512 return 0;
4442} 4513}
4443 4514
4515static int do_md_run(mddev_t *mddev)
4516{
4517 int err;
4518
4519 err = md_run(mddev);
4520 if (err)
4521 goto out;
4522
4523 set_capacity(mddev->gendisk, mddev->array_sectors);
4524 revalidate_disk(mddev->gendisk);
4525 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4526out:
4527 return err;
4528}
4529
4444static int restart_array(mddev_t *mddev) 4530static int restart_array(mddev_t *mddev)
4445{ 4531{
4446 struct gendisk *disk = mddev->gendisk; 4532 struct gendisk *disk = mddev->gendisk;
@@ -4491,9 +4577,110 @@ void restore_bitmap_write_access(struct file *file)
4491 spin_unlock(&inode->i_lock); 4577 spin_unlock(&inode->i_lock);
4492} 4578}
4493 4579
4580static void md_clean(mddev_t *mddev)
4581{
4582 mddev->array_sectors = 0;
4583 mddev->external_size = 0;
4584 mddev->dev_sectors = 0;
4585 mddev->raid_disks = 0;
4586 mddev->recovery_cp = 0;
4587 mddev->resync_min = 0;
4588 mddev->resync_max = MaxSector;
4589 mddev->reshape_position = MaxSector;
4590 mddev->external = 0;
4591 mddev->persistent = 0;
4592 mddev->level = LEVEL_NONE;
4593 mddev->clevel[0] = 0;
4594 mddev->flags = 0;
4595 mddev->ro = 0;
4596 mddev->metadata_type[0] = 0;
4597 mddev->chunk_sectors = 0;
4598 mddev->ctime = mddev->utime = 0;
4599 mddev->layout = 0;
4600 mddev->max_disks = 0;
4601 mddev->events = 0;
4602 mddev->can_decrease_events = 0;
4603 mddev->delta_disks = 0;
4604 mddev->new_level = LEVEL_NONE;
4605 mddev->new_layout = 0;
4606 mddev->new_chunk_sectors = 0;
4607 mddev->curr_resync = 0;
4608 mddev->resync_mismatches = 0;
4609 mddev->suspend_lo = mddev->suspend_hi = 0;
4610 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4611 mddev->recovery = 0;
4612 mddev->in_sync = 0;
4613 mddev->degraded = 0;
4614 mddev->barriers_work = 0;
4615 mddev->safemode = 0;
4616 mddev->bitmap_info.offset = 0;
4617 mddev->bitmap_info.default_offset = 0;
4618 mddev->bitmap_info.chunksize = 0;
4619 mddev->bitmap_info.daemon_sleep = 0;
4620 mddev->bitmap_info.max_write_behind = 0;
4621}
4622
4623static void md_stop_writes(mddev_t *mddev)
4624{
4625 if (mddev->sync_thread) {
4626 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4627 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4628 md_unregister_thread(mddev->sync_thread);
4629 mddev->sync_thread = NULL;
4630 }
4631
4632 del_timer_sync(&mddev->safemode_timer);
4633
4634 bitmap_flush(mddev);
4635 md_super_wait(mddev);
4636
4637 if (!mddev->in_sync || mddev->flags) {
4638 /* mark array as shutdown cleanly */
4639 mddev->in_sync = 1;
4640 md_update_sb(mddev, 1);
4641 }
4642}
4643
4644static void md_stop(mddev_t *mddev)
4645{
4646 md_stop_writes(mddev);
4647
4648 mddev->pers->stop(mddev);
4649 if (mddev->pers->sync_request && mddev->to_remove == NULL)
4650 mddev->to_remove = &md_redundancy_group;
4651 module_put(mddev->pers->owner);
4652 mddev->pers = NULL;
4653 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4654}
4655
4656static int md_set_readonly(mddev_t *mddev, int is_open)
4657{
4658 int err = 0;
4659 mutex_lock(&mddev->open_mutex);
4660 if (atomic_read(&mddev->openers) > is_open) {
4661 printk("md: %s still in use.\n",mdname(mddev));
4662 err = -EBUSY;
4663 goto out;
4664 }
4665 if (mddev->pers) {
4666 md_stop_writes(mddev);
4667
4668 err = -ENXIO;
4669 if (mddev->ro==1)
4670 goto out;
4671 mddev->ro = 1;
4672 set_disk_ro(mddev->gendisk, 1);
4673 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4674 sysfs_notify_dirent(mddev->sysfs_state);
4675 err = 0;
4676 }
4677out:
4678 mutex_unlock(&mddev->open_mutex);
4679 return err;
4680}
4681
4494/* mode: 4682/* mode:
4495 * 0 - completely stop and dis-assemble array 4683 * 0 - completely stop and dis-assemble array
4496 * 1 - switch to readonly
4497 * 2 - stop but do not disassemble array 4684 * 2 - stop but do not disassemble array
4498 */ 4685 */
4499static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4686static int do_md_stop(mddev_t * mddev, int mode, int is_open)
@@ -4508,64 +4695,32 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4508 err = -EBUSY; 4695 err = -EBUSY;
4509 } else if (mddev->pers) { 4696 } else if (mddev->pers) {
4510 4697
4511 if (mddev->sync_thread) { 4698 if (mddev->ro)
4512 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4699 set_disk_ro(disk, 0);
4513 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4514 md_unregister_thread(mddev->sync_thread);
4515 mddev->sync_thread = NULL;
4516 }
4517
4518 del_timer_sync(&mddev->safemode_timer);
4519 4700
4520 switch(mode) { 4701 md_stop(mddev);
4521 case 1: /* readonly */ 4702 mddev->queue->merge_bvec_fn = NULL;
4522 err = -ENXIO; 4703 mddev->queue->unplug_fn = NULL;
4523 if (mddev->ro==1) 4704 mddev->queue->backing_dev_info.congested_fn = NULL;
4524 goto out;
4525 mddev->ro = 1;
4526 break;
4527 case 0: /* disassemble */
4528 case 2: /* stop */
4529 bitmap_flush(mddev);
4530 md_super_wait(mddev);
4531 if (mddev->ro)
4532 set_disk_ro(disk, 0);
4533 4705
4534 mddev->pers->stop(mddev); 4706 /* tell userspace to handle 'inactive' */
4535 mddev->queue->merge_bvec_fn = NULL; 4707 sysfs_notify_dirent(mddev->sysfs_state);
4536 mddev->queue->unplug_fn = NULL;
4537 mddev->queue->backing_dev_info.congested_fn = NULL;
4538 module_put(mddev->pers->owner);
4539 if (mddev->pers->sync_request && mddev->private == NULL)
4540 mddev->private = (void*)1;
4541 mddev->pers = NULL;
4542 /* tell userspace to handle 'inactive' */
4543 sysfs_notify_dirent(mddev->sysfs_state);
4544 4708
4545 list_for_each_entry(rdev, &mddev->disks, same_set) 4709 list_for_each_entry(rdev, &mddev->disks, same_set)
4546 if (rdev->raid_disk >= 0) { 4710 if (rdev->raid_disk >= 0) {
4547 char nm[20]; 4711 char nm[20];
4548 sprintf(nm, "rd%d", rdev->raid_disk); 4712 sprintf(nm, "rd%d", rdev->raid_disk);
4549 sysfs_remove_link(&mddev->kobj, nm); 4713 sysfs_remove_link(&mddev->kobj, nm);
4550 } 4714 }
4551 4715
4552 set_capacity(disk, 0); 4716 set_capacity(disk, 0);
4553 mddev->changed = 1; 4717 revalidate_disk(disk);
4554 4718
4555 if (mddev->ro) 4719 if (mddev->ro)
4556 mddev->ro = 0; 4720 mddev->ro = 0;
4557 } 4721
4558 if (!mddev->in_sync || mddev->flags) {
4559 /* mark array as shutdown cleanly */
4560 mddev->in_sync = 1;
4561 md_update_sb(mddev, 1);
4562 }
4563 if (mode == 1)
4564 set_disk_ro(disk, 1);
4565 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4566 err = 0; 4722 err = 0;
4567 } 4723 }
4568out:
4569 mutex_unlock(&mddev->open_mutex); 4724 mutex_unlock(&mddev->open_mutex);
4570 if (err) 4725 if (err)
4571 return err; 4726 return err;
@@ -4586,52 +4741,12 @@ out:
4586 4741
4587 export_array(mddev); 4742 export_array(mddev);
4588 4743
4589 mddev->array_sectors = 0; 4744 md_clean(mddev);
4590 mddev->external_size = 0;
4591 mddev->dev_sectors = 0;
4592 mddev->raid_disks = 0;
4593 mddev->recovery_cp = 0;
4594 mddev->resync_min = 0;
4595 mddev->resync_max = MaxSector;
4596 mddev->reshape_position = MaxSector;
4597 mddev->external = 0;
4598 mddev->persistent = 0;
4599 mddev->level = LEVEL_NONE;
4600 mddev->clevel[0] = 0;
4601 mddev->flags = 0;
4602 mddev->ro = 0;
4603 mddev->metadata_type[0] = 0;
4604 mddev->chunk_sectors = 0;
4605 mddev->ctime = mddev->utime = 0;
4606 mddev->layout = 0;
4607 mddev->max_disks = 0;
4608 mddev->events = 0;
4609 mddev->delta_disks = 0;
4610 mddev->new_level = LEVEL_NONE;
4611 mddev->new_layout = 0;
4612 mddev->new_chunk_sectors = 0;
4613 mddev->curr_resync = 0;
4614 mddev->resync_mismatches = 0;
4615 mddev->suspend_lo = mddev->suspend_hi = 0;
4616 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4617 mddev->recovery = 0;
4618 mddev->in_sync = 0;
4619 mddev->changed = 0;
4620 mddev->degraded = 0;
4621 mddev->barriers_work = 0;
4622 mddev->safemode = 0;
4623 mddev->bitmap_info.offset = 0;
4624 mddev->bitmap_info.default_offset = 0;
4625 mddev->bitmap_info.chunksize = 0;
4626 mddev->bitmap_info.daemon_sleep = 0;
4627 mddev->bitmap_info.max_write_behind = 0;
4628 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4745 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4629 if (mddev->hold_active == UNTIL_STOP) 4746 if (mddev->hold_active == UNTIL_STOP)
4630 mddev->hold_active = 0; 4747 mddev->hold_active = 0;
4631 4748
4632 } else if (mddev->pers) 4749 }
4633 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4634 mdname(mddev));
4635 err = 0; 4750 err = 0;
4636 blk_integrity_unregister(disk); 4751 blk_integrity_unregister(disk);
4637 md_new_event(mddev); 4752 md_new_event(mddev);
@@ -5349,7 +5464,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
5349 if (mddev->pers->check_reshape == NULL) 5464 if (mddev->pers->check_reshape == NULL)
5350 return -EINVAL; 5465 return -EINVAL;
5351 if (raid_disks <= 0 || 5466 if (raid_disks <= 0 ||
5352 raid_disks >= mddev->max_disks) 5467 (mddev->max_disks && raid_disks >= mddev->max_disks))
5353 return -EINVAL; 5468 return -EINVAL;
5354 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5469 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5355 return -EBUSY; 5470 return -EBUSY;
@@ -5486,7 +5601,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5486 5601
5487 geo->heads = 2; 5602 geo->heads = 2;
5488 geo->sectors = 4; 5603 geo->sectors = 4;
5489 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5604 geo->cylinders = mddev->array_sectors / 8;
5490 return 0; 5605 return 0;
5491} 5606}
5492 5607
@@ -5496,6 +5611,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5496 int err = 0; 5611 int err = 0;
5497 void __user *argp = (void __user *)arg; 5612 void __user *argp = (void __user *)arg;
5498 mddev_t *mddev = NULL; 5613 mddev_t *mddev = NULL;
5614 int ro;
5499 5615
5500 if (!capable(CAP_SYS_ADMIN)) 5616 if (!capable(CAP_SYS_ADMIN))
5501 return -EACCES; 5617 return -EACCES;
@@ -5628,9 +5744,37 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5628 goto done_unlock; 5744 goto done_unlock;
5629 5745
5630 case STOP_ARRAY_RO: 5746 case STOP_ARRAY_RO:
5631 err = do_md_stop(mddev, 1, 1); 5747 err = md_set_readonly(mddev, 1);
5632 goto done_unlock; 5748 goto done_unlock;
5633 5749
5750 case BLKROSET:
5751 if (get_user(ro, (int __user *)(arg))) {
5752 err = -EFAULT;
5753 goto done_unlock;
5754 }
5755 err = -EINVAL;
5756
5757 /* if the bdev is going readonly the value of mddev->ro
5758 * does not matter, no writes are coming
5759 */
5760 if (ro)
5761 goto done_unlock;
5762
5763 /* are we already prepared for writes? */
5764 if (mddev->ro != 1)
5765 goto done_unlock;
5766
5767 /* transitioning to readauto need only happen for
5768 * arrays that call md_write_start
5769 */
5770 if (mddev->pers) {
5771 err = restart_array(mddev);
5772 if (err == 0) {
5773 mddev->ro = 2;
5774 set_disk_ro(mddev->gendisk, 0);
5775 }
5776 }
5777 goto done_unlock;
5634 } 5778 }
5635 5779
5636 /* 5780 /*
@@ -5751,7 +5895,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5751 atomic_inc(&mddev->openers); 5895 atomic_inc(&mddev->openers);
5752 mutex_unlock(&mddev->open_mutex); 5896 mutex_unlock(&mddev->open_mutex);
5753 5897
5754 check_disk_change(bdev);
5755 out: 5898 out:
5756 return err; 5899 return err;
5757} 5900}
@@ -5766,21 +5909,6 @@ static int md_release(struct gendisk *disk, fmode_t mode)
5766 5909
5767 return 0; 5910 return 0;
5768} 5911}
5769
5770static int md_media_changed(struct gendisk *disk)
5771{
5772 mddev_t *mddev = disk->private_data;
5773
5774 return mddev->changed;
5775}
5776
5777static int md_revalidate(struct gendisk *disk)
5778{
5779 mddev_t *mddev = disk->private_data;
5780
5781 mddev->changed = 0;
5782 return 0;
5783}
5784static const struct block_device_operations md_fops = 5912static const struct block_device_operations md_fops =
5785{ 5913{
5786 .owner = THIS_MODULE, 5914 .owner = THIS_MODULE,
@@ -5791,8 +5919,6 @@ static const struct block_device_operations md_fops =
5791 .compat_ioctl = md_compat_ioctl, 5919 .compat_ioctl = md_compat_ioctl,
5792#endif 5920#endif
5793 .getgeo = md_getgeo, 5921 .getgeo = md_getgeo,
5794 .media_changed = md_media_changed,
5795 .revalidate_disk= md_revalidate,
5796}; 5922};
5797 5923
5798static int md_thread(void * arg) 5924static int md_thread(void * arg)
@@ -5906,7 +6032,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5906 mddev->pers->error_handler(mddev,rdev); 6032 mddev->pers->error_handler(mddev,rdev);
5907 if (mddev->degraded) 6033 if (mddev->degraded)
5908 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6034 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5909 set_bit(StateChanged, &rdev->flags); 6035 sysfs_notify_dirent(rdev->sysfs_state);
5910 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6036 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5911 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6037 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5912 md_wakeup_thread(mddev->thread); 6038 md_wakeup_thread(mddev->thread);
@@ -6898,11 +7024,6 @@ void md_check_recovery(mddev_t *mddev)
6898 if (mddev->flags) 7024 if (mddev->flags)
6899 md_update_sb(mddev, 0); 7025 md_update_sb(mddev, 0);
6900 7026
6901 list_for_each_entry(rdev, &mddev->disks, same_set)
6902 if (test_and_clear_bit(StateChanged, &rdev->flags))
6903 sysfs_notify_dirent(rdev->sysfs_state);
6904
6905
6906 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7027 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6907 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7028 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6908 /* resync/recovery still happening */ 7029 /* resync/recovery still happening */
@@ -7039,7 +7160,7 @@ static int md_notify_reboot(struct notifier_block *this,
7039 * appears to still be in use. Hence 7160 * appears to still be in use. Hence
7040 * the '100'. 7161 * the '100'.
7041 */ 7162 */
7042 do_md_stop(mddev, 1, 100); 7163 md_set_readonly(mddev, 100);
7043 mddev_unlock(mddev); 7164 mddev_unlock(mddev);
7044 } 7165 }
7045 /* 7166 /*
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8e4c75c00d46..7ab5ea155452 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -74,9 +74,6 @@ struct mdk_rdev_s
74#define Blocked 8 /* An error occurred on an externally 74#define Blocked 8 /* An error occurred on an externally
75 * managed array, don't allow writes 75 * managed array, don't allow writes
76 * until it is cleared */ 76 * until it is cleared */
77#define StateChanged 9 /* Faulty or Blocked has changed during
78 * interrupt, so it needs to be
79 * notified by the thread */
80 wait_queue_head_t blocked_wait; 77 wait_queue_head_t blocked_wait;
81 78
82 int desc_nr; /* descriptor index in the superblock */ 79 int desc_nr; /* descriptor index in the superblock */
@@ -153,6 +150,12 @@ struct mddev_s
153 int external_size; /* size managed 150 int external_size; /* size managed
154 * externally */ 151 * externally */
155 __u64 events; 152 __u64 events;
153 /* If the last 'event' was simply a clean->dirty transition, and
154 * we didn't write it to the spares, then it is safe and simple
155 * to just decrement the event count on a dirty->clean transition.
156 * So we record that possibility here.
157 */
158 int can_decrease_events;
156 159
157 char uuid[16]; 160 char uuid[16];
158 161
@@ -240,7 +243,6 @@ struct mddev_s
240 atomic_t active; /* general refcount */ 243 atomic_t active; /* general refcount */
241 atomic_t openers; /* number of active opens */ 244 atomic_t openers; /* number of active opens */
242 245
243 int changed; /* true if we might need to reread partition info */
244 int degraded; /* whether md should consider 246 int degraded; /* whether md should consider
245 * adding a spare 247 * adding a spare
246 */ 248 */
@@ -279,9 +281,6 @@ struct mddev_s
279 atomic_t writes_pending; 281 atomic_t writes_pending;
280 struct request_queue *queue; /* for plugging ... */ 282 struct request_queue *queue; /* for plugging ... */
281 283
282 atomic_t write_behind; /* outstanding async IO */
283 unsigned int max_write_behind; /* 0 = sync */
284
285 struct bitmap *bitmap; /* the bitmap for the device */ 284 struct bitmap *bitmap; /* the bitmap for the device */
286 struct { 285 struct {
287 struct file *file; /* the bitmap file */ 286 struct file *file; /* the bitmap file */
@@ -305,6 +304,7 @@ struct mddev_s
305 atomic_t max_corr_read_errors; /* max read retries */ 304 atomic_t max_corr_read_errors; /* max read retries */
306 struct list_head all_mddevs; 305 struct list_head all_mddevs;
307 306
307 struct attribute_group *to_remove;
308 /* Generic barrier handling. 308 /* Generic barrier handling.
309 * If there is a pending barrier request, all other 309 * If there is a pending barrier request, all other
310 * writes are blocked while the devices are flushed. 310 * writes are blocked while the devices are flushed.
@@ -336,7 +336,7 @@ struct mdk_personality
336 int level; 336 int level;
337 struct list_head list; 337 struct list_head list;
338 struct module *owner; 338 struct module *owner;
339 int (*make_request)(struct request_queue *q, struct bio *bio); 339 int (*make_request)(mddev_t *mddev, struct bio *bio);
340 int (*run)(mddev_t *mddev); 340 int (*run)(mddev_t *mddev);
341 int (*stop)(mddev_t *mddev); 341 int (*stop)(mddev_t *mddev);
342 void (*status)(struct seq_file *seq, mddev_t *mddev); 342 void (*status)(struct seq_file *seq, mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 789bf535d29c..410fb60699ac 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -85,7 +85,7 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
85static void multipath_end_request(struct bio *bio, int error) 85static void multipath_end_request(struct bio *bio, int error)
86{ 86{
87 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 87 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
88 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); 88 struct multipath_bh *mp_bh = bio->bi_private;
89 multipath_conf_t *conf = mp_bh->mddev->private; 89 multipath_conf_t *conf = mp_bh->mddev->private;
90 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; 90 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
91 91
@@ -136,14 +136,11 @@ static void multipath_unplug(struct request_queue *q)
136} 136}
137 137
138 138
139static int multipath_make_request (struct request_queue *q, struct bio * bio) 139static int multipath_make_request(mddev_t *mddev, struct bio * bio)
140{ 140{
141 mddev_t *mddev = q->queuedata;
142 multipath_conf_t *conf = mddev->private; 141 multipath_conf_t *conf = mddev->private;
143 struct multipath_bh * mp_bh; 142 struct multipath_bh * mp_bh;
144 struct multipath_info *multipath; 143 struct multipath_info *multipath;
145 const int rw = bio_data_dir(bio);
146 int cpu;
147 144
148 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 145 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
149 md_barrier_request(mddev, bio); 146 md_barrier_request(mddev, bio);
@@ -155,12 +152,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
155 mp_bh->master_bio = bio; 152 mp_bh->master_bio = bio;
156 mp_bh->mddev = mddev; 153 mp_bh->mddev = mddev;
157 154
158 cpu = part_stat_lock();
159 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
160 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
161 bio_sectors(bio));
162 part_stat_unlock();
163
164 mp_bh->path = multipath_map(conf); 155 mp_bh->path = multipath_map(conf);
165 if (mp_bh->path < 0) { 156 if (mp_bh->path < 0) {
166 bio_endio(bio, -EIO); 157 bio_endio(bio, -EIO);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c3bec024612e..e70f004c99e8 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -23,15 +23,17 @@
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include "md.h" 24#include "md.h"
25#include "raid0.h" 25#include "raid0.h"
26#include "raid5.h"
26 27
27static void raid0_unplug(struct request_queue *q) 28static void raid0_unplug(struct request_queue *q)
28{ 29{
29 mddev_t *mddev = q->queuedata; 30 mddev_t *mddev = q->queuedata;
30 raid0_conf_t *conf = mddev->private; 31 raid0_conf_t *conf = mddev->private;
31 mdk_rdev_t **devlist = conf->devlist; 32 mdk_rdev_t **devlist = conf->devlist;
33 int raid_disks = conf->strip_zone[0].nb_dev;
32 int i; 34 int i;
33 35
34 for (i=0; i<mddev->raid_disks; i++) { 36 for (i=0; i < raid_disks; i++) {
35 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); 37 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
36 38
37 blk_unplug(r_queue); 39 blk_unplug(r_queue);
@@ -43,12 +45,13 @@ static int raid0_congested(void *data, int bits)
43 mddev_t *mddev = data; 45 mddev_t *mddev = data;
44 raid0_conf_t *conf = mddev->private; 46 raid0_conf_t *conf = mddev->private;
45 mdk_rdev_t **devlist = conf->devlist; 47 mdk_rdev_t **devlist = conf->devlist;
48 int raid_disks = conf->strip_zone[0].nb_dev;
46 int i, ret = 0; 49 int i, ret = 0;
47 50
48 if (mddev_congested(mddev, bits)) 51 if (mddev_congested(mddev, bits))
49 return 1; 52 return 1;
50 53
51 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 54 for (i = 0; i < raid_disks && !ret ; i++) {
52 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 55 struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
53 56
54 ret |= bdi_congested(&q->backing_dev_info, bits); 57 ret |= bdi_congested(&q->backing_dev_info, bits);
@@ -66,16 +69,17 @@ static void dump_zones(mddev_t *mddev)
66 sector_t zone_start = 0; 69 sector_t zone_start = 0;
67 char b[BDEVNAME_SIZE]; 70 char b[BDEVNAME_SIZE];
68 raid0_conf_t *conf = mddev->private; 71 raid0_conf_t *conf = mddev->private;
72 int raid_disks = conf->strip_zone[0].nb_dev;
69 printk(KERN_INFO "******* %s configuration *********\n", 73 printk(KERN_INFO "******* %s configuration *********\n",
70 mdname(mddev)); 74 mdname(mddev));
71 h = 0; 75 h = 0;
72 for (j = 0; j < conf->nr_strip_zones; j++) { 76 for (j = 0; j < conf->nr_strip_zones; j++) {
73 printk(KERN_INFO "zone%d=[", j); 77 printk(KERN_INFO "zone%d=[", j);
74 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 78 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
75 printk("%s/", 79 printk(KERN_CONT "%s/",
76 bdevname(conf->devlist[j*mddev->raid_disks 80 bdevname(conf->devlist[j*raid_disks
77 + k]->bdev, b)); 81 + k]->bdev, b));
78 printk("]\n"); 82 printk(KERN_CONT "]\n");
79 83
80 zone_size = conf->strip_zone[j].zone_end - zone_start; 84 zone_size = conf->strip_zone[j].zone_end - zone_start;
81 printk(KERN_INFO " zone offset=%llukb " 85 printk(KERN_INFO " zone offset=%llukb "
@@ -88,7 +92,7 @@ static void dump_zones(mddev_t *mddev)
88 printk(KERN_INFO "**********************************\n\n"); 92 printk(KERN_INFO "**********************************\n\n");
89} 93}
90 94
91static int create_strip_zones(mddev_t *mddev) 95static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
92{ 96{
93 int i, c, err; 97 int i, c, err;
94 sector_t curr_zone_end, sectors; 98 sector_t curr_zone_end, sectors;
@@ -101,8 +105,9 @@ static int create_strip_zones(mddev_t *mddev)
101 if (!conf) 105 if (!conf)
102 return -ENOMEM; 106 return -ENOMEM;
103 list_for_each_entry(rdev1, &mddev->disks, same_set) { 107 list_for_each_entry(rdev1, &mddev->disks, same_set) {
104 printk(KERN_INFO "raid0: looking at %s\n", 108 printk(KERN_INFO "md/raid0:%s: looking at %s\n",
105 bdevname(rdev1->bdev,b)); 109 mdname(mddev),
110 bdevname(rdev1->bdev, b));
106 c = 0; 111 c = 0;
107 112
108 /* round size to chunk_size */ 113 /* round size to chunk_size */
@@ -111,14 +116,16 @@ static int create_strip_zones(mddev_t *mddev)
111 rdev1->sectors = sectors * mddev->chunk_sectors; 116 rdev1->sectors = sectors * mddev->chunk_sectors;
112 117
113 list_for_each_entry(rdev2, &mddev->disks, same_set) { 118 list_for_each_entry(rdev2, &mddev->disks, same_set) {
114 printk(KERN_INFO "raid0: comparing %s(%llu)", 119 printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)",
120 mdname(mddev),
115 bdevname(rdev1->bdev,b), 121 bdevname(rdev1->bdev,b),
116 (unsigned long long)rdev1->sectors); 122 (unsigned long long)rdev1->sectors);
117 printk(KERN_INFO " with %s(%llu)\n", 123 printk(KERN_CONT " with %s(%llu)\n",
118 bdevname(rdev2->bdev,b), 124 bdevname(rdev2->bdev,b),
119 (unsigned long long)rdev2->sectors); 125 (unsigned long long)rdev2->sectors);
120 if (rdev2 == rdev1) { 126 if (rdev2 == rdev1) {
121 printk(KERN_INFO "raid0: END\n"); 127 printk(KERN_INFO "md/raid0:%s: END\n",
128 mdname(mddev));
122 break; 129 break;
123 } 130 }
124 if (rdev2->sectors == rdev1->sectors) { 131 if (rdev2->sectors == rdev1->sectors) {
@@ -126,20 +133,24 @@ static int create_strip_zones(mddev_t *mddev)
126 * Not unique, don't count it as a new 133 * Not unique, don't count it as a new
127 * group 134 * group
128 */ 135 */
129 printk(KERN_INFO "raid0: EQUAL\n"); 136 printk(KERN_INFO "md/raid0:%s: EQUAL\n",
137 mdname(mddev));
130 c = 1; 138 c = 1;
131 break; 139 break;
132 } 140 }
133 printk(KERN_INFO "raid0: NOT EQUAL\n"); 141 printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n",
142 mdname(mddev));
134 } 143 }
135 if (!c) { 144 if (!c) {
136 printk(KERN_INFO "raid0: ==> UNIQUE\n"); 145 printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n",
146 mdname(mddev));
137 conf->nr_strip_zones++; 147 conf->nr_strip_zones++;
138 printk(KERN_INFO "raid0: %d zones\n", 148 printk(KERN_INFO "md/raid0:%s: %d zones\n",
139 conf->nr_strip_zones); 149 mdname(mddev), conf->nr_strip_zones);
140 } 150 }
141 } 151 }
142 printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); 152 printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n",
153 mdname(mddev), conf->nr_strip_zones);
143 err = -ENOMEM; 154 err = -ENOMEM;
144 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 155 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
145 conf->nr_strip_zones, GFP_KERNEL); 156 conf->nr_strip_zones, GFP_KERNEL);
@@ -162,14 +173,18 @@ static int create_strip_zones(mddev_t *mddev)
162 list_for_each_entry(rdev1, &mddev->disks, same_set) { 173 list_for_each_entry(rdev1, &mddev->disks, same_set) {
163 int j = rdev1->raid_disk; 174 int j = rdev1->raid_disk;
164 175
176 if (mddev->level == 10)
177 /* taking over a raid10-n2 array */
178 j /= 2;
179
165 if (j < 0 || j >= mddev->raid_disks) { 180 if (j < 0 || j >= mddev->raid_disks) {
166 printk(KERN_ERR "raid0: bad disk number %d - " 181 printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
167 "aborting!\n", j); 182 "aborting!\n", mdname(mddev), j);
168 goto abort; 183 goto abort;
169 } 184 }
170 if (dev[j]) { 185 if (dev[j]) {
171 printk(KERN_ERR "raid0: multiple devices for %d - " 186 printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
172 "aborting!\n", j); 187 "aborting!\n", mdname(mddev), j);
173 goto abort; 188 goto abort;
174 } 189 }
175 dev[j] = rdev1; 190 dev[j] = rdev1;
@@ -191,8 +206,8 @@ static int create_strip_zones(mddev_t *mddev)
191 cnt++; 206 cnt++;
192 } 207 }
193 if (cnt != mddev->raid_disks) { 208 if (cnt != mddev->raid_disks) {
194 printk(KERN_ERR "raid0: too few disks (%d of %d) - " 209 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
195 "aborting!\n", cnt, mddev->raid_disks); 210 "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
196 goto abort; 211 goto abort;
197 } 212 }
198 zone->nb_dev = cnt; 213 zone->nb_dev = cnt;
@@ -208,39 +223,44 @@ static int create_strip_zones(mddev_t *mddev)
208 zone = conf->strip_zone + i; 223 zone = conf->strip_zone + i;
209 dev = conf->devlist + i * mddev->raid_disks; 224 dev = conf->devlist + i * mddev->raid_disks;
210 225
211 printk(KERN_INFO "raid0: zone %d\n", i); 226 printk(KERN_INFO "md/raid0:%s: zone %d\n",
227 mdname(mddev), i);
212 zone->dev_start = smallest->sectors; 228 zone->dev_start = smallest->sectors;
213 smallest = NULL; 229 smallest = NULL;
214 c = 0; 230 c = 0;
215 231
216 for (j=0; j<cnt; j++) { 232 for (j=0; j<cnt; j++) {
217 rdev = conf->devlist[j]; 233 rdev = conf->devlist[j];
218 printk(KERN_INFO "raid0: checking %s ...", 234 printk(KERN_INFO "md/raid0:%s: checking %s ...",
219 bdevname(rdev->bdev, b)); 235 mdname(mddev),
236 bdevname(rdev->bdev, b));
220 if (rdev->sectors <= zone->dev_start) { 237 if (rdev->sectors <= zone->dev_start) {
221 printk(KERN_INFO " nope.\n"); 238 printk(KERN_CONT " nope.\n");
222 continue; 239 continue;
223 } 240 }
224 printk(KERN_INFO " contained as device %d\n", c); 241 printk(KERN_CONT " contained as device %d\n", c);
225 dev[c] = rdev; 242 dev[c] = rdev;
226 c++; 243 c++;
227 if (!smallest || rdev->sectors < smallest->sectors) { 244 if (!smallest || rdev->sectors < smallest->sectors) {
228 smallest = rdev; 245 smallest = rdev;
229 printk(KERN_INFO " (%llu) is smallest!.\n", 246 printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n",
230 (unsigned long long)rdev->sectors); 247 mdname(mddev),
248 (unsigned long long)rdev->sectors);
231 } 249 }
232 } 250 }
233 251
234 zone->nb_dev = c; 252 zone->nb_dev = c;
235 sectors = (smallest->sectors - zone->dev_start) * c; 253 sectors = (smallest->sectors - zone->dev_start) * c;
236 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", 254 printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
237 zone->nb_dev, (unsigned long long)sectors); 255 mdname(mddev),
256 zone->nb_dev, (unsigned long long)sectors);
238 257
239 curr_zone_end += sectors; 258 curr_zone_end += sectors;
240 zone->zone_end = curr_zone_end; 259 zone->zone_end = curr_zone_end;
241 260
242 printk(KERN_INFO "raid0: current zone start: %llu\n", 261 printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n",
243 (unsigned long long)smallest->sectors); 262 mdname(mddev),
263 (unsigned long long)smallest->sectors);
244 } 264 }
245 mddev->queue->unplug_fn = raid0_unplug; 265 mddev->queue->unplug_fn = raid0_unplug;
246 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 266 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
@@ -251,7 +271,7 @@ static int create_strip_zones(mddev_t *mddev)
251 * chunk size is a multiple of that sector size 271 * chunk size is a multiple of that sector size
252 */ 272 */
253 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { 273 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
254 printk(KERN_ERR "%s chunk_size of %d not valid\n", 274 printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
255 mdname(mddev), 275 mdname(mddev),
256 mddev->chunk_sectors << 9); 276 mddev->chunk_sectors << 9);
257 goto abort; 277 goto abort;
@@ -261,14 +281,15 @@ static int create_strip_zones(mddev_t *mddev)
261 blk_queue_io_opt(mddev->queue, 281 blk_queue_io_opt(mddev->queue,
262 (mddev->chunk_sectors << 9) * mddev->raid_disks); 282 (mddev->chunk_sectors << 9) * mddev->raid_disks);
263 283
264 printk(KERN_INFO "raid0: done.\n"); 284 printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev));
265 mddev->private = conf; 285 *private_conf = conf;
286
266 return 0; 287 return 0;
267abort: 288abort:
268 kfree(conf->strip_zone); 289 kfree(conf->strip_zone);
269 kfree(conf->devlist); 290 kfree(conf->devlist);
270 kfree(conf); 291 kfree(conf);
271 mddev->private = NULL; 292 *private_conf = NULL;
272 return err; 293 return err;
273} 294}
274 295
@@ -319,10 +340,12 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
319 340
320static int raid0_run(mddev_t *mddev) 341static int raid0_run(mddev_t *mddev)
321{ 342{
343 raid0_conf_t *conf;
322 int ret; 344 int ret;
323 345
324 if (mddev->chunk_sectors == 0) { 346 if (mddev->chunk_sectors == 0) {
325 printk(KERN_ERR "md/raid0: chunk size must be set.\n"); 347 printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
348 mdname(mddev));
326 return -EINVAL; 349 return -EINVAL;
327 } 350 }
328 if (md_check_no_bitmap(mddev)) 351 if (md_check_no_bitmap(mddev))
@@ -330,15 +353,27 @@ static int raid0_run(mddev_t *mddev)
330 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 353 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
331 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 354 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
332 355
333 ret = create_strip_zones(mddev); 356 /* if private is not null, we are here after takeover */
334 if (ret < 0) 357 if (mddev->private == NULL) {
335 return ret; 358 ret = create_strip_zones(mddev, &conf);
359 if (ret < 0)
360 return ret;
361 mddev->private = conf;
362 }
363 conf = mddev->private;
364 if (conf->scale_raid_disks) {
365 int i;
366 for (i=0; i < conf->strip_zone[0].nb_dev; i++)
367 conf->devlist[i]->raid_disk /= conf->scale_raid_disks;
368 /* FIXME update sysfs rd links */
369 }
336 370
337 /* calculate array device size */ 371 /* calculate array device size */
338 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); 372 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
339 373
340 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", 374 printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
341 (unsigned long long)mddev->array_sectors); 375 mdname(mddev),
376 (unsigned long long)mddev->array_sectors);
342 /* calculate the max read-ahead size. 377 /* calculate the max read-ahead size.
343 * For read-ahead of large files to be effective, we need to 378 * For read-ahead of large files to be effective, we need to
344 * readahead at least twice a whole stripe. i.e. number of devices 379 * readahead at least twice a whole stripe. i.e. number of devices
@@ -402,6 +437,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
402 unsigned int sect_in_chunk; 437 unsigned int sect_in_chunk;
403 sector_t chunk; 438 sector_t chunk;
404 raid0_conf_t *conf = mddev->private; 439 raid0_conf_t *conf = mddev->private;
440 int raid_disks = conf->strip_zone[0].nb_dev;
405 unsigned int chunk_sects = mddev->chunk_sectors; 441 unsigned int chunk_sects = mddev->chunk_sectors;
406 442
407 if (is_power_of_2(chunk_sects)) { 443 if (is_power_of_2(chunk_sects)) {
@@ -424,7 +460,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
424 * + the position in the chunk 460 * + the position in the chunk
425 */ 461 */
426 *sector_offset = (chunk * chunk_sects) + sect_in_chunk; 462 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
427 return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks 463 return conf->devlist[(zone - conf->strip_zone)*raid_disks
428 + sector_div(sector, zone->nb_dev)]; 464 + sector_div(sector, zone->nb_dev)];
429} 465}
430 466
@@ -444,27 +480,18 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev,
444 } 480 }
445} 481}
446 482
447static int raid0_make_request(struct request_queue *q, struct bio *bio) 483static int raid0_make_request(mddev_t *mddev, struct bio *bio)
448{ 484{
449 mddev_t *mddev = q->queuedata;
450 unsigned int chunk_sects; 485 unsigned int chunk_sects;
451 sector_t sector_offset; 486 sector_t sector_offset;
452 struct strip_zone *zone; 487 struct strip_zone *zone;
453 mdk_rdev_t *tmp_dev; 488 mdk_rdev_t *tmp_dev;
454 const int rw = bio_data_dir(bio);
455 int cpu;
456 489
457 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 490 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
458 md_barrier_request(mddev, bio); 491 md_barrier_request(mddev, bio);
459 return 0; 492 return 0;
460 } 493 }
461 494
462 cpu = part_stat_lock();
463 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
464 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
465 bio_sectors(bio));
466 part_stat_unlock();
467
468 chunk_sects = mddev->chunk_sectors; 495 chunk_sects = mddev->chunk_sectors;
469 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { 496 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
470 sector_t sector = bio->bi_sector; 497 sector_t sector = bio->bi_sector;
@@ -482,9 +509,9 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
482 else 509 else
483 bp = bio_split(bio, chunk_sects - 510 bp = bio_split(bio, chunk_sects -
484 sector_div(sector, chunk_sects)); 511 sector_div(sector, chunk_sects));
485 if (raid0_make_request(q, &bp->bio1)) 512 if (raid0_make_request(mddev, &bp->bio1))
486 generic_make_request(&bp->bio1); 513 generic_make_request(&bp->bio1);
487 if (raid0_make_request(q, &bp->bio2)) 514 if (raid0_make_request(mddev, &bp->bio2))
488 generic_make_request(&bp->bio2); 515 generic_make_request(&bp->bio2);
489 516
490 bio_pair_release(bp); 517 bio_pair_release(bp);
@@ -504,9 +531,10 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
504 return 1; 531 return 1;
505 532
506bad_map: 533bad_map:
507 printk("raid0_make_request bug: can't convert block across chunks" 534 printk("md/raid0:%s: make_request bug: can't convert block across chunks"
508 " or bigger than %dk %llu %d\n", chunk_sects / 2, 535 " or bigger than %dk %llu %d\n",
509 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 536 mdname(mddev), chunk_sects / 2,
537 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
510 538
511 bio_io_error(bio); 539 bio_io_error(bio);
512 return 0; 540 return 0;
@@ -519,6 +547,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
519 int j, k, h; 547 int j, k, h;
520 char b[BDEVNAME_SIZE]; 548 char b[BDEVNAME_SIZE];
521 raid0_conf_t *conf = mddev->private; 549 raid0_conf_t *conf = mddev->private;
550 int raid_disks = conf->strip_zone[0].nb_dev;
522 551
523 sector_t zone_size; 552 sector_t zone_size;
524 sector_t zone_start = 0; 553 sector_t zone_start = 0;
@@ -529,7 +558,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
529 seq_printf(seq, "=["); 558 seq_printf(seq, "=[");
530 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 559 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
531 seq_printf(seq, "%s/", bdevname( 560 seq_printf(seq, "%s/", bdevname(
532 conf->devlist[j*mddev->raid_disks + k] 561 conf->devlist[j*raid_disks + k]
533 ->bdev, b)); 562 ->bdev, b));
534 563
535 zone_size = conf->strip_zone[j].zone_end - zone_start; 564 zone_size = conf->strip_zone[j].zone_end - zone_start;
@@ -544,6 +573,104 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
544 return; 573 return;
545} 574}
546 575
576static void *raid0_takeover_raid5(mddev_t *mddev)
577{
578 mdk_rdev_t *rdev;
579 raid0_conf_t *priv_conf;
580
581 if (mddev->degraded != 1) {
582 printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
583 mdname(mddev),
584 mddev->degraded);
585 return ERR_PTR(-EINVAL);
586 }
587
588 list_for_each_entry(rdev, &mddev->disks, same_set) {
589 /* check slot number for a disk */
590 if (rdev->raid_disk == mddev->raid_disks-1) {
591 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
592 mdname(mddev));
593 return ERR_PTR(-EINVAL);
594 }
595 }
596
597 /* Set new parameters */
598 mddev->new_level = 0;
599 mddev->new_chunk_sectors = mddev->chunk_sectors;
600 mddev->raid_disks--;
601 mddev->delta_disks = -1;
602 /* make sure it will be not marked as dirty */
603 mddev->recovery_cp = MaxSector;
604
605 create_strip_zones(mddev, &priv_conf);
606 return priv_conf;
607}
608
609static void *raid0_takeover_raid10(mddev_t *mddev)
610{
611 raid0_conf_t *priv_conf;
612
613 /* Check layout:
614 * - far_copies must be 1
615 * - near_copies must be 2
616 * - disks number must be even
617 * - all mirrors must be already degraded
618 */
619 if (mddev->layout != ((1 << 8) + 2)) {
620 printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n",
621 mdname(mddev),
622 mddev->layout);
623 return ERR_PTR(-EINVAL);
624 }
625 if (mddev->raid_disks & 1) {
626 printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n",
627 mdname(mddev));
628 return ERR_PTR(-EINVAL);
629 }
630 if (mddev->degraded != (mddev->raid_disks>>1)) {
631 printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
632 mdname(mddev));
633 return ERR_PTR(-EINVAL);
634 }
635
636 /* Set new parameters */
637 mddev->new_level = 0;
638 mddev->new_chunk_sectors = mddev->chunk_sectors;
639 mddev->delta_disks = - mddev->raid_disks / 2;
640 mddev->raid_disks += mddev->delta_disks;
641 mddev->degraded = 0;
642 /* make sure it will be not marked as dirty */
643 mddev->recovery_cp = MaxSector;
644
645 create_strip_zones(mddev, &priv_conf);
646 priv_conf->scale_raid_disks = 2;
647 return priv_conf;
648}
649
650static void *raid0_takeover(mddev_t *mddev)
651{
652 /* raid0 can take over:
653 * raid5 - providing it is Raid4 layout and one disk is faulty
654 * raid10 - assuming we have all necessary active disks
655 */
656 if (mddev->level == 5) {
657 if (mddev->layout == ALGORITHM_PARITY_N)
658 return raid0_takeover_raid5(mddev);
659
660 printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
661 mdname(mddev), ALGORITHM_PARITY_N);
662 }
663
664 if (mddev->level == 10)
665 return raid0_takeover_raid10(mddev);
666
667 return ERR_PTR(-EINVAL);
668}
669
670static void raid0_quiesce(mddev_t *mddev, int state)
671{
672}
673
547static struct mdk_personality raid0_personality= 674static struct mdk_personality raid0_personality=
548{ 675{
549 .name = "raid0", 676 .name = "raid0",
@@ -554,6 +681,8 @@ static struct mdk_personality raid0_personality=
554 .stop = raid0_stop, 681 .stop = raid0_stop,
555 .status = raid0_status, 682 .status = raid0_status,
556 .size = raid0_size, 683 .size = raid0_size,
684 .takeover = raid0_takeover,
685 .quiesce = raid0_quiesce,
557}; 686};
558 687
559static int __init raid0_init (void) 688static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 91f8e876ee64..d724e664ca4d 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -13,6 +13,9 @@ struct raid0_private_data
13 struct strip_zone *strip_zone; 13 struct strip_zone *strip_zone;
14 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 14 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
15 int nr_strip_zones; 15 int nr_strip_zones;
16 int scale_raid_disks; /* divide rdev->raid_disks by this in run()
17 * to handle conversion from raid10
18 */
16}; 19};
17 20
18typedef struct raid0_private_data raid0_conf_t; 21typedef struct raid0_private_data raid0_conf_t;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e59b10e66edb..a948da8012de 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -263,7 +263,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio)
263static void raid1_end_read_request(struct bio *bio, int error) 263static void raid1_end_read_request(struct bio *bio, int error)
264{ 264{
265 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 265 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
266 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 266 r1bio_t *r1_bio = bio->bi_private;
267 int mirror; 267 int mirror;
268 conf_t *conf = r1_bio->mddev->private; 268 conf_t *conf = r1_bio->mddev->private;
269 269
@@ -297,7 +297,8 @@ static void raid1_end_read_request(struct bio *bio, int error)
297 */ 297 */
298 char b[BDEVNAME_SIZE]; 298 char b[BDEVNAME_SIZE];
299 if (printk_ratelimit()) 299 if (printk_ratelimit())
300 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", 300 printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
301 mdname(conf->mddev),
301 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 302 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
302 reschedule_retry(r1_bio); 303 reschedule_retry(r1_bio);
303 } 304 }
@@ -308,7 +309,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
308static void raid1_end_write_request(struct bio *bio, int error) 309static void raid1_end_write_request(struct bio *bio, int error)
309{ 310{
310 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 311 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
311 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 312 r1bio_t *r1_bio = bio->bi_private;
312 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 313 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
313 conf_t *conf = r1_bio->mddev->private; 314 conf_t *conf = r1_bio->mddev->private;
314 struct bio *to_put = NULL; 315 struct bio *to_put = NULL;
@@ -418,7 +419,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
418 */ 419 */
419static int read_balance(conf_t *conf, r1bio_t *r1_bio) 420static int read_balance(conf_t *conf, r1bio_t *r1_bio)
420{ 421{
421 const unsigned long this_sector = r1_bio->sector; 422 const sector_t this_sector = r1_bio->sector;
422 int new_disk = conf->last_used, disk = new_disk; 423 int new_disk = conf->last_used, disk = new_disk;
423 int wonly_disk = -1; 424 int wonly_disk = -1;
424 const int sectors = r1_bio->sectors; 425 const int sectors = r1_bio->sectors;
@@ -434,7 +435,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
434 retry: 435 retry:
435 if (conf->mddev->recovery_cp < MaxSector && 436 if (conf->mddev->recovery_cp < MaxSector &&
436 (this_sector + sectors >= conf->next_resync)) { 437 (this_sector + sectors >= conf->next_resync)) {
437 /* Choose the first operation device, for consistancy */ 438 /* Choose the first operational device, for consistancy */
438 new_disk = 0; 439 new_disk = 0;
439 440
440 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
@@ -774,9 +775,8 @@ do_sync_io:
774 return NULL; 775 return NULL;
775} 776}
776 777
777static int make_request(struct request_queue *q, struct bio * bio) 778static int make_request(mddev_t *mddev, struct bio * bio)
778{ 779{
779 mddev_t *mddev = q->queuedata;
780 conf_t *conf = mddev->private; 780 conf_t *conf = mddev->private;
781 mirror_info_t *mirror; 781 mirror_info_t *mirror;
782 r1bio_t *r1_bio; 782 r1bio_t *r1_bio;
@@ -788,7 +788,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
788 struct page **behind_pages = NULL; 788 struct page **behind_pages = NULL;
789 const int rw = bio_data_dir(bio); 789 const int rw = bio_data_dir(bio);
790 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); 790 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
791 int cpu;
792 bool do_barriers; 791 bool do_barriers;
793 mdk_rdev_t *blocked_rdev; 792 mdk_rdev_t *blocked_rdev;
794 793
@@ -834,12 +833,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
834 833
835 bitmap = mddev->bitmap; 834 bitmap = mddev->bitmap;
836 835
837 cpu = part_stat_lock();
838 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
839 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
840 bio_sectors(bio));
841 part_stat_unlock();
842
843 /* 836 /*
844 * make_request() can abort the operation when READA is being 837 * make_request() can abort the operation when READA is being
845 * used and no empty request is available. 838 * used and no empty request is available.
@@ -866,6 +859,15 @@ static int make_request(struct request_queue *q, struct bio * bio)
866 } 859 }
867 mirror = conf->mirrors + rdisk; 860 mirror = conf->mirrors + rdisk;
868 861
862 if (test_bit(WriteMostly, &mirror->rdev->flags) &&
863 bitmap) {
864 /* Reading from a write-mostly device must
865 * take care not to over-take any writes
866 * that are 'behind'
867 */
868 wait_event(bitmap->behind_wait,
869 atomic_read(&bitmap->behind_writes) == 0);
870 }
869 r1_bio->read_disk = rdisk; 871 r1_bio->read_disk = rdisk;
870 872
871 read_bio = bio_clone(bio, GFP_NOIO); 873 read_bio = bio_clone(bio, GFP_NOIO);
@@ -912,9 +914,10 @@ static int make_request(struct request_queue *q, struct bio * bio)
912 if (test_bit(Faulty, &rdev->flags)) { 914 if (test_bit(Faulty, &rdev->flags)) {
913 rdev_dec_pending(rdev, mddev); 915 rdev_dec_pending(rdev, mddev);
914 r1_bio->bios[i] = NULL; 916 r1_bio->bios[i] = NULL;
915 } else 917 } else {
916 r1_bio->bios[i] = bio; 918 r1_bio->bios[i] = bio;
917 targets++; 919 targets++;
920 }
918 } else 921 } else
919 r1_bio->bios[i] = NULL; 922 r1_bio->bios[i] = NULL;
920 } 923 }
@@ -942,10 +945,14 @@ static int make_request(struct request_queue *q, struct bio * bio)
942 set_bit(R1BIO_Degraded, &r1_bio->state); 945 set_bit(R1BIO_Degraded, &r1_bio->state);
943 } 946 }
944 947
945 /* do behind I/O ? */ 948 /* do behind I/O ?
949 * Not if there are too many, or cannot allocate memory,
950 * or a reader on WriteMostly is waiting for behind writes
951 * to flush */
946 if (bitmap && 952 if (bitmap &&
947 (atomic_read(&bitmap->behind_writes) 953 (atomic_read(&bitmap->behind_writes)
948 < mddev->bitmap_info.max_write_behind) && 954 < mddev->bitmap_info.max_write_behind) &&
955 !waitqueue_active(&bitmap->behind_wait) &&
949 (behind_pages = alloc_behind_pages(bio)) != NULL) 956 (behind_pages = alloc_behind_pages(bio)) != NULL)
950 set_bit(R1BIO_BehindIO, &r1_bio->state); 957 set_bit(R1BIO_BehindIO, &r1_bio->state);
951 958
@@ -1070,21 +1077,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1070 } else 1077 } else
1071 set_bit(Faulty, &rdev->flags); 1078 set_bit(Faulty, &rdev->flags);
1072 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1079 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1073 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" 1080 printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
1074 "raid1: Operation continuing on %d devices.\n", 1081 KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
1075 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1082 mdname(mddev), bdevname(rdev->bdev, b),
1083 mdname(mddev), conf->raid_disks - mddev->degraded);
1076} 1084}
1077 1085
1078static void print_conf(conf_t *conf) 1086static void print_conf(conf_t *conf)
1079{ 1087{
1080 int i; 1088 int i;
1081 1089
1082 printk("RAID1 conf printout:\n"); 1090 printk(KERN_DEBUG "RAID1 conf printout:\n");
1083 if (!conf) { 1091 if (!conf) {
1084 printk("(!conf)\n"); 1092 printk(KERN_DEBUG "(!conf)\n");
1085 return; 1093 return;
1086 } 1094 }
1087 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1095 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1088 conf->raid_disks); 1096 conf->raid_disks);
1089 1097
1090 rcu_read_lock(); 1098 rcu_read_lock();
@@ -1092,7 +1100,7 @@ static void print_conf(conf_t *conf)
1092 char b[BDEVNAME_SIZE]; 1100 char b[BDEVNAME_SIZE];
1093 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 1101 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1094 if (rdev) 1102 if (rdev)
1095 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1103 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1096 i, !test_bit(In_sync, &rdev->flags), 1104 i, !test_bit(In_sync, &rdev->flags),
1097 !test_bit(Faulty, &rdev->flags), 1105 !test_bit(Faulty, &rdev->flags),
1098 bdevname(rdev->bdev,b)); 1106 bdevname(rdev->bdev,b));
@@ -1223,7 +1231,7 @@ abort:
1223 1231
1224static void end_sync_read(struct bio *bio, int error) 1232static void end_sync_read(struct bio *bio, int error)
1225{ 1233{
1226 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1234 r1bio_t *r1_bio = bio->bi_private;
1227 int i; 1235 int i;
1228 1236
1229 for (i=r1_bio->mddev->raid_disks; i--; ) 1237 for (i=r1_bio->mddev->raid_disks; i--; )
@@ -1246,7 +1254,7 @@ static void end_sync_read(struct bio *bio, int error)
1246static void end_sync_write(struct bio *bio, int error) 1254static void end_sync_write(struct bio *bio, int error)
1247{ 1255{
1248 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1249 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1257 r1bio_t *r1_bio = bio->bi_private;
1250 mddev_t *mddev = r1_bio->mddev; 1258 mddev_t *mddev = r1_bio->mddev;
1251 conf_t *conf = mddev->private; 1259 conf_t *conf = mddev->private;
1252 int i; 1260 int i;
@@ -1453,9 +1461,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1453 char b[BDEVNAME_SIZE]; 1461 char b[BDEVNAME_SIZE];
1454 /* Cannot read from anywhere, array is toast */ 1462 /* Cannot read from anywhere, array is toast */
1455 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1463 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1456 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1464 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1457 " for block %llu\n", 1465 " for block %llu\n",
1458 bdevname(bio->bi_bdev,b), 1466 mdname(mddev),
1467 bdevname(bio->bi_bdev, b),
1459 (unsigned long long)r1_bio->sector); 1468 (unsigned long long)r1_bio->sector);
1460 md_done_sync(mddev, r1_bio->sectors, 0); 1469 md_done_sync(mddev, r1_bio->sectors, 0);
1461 put_buf(r1_bio); 1470 put_buf(r1_bio);
@@ -1577,7 +1586,7 @@ static void fix_read_error(conf_t *conf, int read_disk,
1577 else { 1586 else {
1578 atomic_add(s, &rdev->corrected_errors); 1587 atomic_add(s, &rdev->corrected_errors);
1579 printk(KERN_INFO 1588 printk(KERN_INFO
1580 "raid1:%s: read error corrected " 1589 "md/raid1:%s: read error corrected "
1581 "(%d sectors at %llu on %s)\n", 1590 "(%d sectors at %llu on %s)\n",
1582 mdname(mddev), s, 1591 mdname(mddev), s,
1583 (unsigned long long)(sect + 1592 (unsigned long long)(sect +
@@ -1682,8 +1691,9 @@ static void raid1d(mddev_t *mddev)
1682 1691
1683 bio = r1_bio->bios[r1_bio->read_disk]; 1692 bio = r1_bio->bios[r1_bio->read_disk];
1684 if ((disk=read_balance(conf, r1_bio)) == -1) { 1693 if ((disk=read_balance(conf, r1_bio)) == -1) {
1685 printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1694 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1686 " read error for block %llu\n", 1695 " read error for block %llu\n",
1696 mdname(mddev),
1687 bdevname(bio->bi_bdev,b), 1697 bdevname(bio->bi_bdev,b),
1688 (unsigned long long)r1_bio->sector); 1698 (unsigned long long)r1_bio->sector);
1689 raid_end_bio_io(r1_bio); 1699 raid_end_bio_io(r1_bio);
@@ -1697,10 +1707,11 @@ static void raid1d(mddev_t *mddev)
1697 r1_bio->bios[r1_bio->read_disk] = bio; 1707 r1_bio->bios[r1_bio->read_disk] = bio;
1698 rdev = conf->mirrors[disk].rdev; 1708 rdev = conf->mirrors[disk].rdev;
1699 if (printk_ratelimit()) 1709 if (printk_ratelimit())
1700 printk(KERN_ERR "raid1: %s: redirecting sector %llu to" 1710 printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
1701 " another mirror\n", 1711 " other mirror: %s\n",
1702 bdevname(rdev->bdev,b), 1712 mdname(mddev),
1703 (unsigned long long)r1_bio->sector); 1713 (unsigned long long)r1_bio->sector,
1714 bdevname(rdev->bdev,b));
1704 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1715 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1705 bio->bi_bdev = rdev->bdev; 1716 bio->bi_bdev = rdev->bdev;
1706 bio->bi_end_io = raid1_end_read_request; 1717 bio->bi_end_io = raid1_end_read_request;
@@ -1755,13 +1766,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1755 int still_degraded = 0; 1766 int still_degraded = 0;
1756 1767
1757 if (!conf->r1buf_pool) 1768 if (!conf->r1buf_pool)
1758 {
1759/*
1760 printk("sync start - bitmap %p\n", mddev->bitmap);
1761*/
1762 if (init_resync(conf)) 1769 if (init_resync(conf))
1763 return 0; 1770 return 0;
1764 }
1765 1771
1766 max_sector = mddev->dev_sectors; 1772 max_sector = mddev->dev_sectors;
1767 if (sector_nr >= max_sector) { 1773 if (sector_nr >= max_sector) {
@@ -2042,7 +2048,7 @@ static conf_t *setup_conf(mddev_t *mddev)
2042 2048
2043 err = -EIO; 2049 err = -EIO;
2044 if (conf->last_used < 0) { 2050 if (conf->last_used < 0) {
2045 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 2051 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2046 mdname(mddev)); 2052 mdname(mddev));
2047 goto abort; 2053 goto abort;
2048 } 2054 }
@@ -2050,7 +2056,7 @@ static conf_t *setup_conf(mddev_t *mddev)
2050 conf->thread = md_register_thread(raid1d, mddev, NULL); 2056 conf->thread = md_register_thread(raid1d, mddev, NULL);
2051 if (!conf->thread) { 2057 if (!conf->thread) {
2052 printk(KERN_ERR 2058 printk(KERN_ERR
2053 "raid1: couldn't allocate thread for %s\n", 2059 "md/raid1:%s: couldn't allocate thread\n",
2054 mdname(mddev)); 2060 mdname(mddev));
2055 goto abort; 2061 goto abort;
2056 } 2062 }
@@ -2076,12 +2082,12 @@ static int run(mddev_t *mddev)
2076 mdk_rdev_t *rdev; 2082 mdk_rdev_t *rdev;
2077 2083
2078 if (mddev->level != 1) { 2084 if (mddev->level != 1) {
2079 printk("raid1: %s: raid level not set to mirroring (%d)\n", 2085 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
2080 mdname(mddev), mddev->level); 2086 mdname(mddev), mddev->level);
2081 return -EIO; 2087 return -EIO;
2082 } 2088 }
2083 if (mddev->reshape_position != MaxSector) { 2089 if (mddev->reshape_position != MaxSector) {
2084 printk("raid1: %s: reshape_position set but not supported\n", 2090 printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
2085 mdname(mddev)); 2091 mdname(mddev));
2086 return -EIO; 2092 return -EIO;
2087 } 2093 }
@@ -2124,11 +2130,11 @@ static int run(mddev_t *mddev)
2124 mddev->recovery_cp = MaxSector; 2130 mddev->recovery_cp = MaxSector;
2125 2131
2126 if (mddev->recovery_cp != MaxSector) 2132 if (mddev->recovery_cp != MaxSector)
2127 printk(KERN_NOTICE "raid1: %s is not clean" 2133 printk(KERN_NOTICE "md/raid1:%s: not clean"
2128 " -- starting background reconstruction\n", 2134 " -- starting background reconstruction\n",
2129 mdname(mddev)); 2135 mdname(mddev));
2130 printk(KERN_INFO 2136 printk(KERN_INFO
2131 "raid1: raid set %s active with %d out of %d mirrors\n", 2137 "md/raid1:%s: active with %d out of %d mirrors\n",
2132 mdname(mddev), mddev->raid_disks - mddev->degraded, 2138 mdname(mddev), mddev->raid_disks - mddev->degraded,
2133 mddev->raid_disks); 2139 mddev->raid_disks);
2134 2140
@@ -2152,15 +2158,14 @@ static int stop(mddev_t *mddev)
2152{ 2158{
2153 conf_t *conf = mddev->private; 2159 conf_t *conf = mddev->private;
2154 struct bitmap *bitmap = mddev->bitmap; 2160 struct bitmap *bitmap = mddev->bitmap;
2155 int behind_wait = 0;
2156 2161
2157 /* wait for behind writes to complete */ 2162 /* wait for behind writes to complete */
2158 while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 2163 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2159 behind_wait++; 2164 printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
2160 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); 2165 mdname(mddev));
2161 set_current_state(TASK_UNINTERRUPTIBLE);
2162 schedule_timeout(HZ); /* wait a second */
2163 /* need to kick something here to make sure I/O goes? */ 2166 /* need to kick something here to make sure I/O goes? */
2167 wait_event(bitmap->behind_wait,
2168 atomic_read(&bitmap->behind_writes) == 0);
2164 } 2169 }
2165 2170
2166 raise_barrier(conf); 2171 raise_barrier(conf);
@@ -2191,7 +2196,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2191 if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) 2196 if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2192 return -EINVAL; 2197 return -EINVAL;
2193 set_capacity(mddev->gendisk, mddev->array_sectors); 2198 set_capacity(mddev->gendisk, mddev->array_sectors);
2194 mddev->changed = 1;
2195 revalidate_disk(mddev->gendisk); 2199 revalidate_disk(mddev->gendisk);
2196 if (sectors > mddev->dev_sectors && 2200 if (sectors > mddev->dev_sectors &&
2197 mddev->recovery_cp == MaxSector) { 2201 mddev->recovery_cp == MaxSector) {
@@ -2286,9 +2290,9 @@ static int raid1_reshape(mddev_t *mddev)
2286 if (sysfs_create_link(&mddev->kobj, 2290 if (sysfs_create_link(&mddev->kobj,
2287 &rdev->kobj, nm)) 2291 &rdev->kobj, nm))
2288 printk(KERN_WARNING 2292 printk(KERN_WARNING
2289 "md/raid1: cannot register " 2293 "md/raid1:%s: cannot register "
2290 "%s for %s\n", 2294 "%s\n",
2291 nm, mdname(mddev)); 2295 mdname(mddev), nm);
2292 } 2296 }
2293 if (rdev) 2297 if (rdev)
2294 newmirrors[d2++].rdev = rdev; 2298 newmirrors[d2++].rdev = rdev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e2766d8251a1..03724992cdf2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
24#include <linux/seq_file.h> 24#include <linux/seq_file.h>
25#include "md.h" 25#include "md.h"
26#include "raid10.h" 26#include "raid10.h"
27#include "raid0.h"
27#include "bitmap.h" 28#include "bitmap.h"
28 29
29/* 30/*
@@ -255,7 +256,7 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
255static void raid10_end_read_request(struct bio *bio, int error) 256static void raid10_end_read_request(struct bio *bio, int error)
256{ 257{
257 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
258 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 259 r10bio_t *r10_bio = bio->bi_private;
259 int slot, dev; 260 int slot, dev;
260 conf_t *conf = r10_bio->mddev->private; 261 conf_t *conf = r10_bio->mddev->private;
261 262
@@ -285,7 +286,8 @@ static void raid10_end_read_request(struct bio *bio, int error)
285 */ 286 */
286 char b[BDEVNAME_SIZE]; 287 char b[BDEVNAME_SIZE];
287 if (printk_ratelimit()) 288 if (printk_ratelimit())
288 printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", 289 printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
290 mdname(conf->mddev),
289 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 291 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
290 reschedule_retry(r10_bio); 292 reschedule_retry(r10_bio);
291 } 293 }
@@ -296,7 +298,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
296static void raid10_end_write_request(struct bio *bio, int error) 298static void raid10_end_write_request(struct bio *bio, int error)
297{ 299{
298 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 300 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
299 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 301 r10bio_t *r10_bio = bio->bi_private;
300 int slot, dev; 302 int slot, dev;
301 conf_t *conf = r10_bio->mddev->private; 303 conf_t *conf = r10_bio->mddev->private;
302 304
@@ -494,7 +496,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,
494 */ 496 */
495static int read_balance(conf_t *conf, r10bio_t *r10_bio) 497static int read_balance(conf_t *conf, r10bio_t *r10_bio)
496{ 498{
497 const unsigned long this_sector = r10_bio->sector; 499 const sector_t this_sector = r10_bio->sector;
498 int disk, slot, nslot; 500 int disk, slot, nslot;
499 const int sectors = r10_bio->sectors; 501 const int sectors = r10_bio->sectors;
500 sector_t new_distance, current_distance; 502 sector_t new_distance, current_distance;
@@ -601,7 +603,7 @@ static void unplug_slaves(mddev_t *mddev)
601 int i; 603 int i;
602 604
603 rcu_read_lock(); 605 rcu_read_lock();
604 for (i=0; i<mddev->raid_disks; i++) { 606 for (i=0; i < conf->raid_disks; i++) {
605 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 607 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
606 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 608 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
607 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 609 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -635,7 +637,7 @@ static int raid10_congested(void *data, int bits)
635 if (mddev_congested(mddev, bits)) 637 if (mddev_congested(mddev, bits))
636 return 1; 638 return 1;
637 rcu_read_lock(); 639 rcu_read_lock();
638 for (i = 0; i < mddev->raid_disks && ret == 0; i++) { 640 for (i = 0; i < conf->raid_disks && ret == 0; i++) {
639 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 641 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
640 if (rdev && !test_bit(Faulty, &rdev->flags)) { 642 if (rdev && !test_bit(Faulty, &rdev->flags)) {
641 struct request_queue *q = bdev_get_queue(rdev->bdev); 643 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -788,14 +790,12 @@ static void unfreeze_array(conf_t *conf)
788 spin_unlock_irq(&conf->resync_lock); 790 spin_unlock_irq(&conf->resync_lock);
789} 791}
790 792
791static int make_request(struct request_queue *q, struct bio * bio) 793static int make_request(mddev_t *mddev, struct bio * bio)
792{ 794{
793 mddev_t *mddev = q->queuedata;
794 conf_t *conf = mddev->private; 795 conf_t *conf = mddev->private;
795 mirror_info_t *mirror; 796 mirror_info_t *mirror;
796 r10bio_t *r10_bio; 797 r10bio_t *r10_bio;
797 struct bio *read_bio; 798 struct bio *read_bio;
798 int cpu;
799 int i; 799 int i;
800 int chunk_sects = conf->chunk_mask + 1; 800 int chunk_sects = conf->chunk_mask + 1;
801 const int rw = bio_data_dir(bio); 801 const int rw = bio_data_dir(bio);
@@ -825,16 +825,16 @@ static int make_request(struct request_queue *q, struct bio * bio)
825 */ 825 */
826 bp = bio_split(bio, 826 bp = bio_split(bio,
827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
828 if (make_request(q, &bp->bio1)) 828 if (make_request(mddev, &bp->bio1))
829 generic_make_request(&bp->bio1); 829 generic_make_request(&bp->bio1);
830 if (make_request(q, &bp->bio2)) 830 if (make_request(mddev, &bp->bio2))
831 generic_make_request(&bp->bio2); 831 generic_make_request(&bp->bio2);
832 832
833 bio_pair_release(bp); 833 bio_pair_release(bp);
834 return 0; 834 return 0;
835 bad_map: 835 bad_map:
836 printk("raid10_make_request bug: can't convert block across chunks" 836 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
837 " or bigger than %dk %llu %d\n", chunk_sects/2, 837 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
838 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 838 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
839 839
840 bio_io_error(bio); 840 bio_io_error(bio);
@@ -850,12 +850,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
850 */ 850 */
851 wait_barrier(conf); 851 wait_barrier(conf);
852 852
853 cpu = part_stat_lock();
854 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
855 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
856 bio_sectors(bio));
857 part_stat_unlock();
858
859 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 853 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
860 854
861 r10_bio->master_bio = bio; 855 r10_bio->master_bio = bio;
@@ -1039,9 +1033,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1039 } 1033 }
1040 set_bit(Faulty, &rdev->flags); 1034 set_bit(Faulty, &rdev->flags);
1041 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1035 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1042 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" 1036 printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
1043 "raid10: Operation continuing on %d devices.\n", 1037 KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
1044 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1038 mdname(mddev), bdevname(rdev->bdev, b),
1039 mdname(mddev), conf->raid_disks - mddev->degraded);
1045} 1040}
1046 1041
1047static void print_conf(conf_t *conf) 1042static void print_conf(conf_t *conf)
@@ -1049,19 +1044,19 @@ static void print_conf(conf_t *conf)
1049 int i; 1044 int i;
1050 mirror_info_t *tmp; 1045 mirror_info_t *tmp;
1051 1046
1052 printk("RAID10 conf printout:\n"); 1047 printk(KERN_DEBUG "RAID10 conf printout:\n");
1053 if (!conf) { 1048 if (!conf) {
1054 printk("(!conf)\n"); 1049 printk(KERN_DEBUG "(!conf)\n");
1055 return; 1050 return;
1056 } 1051 }
1057 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1052 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1058 conf->raid_disks); 1053 conf->raid_disks);
1059 1054
1060 for (i = 0; i < conf->raid_disks; i++) { 1055 for (i = 0; i < conf->raid_disks; i++) {
1061 char b[BDEVNAME_SIZE]; 1056 char b[BDEVNAME_SIZE];
1062 tmp = conf->mirrors + i; 1057 tmp = conf->mirrors + i;
1063 if (tmp->rdev) 1058 if (tmp->rdev)
1064 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1059 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1065 i, !test_bit(In_sync, &tmp->rdev->flags), 1060 i, !test_bit(In_sync, &tmp->rdev->flags),
1066 !test_bit(Faulty, &tmp->rdev->flags), 1061 !test_bit(Faulty, &tmp->rdev->flags),
1067 bdevname(tmp->rdev->bdev,b)); 1062 bdevname(tmp->rdev->bdev,b));
@@ -1132,7 +1127,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1132 int mirror; 1127 int mirror;
1133 mirror_info_t *p; 1128 mirror_info_t *p;
1134 int first = 0; 1129 int first = 0;
1135 int last = mddev->raid_disks - 1; 1130 int last = conf->raid_disks - 1;
1136 1131
1137 if (mddev->recovery_cp < MaxSector) 1132 if (mddev->recovery_cp < MaxSector)
1138 /* only hot-add to in-sync arrays, as recovery is 1133 /* only hot-add to in-sync arrays, as recovery is
@@ -1224,7 +1219,7 @@ abort:
1224 1219
1225static void end_sync_read(struct bio *bio, int error) 1220static void end_sync_read(struct bio *bio, int error)
1226{ 1221{
1227 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1222 r10bio_t *r10_bio = bio->bi_private;
1228 conf_t *conf = r10_bio->mddev->private; 1223 conf_t *conf = r10_bio->mddev->private;
1229 int i,d; 1224 int i,d;
1230 1225
@@ -1261,7 +1256,7 @@ static void end_sync_read(struct bio *bio, int error)
1261static void end_sync_write(struct bio *bio, int error) 1256static void end_sync_write(struct bio *bio, int error)
1262{ 1257{
1263 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1264 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1259 r10bio_t *r10_bio = bio->bi_private;
1265 mddev_t *mddev = r10_bio->mddev; 1260 mddev_t *mddev = r10_bio->mddev;
1266 conf_t *conf = mddev->private; 1261 conf_t *conf = mddev->private;
1267 int i,d; 1262 int i,d;
@@ -1510,13 +1505,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1510 if (cur_read_error_count > max_read_errors) { 1505 if (cur_read_error_count > max_read_errors) {
1511 rcu_read_unlock(); 1506 rcu_read_unlock();
1512 printk(KERN_NOTICE 1507 printk(KERN_NOTICE
1513 "raid10: %s: Raid device exceeded " 1508 "md/raid10:%s: %s: Raid device exceeded "
1514 "read_error threshold " 1509 "read_error threshold "
1515 "[cur %d:max %d]\n", 1510 "[cur %d:max %d]\n",
1511 mdname(mddev),
1516 b, cur_read_error_count, max_read_errors); 1512 b, cur_read_error_count, max_read_errors);
1517 printk(KERN_NOTICE 1513 printk(KERN_NOTICE
1518 "raid10: %s: Failing raid " 1514 "md/raid10:%s: %s: Failing raid "
1519 "device\n", b); 1515 "device\n", mdname(mddev), b);
1520 md_error(mddev, conf->mirrors[d].rdev); 1516 md_error(mddev, conf->mirrors[d].rdev);
1521 return; 1517 return;
1522 } 1518 }
@@ -1586,15 +1582,16 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1586 == 0) { 1582 == 0) {
1587 /* Well, this device is dead */ 1583 /* Well, this device is dead */
1588 printk(KERN_NOTICE 1584 printk(KERN_NOTICE
1589 "raid10:%s: read correction " 1585 "md/raid10:%s: read correction "
1590 "write failed" 1586 "write failed"
1591 " (%d sectors at %llu on %s)\n", 1587 " (%d sectors at %llu on %s)\n",
1592 mdname(mddev), s, 1588 mdname(mddev), s,
1593 (unsigned long long)(sect+ 1589 (unsigned long long)(sect+
1594 rdev->data_offset), 1590 rdev->data_offset),
1595 bdevname(rdev->bdev, b)); 1591 bdevname(rdev->bdev, b));
1596 printk(KERN_NOTICE "raid10:%s: failing " 1592 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1597 "drive\n", 1593 "drive\n",
1594 mdname(mddev),
1598 bdevname(rdev->bdev, b)); 1595 bdevname(rdev->bdev, b));
1599 md_error(mddev, rdev); 1596 md_error(mddev, rdev);
1600 } 1597 }
@@ -1622,20 +1619,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1622 READ) == 0) { 1619 READ) == 0) {
1623 /* Well, this device is dead */ 1620 /* Well, this device is dead */
1624 printk(KERN_NOTICE 1621 printk(KERN_NOTICE
1625 "raid10:%s: unable to read back " 1622 "md/raid10:%s: unable to read back "
1626 "corrected sectors" 1623 "corrected sectors"
1627 " (%d sectors at %llu on %s)\n", 1624 " (%d sectors at %llu on %s)\n",
1628 mdname(mddev), s, 1625 mdname(mddev), s,
1629 (unsigned long long)(sect+ 1626 (unsigned long long)(sect+
1630 rdev->data_offset), 1627 rdev->data_offset),
1631 bdevname(rdev->bdev, b)); 1628 bdevname(rdev->bdev, b));
1632 printk(KERN_NOTICE "raid10:%s: failing drive\n", 1629 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1630 mdname(mddev),
1633 bdevname(rdev->bdev, b)); 1631 bdevname(rdev->bdev, b));
1634 1632
1635 md_error(mddev, rdev); 1633 md_error(mddev, rdev);
1636 } else { 1634 } else {
1637 printk(KERN_INFO 1635 printk(KERN_INFO
1638 "raid10:%s: read error corrected" 1636 "md/raid10:%s: read error corrected"
1639 " (%d sectors at %llu on %s)\n", 1637 " (%d sectors at %llu on %s)\n",
1640 mdname(mddev), s, 1638 mdname(mddev), s,
1641 (unsigned long long)(sect+ 1639 (unsigned long long)(sect+
@@ -1710,8 +1708,9 @@ static void raid10d(mddev_t *mddev)
1710 mddev->ro ? IO_BLOCKED : NULL; 1708 mddev->ro ? IO_BLOCKED : NULL;
1711 mirror = read_balance(conf, r10_bio); 1709 mirror = read_balance(conf, r10_bio);
1712 if (mirror == -1) { 1710 if (mirror == -1) {
1713 printk(KERN_ALERT "raid10: %s: unrecoverable I/O" 1711 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1714 " read error for block %llu\n", 1712 " read error for block %llu\n",
1713 mdname(mddev),
1715 bdevname(bio->bi_bdev,b), 1714 bdevname(bio->bi_bdev,b),
1716 (unsigned long long)r10_bio->sector); 1715 (unsigned long long)r10_bio->sector);
1717 raid_end_bio_io(r10_bio); 1716 raid_end_bio_io(r10_bio);
@@ -1721,8 +1720,9 @@ static void raid10d(mddev_t *mddev)
1721 bio_put(bio); 1720 bio_put(bio);
1722 rdev = conf->mirrors[mirror].rdev; 1721 rdev = conf->mirrors[mirror].rdev;
1723 if (printk_ratelimit()) 1722 if (printk_ratelimit())
1724 printk(KERN_ERR "raid10: %s: redirecting sector %llu to" 1723 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1725 " another mirror\n", 1724 " another mirror\n",
1725 mdname(mddev),
1726 bdevname(rdev->bdev,b), 1726 bdevname(rdev->bdev,b),
1727 (unsigned long long)r10_bio->sector); 1727 (unsigned long long)r10_bio->sector);
1728 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); 1728 bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
@@ -1980,7 +1980,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1980 r10_bio = rb2; 1980 r10_bio = rb2;
1981 if (!test_and_set_bit(MD_RECOVERY_INTR, 1981 if (!test_and_set_bit(MD_RECOVERY_INTR,
1982 &mddev->recovery)) 1982 &mddev->recovery))
1983 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", 1983 printk(KERN_INFO "md/raid10:%s: insufficient "
1984 "working devices for recovery.\n",
1984 mdname(mddev)); 1985 mdname(mddev));
1985 break; 1986 break;
1986 } 1987 }
@@ -2140,9 +2141,9 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2140 conf_t *conf = mddev->private; 2141 conf_t *conf = mddev->private;
2141 2142
2142 if (!raid_disks) 2143 if (!raid_disks)
2143 raid_disks = mddev->raid_disks; 2144 raid_disks = conf->raid_disks;
2144 if (!sectors) 2145 if (!sectors)
2145 sectors = mddev->dev_sectors; 2146 sectors = conf->dev_sectors;
2146 2147
2147 size = sectors >> conf->chunk_shift; 2148 size = sectors >> conf->chunk_shift;
2148 sector_div(size, conf->far_copies); 2149 sector_div(size, conf->far_copies);
@@ -2152,62 +2153,61 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2152 return size << conf->chunk_shift; 2153 return size << conf->chunk_shift;
2153} 2154}
2154 2155
2155static int run(mddev_t *mddev) 2156
2157static conf_t *setup_conf(mddev_t *mddev)
2156{ 2158{
2157 conf_t *conf; 2159 conf_t *conf = NULL;
2158 int i, disk_idx, chunk_size;
2159 mirror_info_t *disk;
2160 mdk_rdev_t *rdev;
2161 int nc, fc, fo; 2160 int nc, fc, fo;
2162 sector_t stride, size; 2161 sector_t stride, size;
2162 int err = -EINVAL;
2163 2163
2164 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || 2164 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
2165 !is_power_of_2(mddev->chunk_sectors)) { 2165 !is_power_of_2(mddev->chunk_sectors)) {
2166 printk(KERN_ERR "md/raid10: chunk size must be " 2166 printk(KERN_ERR "md/raid10:%s: chunk size must be "
2167 "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); 2167 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
2168 return -EINVAL; 2168 mdname(mddev), PAGE_SIZE);
2169 goto out;
2169 } 2170 }
2170 2171
2171 nc = mddev->layout & 255; 2172 nc = mddev->layout & 255;
2172 fc = (mddev->layout >> 8) & 255; 2173 fc = (mddev->layout >> 8) & 255;
2173 fo = mddev->layout & (1<<16); 2174 fo = mddev->layout & (1<<16);
2175
2174 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 2176 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2175 (mddev->layout >> 17)) { 2177 (mddev->layout >> 17)) {
2176 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", 2178 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
2177 mdname(mddev), mddev->layout); 2179 mdname(mddev), mddev->layout);
2178 goto out; 2180 goto out;
2179 } 2181 }
2180 /* 2182
2181 * copy the already verified devices into our private RAID10 2183 err = -ENOMEM;
2182 * bookkeeping area. [whatever we allocate in run(),
2183 * should be freed in stop()]
2184 */
2185 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 2184 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2186 mddev->private = conf; 2185 if (!conf)
2187 if (!conf) {
2188 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2189 mdname(mddev));
2190 goto out; 2186 goto out;
2191 } 2187
2192 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 2188 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2193 GFP_KERNEL); 2189 GFP_KERNEL);
2194 if (!conf->mirrors) { 2190 if (!conf->mirrors)
2195 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2191 goto out;
2196 mdname(mddev));
2197 goto out_free_conf;
2198 }
2199 2192
2200 conf->tmppage = alloc_page(GFP_KERNEL); 2193 conf->tmppage = alloc_page(GFP_KERNEL);
2201 if (!conf->tmppage) 2194 if (!conf->tmppage)
2202 goto out_free_conf; 2195 goto out;
2196
2203 2197
2204 conf->raid_disks = mddev->raid_disks; 2198 conf->raid_disks = mddev->raid_disks;
2205 conf->near_copies = nc; 2199 conf->near_copies = nc;
2206 conf->far_copies = fc; 2200 conf->far_copies = fc;
2207 conf->copies = nc*fc; 2201 conf->copies = nc*fc;
2208 conf->far_offset = fo; 2202 conf->far_offset = fo;
2209 conf->chunk_mask = mddev->chunk_sectors - 1; 2203 conf->chunk_mask = mddev->new_chunk_sectors - 1;
2210 conf->chunk_shift = ffz(~mddev->chunk_sectors); 2204 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
2205
2206 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2207 r10bio_pool_free, conf);
2208 if (!conf->r10bio_pool)
2209 goto out;
2210
2211 size = mddev->dev_sectors >> conf->chunk_shift; 2211 size = mddev->dev_sectors >> conf->chunk_shift;
2212 sector_div(size, fc); 2212 sector_div(size, fc);
2213 size = size * conf->raid_disks; 2213 size = size * conf->raid_disks;
@@ -2221,7 +2221,8 @@ static int run(mddev_t *mddev)
2221 */ 2221 */
2222 stride += conf->raid_disks - 1; 2222 stride += conf->raid_disks - 1;
2223 sector_div(stride, conf->raid_disks); 2223 sector_div(stride, conf->raid_disks);
2224 mddev->dev_sectors = stride << conf->chunk_shift; 2224
2225 conf->dev_sectors = stride << conf->chunk_shift;
2225 2226
2226 if (fo) 2227 if (fo)
2227 stride = 1; 2228 stride = 1;
@@ -2229,18 +2230,63 @@ static int run(mddev_t *mddev)
2229 sector_div(stride, fc); 2230 sector_div(stride, fc);
2230 conf->stride = stride << conf->chunk_shift; 2231 conf->stride = stride << conf->chunk_shift;
2231 2232
2232 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2233 r10bio_pool_free, conf);
2234 if (!conf->r10bio_pool) {
2235 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2236 mdname(mddev));
2237 goto out_free_conf;
2238 }
2239 2233
2240 conf->mddev = mddev;
2241 spin_lock_init(&conf->device_lock); 2234 spin_lock_init(&conf->device_lock);
2235 INIT_LIST_HEAD(&conf->retry_list);
2236
2237 spin_lock_init(&conf->resync_lock);
2238 init_waitqueue_head(&conf->wait_barrier);
2239
2240 conf->thread = md_register_thread(raid10d, mddev, NULL);
2241 if (!conf->thread)
2242 goto out;
2243
2244 conf->scale_disks = 0;
2245 conf->mddev = mddev;
2246 return conf;
2247
2248 out:
2249 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
2250 mdname(mddev));
2251 if (conf) {
2252 if (conf->r10bio_pool)
2253 mempool_destroy(conf->r10bio_pool);
2254 kfree(conf->mirrors);
2255 safe_put_page(conf->tmppage);
2256 kfree(conf);
2257 }
2258 return ERR_PTR(err);
2259}
2260
2261static int run(mddev_t *mddev)
2262{
2263 conf_t *conf;
2264 int i, disk_idx, chunk_size;
2265 mirror_info_t *disk;
2266 mdk_rdev_t *rdev;
2267 sector_t size;
2268
2269 /*
2270 * copy the already verified devices into our private RAID10
2271 * bookkeeping area. [whatever we allocate in run(),
2272 * should be freed in stop()]
2273 */
2274
2275 if (mddev->private == NULL) {
2276 conf = setup_conf(mddev);
2277 if (IS_ERR(conf))
2278 return PTR_ERR(conf);
2279 mddev->private = conf;
2280 }
2281 conf = mddev->private;
2282 if (!conf)
2283 goto out;
2284
2242 mddev->queue->queue_lock = &conf->device_lock; 2285 mddev->queue->queue_lock = &conf->device_lock;
2243 2286
2287 mddev->thread = conf->thread;
2288 conf->thread = NULL;
2289
2244 chunk_size = mddev->chunk_sectors << 9; 2290 chunk_size = mddev->chunk_sectors << 9;
2245 blk_queue_io_min(mddev->queue, chunk_size); 2291 blk_queue_io_min(mddev->queue, chunk_size);
2246 if (conf->raid_disks % conf->near_copies) 2292 if (conf->raid_disks % conf->near_copies)
@@ -2251,9 +2297,14 @@ static int run(mddev_t *mddev)
2251 2297
2252 list_for_each_entry(rdev, &mddev->disks, same_set) { 2298 list_for_each_entry(rdev, &mddev->disks, same_set) {
2253 disk_idx = rdev->raid_disk; 2299 disk_idx = rdev->raid_disk;
2254 if (disk_idx >= mddev->raid_disks 2300 if (disk_idx >= conf->raid_disks
2255 || disk_idx < 0) 2301 || disk_idx < 0)
2256 continue; 2302 continue;
2303 if (conf->scale_disks) {
2304 disk_idx *= conf->scale_disks;
2305 rdev->raid_disk = disk_idx;
2306 /* MOVE 'rd%d' link !! */
2307 }
2257 disk = conf->mirrors + disk_idx; 2308 disk = conf->mirrors + disk_idx;
2258 2309
2259 disk->rdev = rdev; 2310 disk->rdev = rdev;
@@ -2271,14 +2322,9 @@ static int run(mddev_t *mddev)
2271 2322
2272 disk->head_position = 0; 2323 disk->head_position = 0;
2273 } 2324 }
2274 INIT_LIST_HEAD(&conf->retry_list);
2275
2276 spin_lock_init(&conf->resync_lock);
2277 init_waitqueue_head(&conf->wait_barrier);
2278
2279 /* need to check that every block has at least one working mirror */ 2325 /* need to check that every block has at least one working mirror */
2280 if (!enough(conf)) { 2326 if (!enough(conf)) {
2281 printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", 2327 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2282 mdname(mddev)); 2328 mdname(mddev));
2283 goto out_free_conf; 2329 goto out_free_conf;
2284 } 2330 }
@@ -2297,28 +2343,21 @@ static int run(mddev_t *mddev)
2297 } 2343 }
2298 } 2344 }
2299 2345
2300
2301 mddev->thread = md_register_thread(raid10d, mddev, NULL);
2302 if (!mddev->thread) {
2303 printk(KERN_ERR
2304 "raid10: couldn't allocate thread for %s\n",
2305 mdname(mddev));
2306 goto out_free_conf;
2307 }
2308
2309 if (mddev->recovery_cp != MaxSector) 2346 if (mddev->recovery_cp != MaxSector)
2310 printk(KERN_NOTICE "raid10: %s is not clean" 2347 printk(KERN_NOTICE "md/raid10:%s: not clean"
2311 " -- starting background reconstruction\n", 2348 " -- starting background reconstruction\n",
2312 mdname(mddev)); 2349 mdname(mddev));
2313 printk(KERN_INFO 2350 printk(KERN_INFO
2314 "raid10: raid set %s active with %d out of %d devices\n", 2351 "md/raid10:%s: active with %d out of %d devices\n",
2315 mdname(mddev), mddev->raid_disks - mddev->degraded, 2352 mdname(mddev), conf->raid_disks - mddev->degraded,
2316 mddev->raid_disks); 2353 conf->raid_disks);
2317 /* 2354 /*
2318 * Ok, everything is just fine now 2355 * Ok, everything is just fine now
2319 */ 2356 */
2320 md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); 2357 mddev->dev_sectors = conf->dev_sectors;
2321 mddev->resync_max_sectors = raid10_size(mddev, 0, 0); 2358 size = raid10_size(mddev, 0, 0);
2359 md_set_array_sectors(mddev, size);
2360 mddev->resync_max_sectors = size;
2322 2361
2323 mddev->queue->unplug_fn = raid10_unplug; 2362 mddev->queue->unplug_fn = raid10_unplug;
2324 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 2363 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2336,7 +2375,7 @@ static int run(mddev_t *mddev)
2336 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2375 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2337 } 2376 }
2338 2377
2339 if (conf->near_copies < mddev->raid_disks) 2378 if (conf->near_copies < conf->raid_disks)
2340 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2379 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2341 md_integrity_register(mddev); 2380 md_integrity_register(mddev);
2342 return 0; 2381 return 0;
@@ -2348,6 +2387,7 @@ out_free_conf:
2348 kfree(conf->mirrors); 2387 kfree(conf->mirrors);
2349 kfree(conf); 2388 kfree(conf);
2350 mddev->private = NULL; 2389 mddev->private = NULL;
2390 md_unregister_thread(mddev->thread);
2351out: 2391out:
2352 return -EIO; 2392 return -EIO;
2353} 2393}
@@ -2384,6 +2424,61 @@ static void raid10_quiesce(mddev_t *mddev, int state)
2384 } 2424 }
2385} 2425}
2386 2426
2427static void *raid10_takeover_raid0(mddev_t *mddev)
2428{
2429 mdk_rdev_t *rdev;
2430 conf_t *conf;
2431
2432 if (mddev->degraded > 0) {
2433 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
2434 mdname(mddev));
2435 return ERR_PTR(-EINVAL);
2436 }
2437
2438 /* Update slot numbers to obtain
2439 * degraded raid10 with missing mirrors
2440 */
2441 list_for_each_entry(rdev, &mddev->disks, same_set) {
2442 rdev->raid_disk *= 2;
2443 }
2444
2445 /* Set new parameters */
2446 mddev->new_level = 10;
2447 /* new layout: far_copies = 1, near_copies = 2 */
2448 mddev->new_layout = (1<<8) + 2;
2449 mddev->new_chunk_sectors = mddev->chunk_sectors;
2450 mddev->delta_disks = mddev->raid_disks;
2451 mddev->degraded = mddev->raid_disks;
2452 mddev->raid_disks *= 2;
2453 /* make sure it will be not marked as dirty */
2454 mddev->recovery_cp = MaxSector;
2455
2456 conf = setup_conf(mddev);
2457 conf->scale_disks = 2;
2458 return conf;
2459}
2460
2461static void *raid10_takeover(mddev_t *mddev)
2462{
2463 struct raid0_private_data *raid0_priv;
2464
2465 /* raid10 can take over:
2466 * raid0 - providing it has only two drives
2467 */
2468 if (mddev->level == 0) {
2469 /* for raid0 takeover only one zone is supported */
2470 raid0_priv = mddev->private;
2471 if (raid0_priv->nr_strip_zones > 1) {
2472 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
2473 " with more than one zone.\n",
2474 mdname(mddev));
2475 return ERR_PTR(-EINVAL);
2476 }
2477 return raid10_takeover_raid0(mddev);
2478 }
2479 return ERR_PTR(-EINVAL);
2480}
2481
2387static struct mdk_personality raid10_personality = 2482static struct mdk_personality raid10_personality =
2388{ 2483{
2389 .name = "raid10", 2484 .name = "raid10",
@@ -2400,6 +2495,7 @@ static struct mdk_personality raid10_personality =
2400 .sync_request = sync_request, 2495 .sync_request = sync_request,
2401 .quiesce = raid10_quiesce, 2496 .quiesce = raid10_quiesce,
2402 .size = raid10_size, 2497 .size = raid10_size,
2498 .takeover = raid10_takeover,
2403}; 2499};
2404 2500
2405static int __init raid_init(void) 2501static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 59cd1efb8d30..3824a087e17c 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,9 +33,16 @@ struct r10_private_data_s {
33 * 1 stripe. 33 * 1 stripe.
34 */ 34 */
35 35
36 sector_t dev_sectors; /* temp copy of mddev->dev_sectors */
37
36 int chunk_shift; /* shift from chunks to sectors */ 38 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask; 39 sector_t chunk_mask;
38 40
41 int scale_disks; /* When starting array, multiply
42 * each ->raid_disk by this.
43 * Need for raid0->raid10 migration
44 */
45
39 struct list_head retry_list; 46 struct list_head retry_list;
40 /* queue pending writes and submit them on unplug */ 47 /* queue pending writes and submit them on unplug */
41 struct bio_list pending_bio_list; 48 struct bio_list pending_bio_list;
@@ -57,6 +64,11 @@ struct r10_private_data_s {
57 mempool_t *r10bio_pool; 64 mempool_t *r10bio_pool;
58 mempool_t *r10buf_pool; 65 mempool_t *r10buf_pool;
59 struct page *tmppage; 66 struct page *tmppage;
67
68 /* When taking over an array from a different personality, we store
69 * the new thread here until we fully activate the array.
70 */
71 struct mdk_thread_s *thread;
60}; 72};
61 73
62typedef struct r10_private_data_s conf_t; 74typedef struct r10_private_data_s conf_t;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 15348c393b5d..9ea17d6c799b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,7 @@
53#include <linux/slab.h> 53#include <linux/slab.h>
54#include "md.h" 54#include "md.h"
55#include "raid5.h" 55#include "raid5.h"
56#include "raid0.h"
56#include "bitmap.h" 57#include "bitmap.h"
57 58
58/* 59/*
@@ -1509,7 +1510,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1509 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1510 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1510 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1511 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1511 rdev = conf->disks[i].rdev; 1512 rdev = conf->disks[i].rdev;
1512 printk_rl(KERN_INFO "raid5:%s: read error corrected" 1513 printk_rl(KERN_INFO "md/raid:%s: read error corrected"
1513 " (%lu sectors at %llu on %s)\n", 1514 " (%lu sectors at %llu on %s)\n",
1514 mdname(conf->mddev), STRIPE_SECTORS, 1515 mdname(conf->mddev), STRIPE_SECTORS,
1515 (unsigned long long)(sh->sector 1516 (unsigned long long)(sh->sector
@@ -1529,7 +1530,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1529 atomic_inc(&rdev->read_errors); 1530 atomic_inc(&rdev->read_errors);
1530 if (conf->mddev->degraded >= conf->max_degraded) 1531 if (conf->mddev->degraded >= conf->max_degraded)
1531 printk_rl(KERN_WARNING 1532 printk_rl(KERN_WARNING
1532 "raid5:%s: read error not correctable " 1533 "md/raid:%s: read error not correctable "
1533 "(sector %llu on %s).\n", 1534 "(sector %llu on %s).\n",
1534 mdname(conf->mddev), 1535 mdname(conf->mddev),
1535 (unsigned long long)(sh->sector 1536 (unsigned long long)(sh->sector
@@ -1538,7 +1539,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1538 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1539 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1539 /* Oh, no!!! */ 1540 /* Oh, no!!! */
1540 printk_rl(KERN_WARNING 1541 printk_rl(KERN_WARNING
1541 "raid5:%s: read error NOT corrected!! " 1542 "md/raid:%s: read error NOT corrected!! "
1542 "(sector %llu on %s).\n", 1543 "(sector %llu on %s).\n",
1543 mdname(conf->mddev), 1544 mdname(conf->mddev),
1544 (unsigned long long)(sh->sector 1545 (unsigned long long)(sh->sector
@@ -1547,7 +1548,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1547 else if (atomic_read(&rdev->read_errors) 1548 else if (atomic_read(&rdev->read_errors)
1548 > conf->max_nr_stripes) 1549 > conf->max_nr_stripes)
1549 printk(KERN_WARNING 1550 printk(KERN_WARNING
1550 "raid5:%s: Too many read errors, failing device %s.\n", 1551 "md/raid:%s: Too many read errors, failing device %s.\n",
1551 mdname(conf->mddev), bdn); 1552 mdname(conf->mddev), bdn);
1552 else 1553 else
1553 retry = 1; 1554 retry = 1;
@@ -1619,8 +1620,8 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1619static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1620static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1620{ 1621{
1621 char b[BDEVNAME_SIZE]; 1622 char b[BDEVNAME_SIZE];
1622 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1623 raid5_conf_t *conf = mddev->private;
1623 pr_debug("raid5: error called\n"); 1624 pr_debug("raid456: error called\n");
1624 1625
1625 if (!test_bit(Faulty, &rdev->flags)) { 1626 if (!test_bit(Faulty, &rdev->flags)) {
1626 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1627 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1636,9 +1637,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1636 } 1637 }
1637 set_bit(Faulty, &rdev->flags); 1638 set_bit(Faulty, &rdev->flags);
1638 printk(KERN_ALERT 1639 printk(KERN_ALERT
1639 "raid5: Disk failure on %s, disabling device.\n" 1640 "md/raid:%s: Disk failure on %s, disabling device.\n"
1640 "raid5: Operation continuing on %d devices.\n", 1641 KERN_ALERT
1641 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1642 "md/raid:%s: Operation continuing on %d devices.\n",
1643 mdname(mddev),
1644 bdevname(rdev->bdev, b),
1645 mdname(mddev),
1646 conf->raid_disks - mddev->degraded);
1642 } 1647 }
1643} 1648}
1644 1649
@@ -1714,8 +1719,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1714 pd_idx = data_disks; 1719 pd_idx = data_disks;
1715 break; 1720 break;
1716 default: 1721 default:
1717 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1718 algorithm);
1719 BUG(); 1722 BUG();
1720 } 1723 }
1721 break; 1724 break;
@@ -1832,10 +1835,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1832 qd_idx = raid_disks - 1; 1835 qd_idx = raid_disks - 1;
1833 break; 1836 break;
1834 1837
1835
1836 default: 1838 default:
1837 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1838 algorithm);
1839 BUG(); 1839 BUG();
1840 } 1840 }
1841 break; 1841 break;
@@ -1898,8 +1898,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1898 case ALGORITHM_PARITY_N: 1898 case ALGORITHM_PARITY_N:
1899 break; 1899 break;
1900 default: 1900 default:
1901 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1902 algorithm);
1903 BUG(); 1901 BUG();
1904 } 1902 }
1905 break; 1903 break;
@@ -1958,8 +1956,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1958 i -= 1; 1956 i -= 1;
1959 break; 1957 break;
1960 default: 1958 default:
1961 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1962 algorithm);
1963 BUG(); 1959 BUG();
1964 } 1960 }
1965 break; 1961 break;
@@ -1972,7 +1968,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1972 previous, &dummy1, &sh2); 1968 previous, &dummy1, &sh2);
1973 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 1969 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
1974 || sh2.qd_idx != sh->qd_idx) { 1970 || sh2.qd_idx != sh->qd_idx) {
1975 printk(KERN_ERR "compute_blocknr: map not correct\n"); 1971 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
1972 mdname(conf->mddev));
1976 return 0; 1973 return 0;
1977 } 1974 }
1978 return r_sector; 1975 return r_sector;
@@ -3709,10 +3706,10 @@ static void raid5_align_endio(struct bio *bi, int error)
3709 3706
3710 bio_put(bi); 3707 bio_put(bi);
3711 3708
3712 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3713 conf = mddev->private;
3714 rdev = (void*)raid_bi->bi_next; 3709 rdev = (void*)raid_bi->bi_next;
3715 raid_bi->bi_next = NULL; 3710 raid_bi->bi_next = NULL;
3711 mddev = rdev->mddev;
3712 conf = mddev->private;
3716 3713
3717 rdev_dec_pending(rdev, conf->mddev); 3714 rdev_dec_pending(rdev, conf->mddev);
3718 3715
@@ -3749,9 +3746,8 @@ static int bio_fits_rdev(struct bio *bi)
3749} 3746}
3750 3747
3751 3748
3752static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3749static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3753{ 3750{
3754 mddev_t *mddev = q->queuedata;
3755 raid5_conf_t *conf = mddev->private; 3751 raid5_conf_t *conf = mddev->private;
3756 int dd_idx; 3752 int dd_idx;
3757 struct bio* align_bi; 3753 struct bio* align_bi;
@@ -3866,16 +3862,15 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3866 return sh; 3862 return sh;
3867} 3863}
3868 3864
3869static int make_request(struct request_queue *q, struct bio * bi) 3865static int make_request(mddev_t *mddev, struct bio * bi)
3870{ 3866{
3871 mddev_t *mddev = q->queuedata;
3872 raid5_conf_t *conf = mddev->private; 3867 raid5_conf_t *conf = mddev->private;
3873 int dd_idx; 3868 int dd_idx;
3874 sector_t new_sector; 3869 sector_t new_sector;
3875 sector_t logical_sector, last_sector; 3870 sector_t logical_sector, last_sector;
3876 struct stripe_head *sh; 3871 struct stripe_head *sh;
3877 const int rw = bio_data_dir(bi); 3872 const int rw = bio_data_dir(bi);
3878 int cpu, remaining; 3873 int remaining;
3879 3874
3880 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3875 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3881 /* Drain all pending writes. We only really need 3876 /* Drain all pending writes. We only really need
@@ -3890,15 +3885,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
3890 3885
3891 md_write_start(mddev, bi); 3886 md_write_start(mddev, bi);
3892 3887
3893 cpu = part_stat_lock();
3894 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
3895 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
3896 bio_sectors(bi));
3897 part_stat_unlock();
3898
3899 if (rw == READ && 3888 if (rw == READ &&
3900 mddev->reshape_position == MaxSector && 3889 mddev->reshape_position == MaxSector &&
3901 chunk_aligned_read(q,bi)) 3890 chunk_aligned_read(mddev,bi))
3902 return 0; 3891 return 0;
3903 3892
3904 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3893 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
@@ -3946,7 +3935,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3946 new_sector = raid5_compute_sector(conf, logical_sector, 3935 new_sector = raid5_compute_sector(conf, logical_sector,
3947 previous, 3936 previous,
3948 &dd_idx, NULL); 3937 &dd_idx, NULL);
3949 pr_debug("raid5: make_request, sector %llu logical %llu\n", 3938 pr_debug("raid456: make_request, sector %llu logical %llu\n",
3950 (unsigned long long)new_sector, 3939 (unsigned long long)new_sector,
3951 (unsigned long long)logical_sector); 3940 (unsigned long long)logical_sector);
3952 3941
@@ -4054,7 +4043,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4054 * As the reads complete, handle_stripe will copy the data 4043 * As the reads complete, handle_stripe will copy the data
4055 * into the destination stripe and release that stripe. 4044 * into the destination stripe and release that stripe.
4056 */ 4045 */
4057 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4046 raid5_conf_t *conf = mddev->private;
4058 struct stripe_head *sh; 4047 struct stripe_head *sh;
4059 sector_t first_sector, last_sector; 4048 sector_t first_sector, last_sector;
4060 int raid_disks = conf->previous_raid_disks; 4049 int raid_disks = conf->previous_raid_disks;
@@ -4263,7 +4252,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4263/* FIXME go_faster isn't used */ 4252/* FIXME go_faster isn't used */
4264static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4253static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
4265{ 4254{
4266 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4255 raid5_conf_t *conf = mddev->private;
4267 struct stripe_head *sh; 4256 struct stripe_head *sh;
4268 sector_t max_sector = mddev->dev_sectors; 4257 sector_t max_sector = mddev->dev_sectors;
4269 int sync_blocks; 4258 int sync_blocks;
@@ -4725,7 +4714,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4725 if (mddev->new_level != 5 4714 if (mddev->new_level != 5
4726 && mddev->new_level != 4 4715 && mddev->new_level != 4
4727 && mddev->new_level != 6) { 4716 && mddev->new_level != 6) {
4728 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4717 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
4729 mdname(mddev), mddev->new_level); 4718 mdname(mddev), mddev->new_level);
4730 return ERR_PTR(-EIO); 4719 return ERR_PTR(-EIO);
4731 } 4720 }
@@ -4733,12 +4722,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4733 && !algorithm_valid_raid5(mddev->new_layout)) || 4722 && !algorithm_valid_raid5(mddev->new_layout)) ||
4734 (mddev->new_level == 6 4723 (mddev->new_level == 6
4735 && !algorithm_valid_raid6(mddev->new_layout))) { 4724 && !algorithm_valid_raid6(mddev->new_layout))) {
4736 printk(KERN_ERR "raid5: %s: layout %d not supported\n", 4725 printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
4737 mdname(mddev), mddev->new_layout); 4726 mdname(mddev), mddev->new_layout);
4738 return ERR_PTR(-EIO); 4727 return ERR_PTR(-EIO);
4739 } 4728 }
4740 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4729 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4741 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4730 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
4742 mdname(mddev), mddev->raid_disks); 4731 mdname(mddev), mddev->raid_disks);
4743 return ERR_PTR(-EINVAL); 4732 return ERR_PTR(-EINVAL);
4744 } 4733 }
@@ -4746,8 +4735,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4746 if (!mddev->new_chunk_sectors || 4735 if (!mddev->new_chunk_sectors ||
4747 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4736 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4748 !is_power_of_2(mddev->new_chunk_sectors)) { 4737 !is_power_of_2(mddev->new_chunk_sectors)) {
4749 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4738 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
4750 mddev->new_chunk_sectors << 9, mdname(mddev)); 4739 mdname(mddev), mddev->new_chunk_sectors << 9);
4751 return ERR_PTR(-EINVAL); 4740 return ERR_PTR(-EINVAL);
4752 } 4741 }
4753 4742
@@ -4789,7 +4778,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4789 if (raid5_alloc_percpu(conf) != 0) 4778 if (raid5_alloc_percpu(conf) != 0)
4790 goto abort; 4779 goto abort;
4791 4780
4792 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4781 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4793 4782
4794 list_for_each_entry(rdev, &mddev->disks, same_set) { 4783 list_for_each_entry(rdev, &mddev->disks, same_set) {
4795 raid_disk = rdev->raid_disk; 4784 raid_disk = rdev->raid_disk;
@@ -4802,9 +4791,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4802 4791
4803 if (test_bit(In_sync, &rdev->flags)) { 4792 if (test_bit(In_sync, &rdev->flags)) {
4804 char b[BDEVNAME_SIZE]; 4793 char b[BDEVNAME_SIZE];
4805 printk(KERN_INFO "raid5: device %s operational as raid" 4794 printk(KERN_INFO "md/raid:%s: device %s operational as raid"
4806 " disk %d\n", bdevname(rdev->bdev,b), 4795 " disk %d\n",
4807 raid_disk); 4796 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
4808 } else 4797 } else
4809 /* Cannot rely on bitmap to complete recovery */ 4798 /* Cannot rely on bitmap to complete recovery */
4810 conf->fullsync = 1; 4799 conf->fullsync = 1;
@@ -4828,16 +4817,17 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4828 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4817 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4829 if (grow_stripes(conf, conf->max_nr_stripes)) { 4818 if (grow_stripes(conf, conf->max_nr_stripes)) {
4830 printk(KERN_ERR 4819 printk(KERN_ERR
4831 "raid5: couldn't allocate %dkB for buffers\n", memory); 4820 "md/raid:%s: couldn't allocate %dkB for buffers\n",
4821 mdname(mddev), memory);
4832 goto abort; 4822 goto abort;
4833 } else 4823 } else
4834 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4824 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
4835 memory, mdname(mddev)); 4825 mdname(mddev), memory);
4836 4826
4837 conf->thread = md_register_thread(raid5d, mddev, NULL); 4827 conf->thread = md_register_thread(raid5d, mddev, NULL);
4838 if (!conf->thread) { 4828 if (!conf->thread) {
4839 printk(KERN_ERR 4829 printk(KERN_ERR
4840 "raid5: couldn't allocate thread for %s\n", 4830 "md/raid:%s: couldn't allocate thread.\n",
4841 mdname(mddev)); 4831 mdname(mddev));
4842 goto abort; 4832 goto abort;
4843 } 4833 }
@@ -4888,7 +4878,7 @@ static int run(mddev_t *mddev)
4888 sector_t reshape_offset = 0; 4878 sector_t reshape_offset = 0;
4889 4879
4890 if (mddev->recovery_cp != MaxSector) 4880 if (mddev->recovery_cp != MaxSector)
4891 printk(KERN_NOTICE "raid5: %s is not clean" 4881 printk(KERN_NOTICE "md/raid:%s: not clean"
4892 " -- starting background reconstruction\n", 4882 " -- starting background reconstruction\n",
4893 mdname(mddev)); 4883 mdname(mddev));
4894 if (mddev->reshape_position != MaxSector) { 4884 if (mddev->reshape_position != MaxSector) {
@@ -4902,7 +4892,7 @@ static int run(mddev_t *mddev)
4902 int max_degraded = (mddev->level == 6 ? 2 : 1); 4892 int max_degraded = (mddev->level == 6 ? 2 : 1);
4903 4893
4904 if (mddev->new_level != mddev->level) { 4894 if (mddev->new_level != mddev->level) {
4905 printk(KERN_ERR "raid5: %s: unsupported reshape " 4895 printk(KERN_ERR "md/raid:%s: unsupported reshape "
4906 "required - aborting.\n", 4896 "required - aborting.\n",
4907 mdname(mddev)); 4897 mdname(mddev));
4908 return -EINVAL; 4898 return -EINVAL;
@@ -4915,8 +4905,8 @@ static int run(mddev_t *mddev)
4915 here_new = mddev->reshape_position; 4905 here_new = mddev->reshape_position;
4916 if (sector_div(here_new, mddev->new_chunk_sectors * 4906 if (sector_div(here_new, mddev->new_chunk_sectors *
4917 (mddev->raid_disks - max_degraded))) { 4907 (mddev->raid_disks - max_degraded))) {
4918 printk(KERN_ERR "raid5: reshape_position not " 4908 printk(KERN_ERR "md/raid:%s: reshape_position not "
4919 "on a stripe boundary\n"); 4909 "on a stripe boundary\n", mdname(mddev));
4920 return -EINVAL; 4910 return -EINVAL;
4921 } 4911 }
4922 reshape_offset = here_new * mddev->new_chunk_sectors; 4912 reshape_offset = here_new * mddev->new_chunk_sectors;
@@ -4937,8 +4927,9 @@ static int run(mddev_t *mddev)
4937 if ((here_new * mddev->new_chunk_sectors != 4927 if ((here_new * mddev->new_chunk_sectors !=
4938 here_old * mddev->chunk_sectors) || 4928 here_old * mddev->chunk_sectors) ||
4939 mddev->ro == 0) { 4929 mddev->ro == 0) {
4940 printk(KERN_ERR "raid5: in-place reshape must be started" 4930 printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
4941 " in read-only mode - aborting\n"); 4931 " in read-only mode - aborting\n",
4932 mdname(mddev));
4942 return -EINVAL; 4933 return -EINVAL;
4943 } 4934 }
4944 } else if (mddev->delta_disks < 0 4935 } else if (mddev->delta_disks < 0
@@ -4947,11 +4938,13 @@ static int run(mddev_t *mddev)
4947 : (here_new * mddev->new_chunk_sectors >= 4938 : (here_new * mddev->new_chunk_sectors >=
4948 here_old * mddev->chunk_sectors)) { 4939 here_old * mddev->chunk_sectors)) {
4949 /* Reading from the same stripe as writing to - bad */ 4940 /* Reading from the same stripe as writing to - bad */
4950 printk(KERN_ERR "raid5: reshape_position too early for " 4941 printk(KERN_ERR "md/raid:%s: reshape_position too early for "
4951 "auto-recovery - aborting.\n"); 4942 "auto-recovery - aborting.\n",
4943 mdname(mddev));
4952 return -EINVAL; 4944 return -EINVAL;
4953 } 4945 }
4954 printk(KERN_INFO "raid5: reshape will continue\n"); 4946 printk(KERN_INFO "md/raid:%s: reshape will continue\n",
4947 mdname(mddev));
4955 /* OK, we should be able to continue; */ 4948 /* OK, we should be able to continue; */
4956 } else { 4949 } else {
4957 BUG_ON(mddev->level != mddev->new_level); 4950 BUG_ON(mddev->level != mddev->new_level);
@@ -4993,18 +4986,6 @@ static int run(mddev_t *mddev)
4993 mddev->minor_version > 90) 4986 mddev->minor_version > 90)
4994 rdev->recovery_offset = reshape_offset; 4987 rdev->recovery_offset = reshape_offset;
4995 4988
4996 printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
4997 rdev->raid_disk, working_disks, conf->prev_algo,
4998 conf->previous_raid_disks, conf->max_degraded,
4999 conf->algorithm, conf->raid_disks,
5000 only_parity(rdev->raid_disk,
5001 conf->prev_algo,
5002 conf->previous_raid_disks,
5003 conf->max_degraded),
5004 only_parity(rdev->raid_disk,
5005 conf->algorithm,
5006 conf->raid_disks,
5007 conf->max_degraded));
5008 if (rdev->recovery_offset < reshape_offset) { 4989 if (rdev->recovery_offset < reshape_offset) {
5009 /* We need to check old and new layout */ 4990 /* We need to check old and new layout */
5010 if (!only_parity(rdev->raid_disk, 4991 if (!only_parity(rdev->raid_disk,
@@ -5025,7 +5006,7 @@ static int run(mddev_t *mddev)
5025 - working_disks); 5006 - working_disks);
5026 5007
5027 if (mddev->degraded > conf->max_degraded) { 5008 if (mddev->degraded > conf->max_degraded) {
5028 printk(KERN_ERR "raid5: not enough operational devices for %s" 5009 printk(KERN_ERR "md/raid:%s: not enough operational devices"
5029 " (%d/%d failed)\n", 5010 " (%d/%d failed)\n",
5030 mdname(mddev), mddev->degraded, conf->raid_disks); 5011 mdname(mddev), mddev->degraded, conf->raid_disks);
5031 goto abort; 5012 goto abort;
@@ -5039,32 +5020,32 @@ static int run(mddev_t *mddev)
5039 mddev->recovery_cp != MaxSector) { 5020 mddev->recovery_cp != MaxSector) {
5040 if (mddev->ok_start_degraded) 5021 if (mddev->ok_start_degraded)
5041 printk(KERN_WARNING 5022 printk(KERN_WARNING
5042 "raid5: starting dirty degraded array: %s" 5023 "md/raid:%s: starting dirty degraded array"
5043 "- data corruption possible.\n", 5024 " - data corruption possible.\n",
5044 mdname(mddev)); 5025 mdname(mddev));
5045 else { 5026 else {
5046 printk(KERN_ERR 5027 printk(KERN_ERR
5047 "raid5: cannot start dirty degraded array for %s\n", 5028 "md/raid:%s: cannot start dirty degraded array.\n",
5048 mdname(mddev)); 5029 mdname(mddev));
5049 goto abort; 5030 goto abort;
5050 } 5031 }
5051 } 5032 }
5052 5033
5053 if (mddev->degraded == 0) 5034 if (mddev->degraded == 0)
5054 printk("raid5: raid level %d set %s active with %d out of %d" 5035 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
5055 " devices, algorithm %d\n", conf->level, mdname(mddev), 5036 " devices, algorithm %d\n", mdname(mddev), conf->level,
5056 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5037 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
5057 mddev->new_layout); 5038 mddev->new_layout);
5058 else 5039 else
5059 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 5040 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
5060 " out of %d devices, algorithm %d\n", conf->level, 5041 " out of %d devices, algorithm %d\n",
5061 mdname(mddev), mddev->raid_disks - mddev->degraded, 5042 mdname(mddev), conf->level,
5062 mddev->raid_disks, mddev->new_layout); 5043 mddev->raid_disks - mddev->degraded,
5044 mddev->raid_disks, mddev->new_layout);
5063 5045
5064 print_raid5_conf(conf); 5046 print_raid5_conf(conf);
5065 5047
5066 if (conf->reshape_progress != MaxSector) { 5048 if (conf->reshape_progress != MaxSector) {
5067 printk("...ok start reshape thread\n");
5068 conf->reshape_safe = conf->reshape_progress; 5049 conf->reshape_safe = conf->reshape_progress;
5069 atomic_set(&conf->reshape_stripes, 0); 5050 atomic_set(&conf->reshape_stripes, 0);
5070 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5051 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5087,9 +5068,11 @@ static int run(mddev_t *mddev)
5087 } 5068 }
5088 5069
5089 /* Ok, everything is just fine now */ 5070 /* Ok, everything is just fine now */
5090 if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5071 if (mddev->to_remove == &raid5_attrs_group)
5072 mddev->to_remove = NULL;
5073 else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
5091 printk(KERN_WARNING 5074 printk(KERN_WARNING
5092 "raid5: failed to create sysfs attributes for %s\n", 5075 "md/raid:%s: failed to create sysfs attributes.\n",
5093 mdname(mddev)); 5076 mdname(mddev));
5094 5077
5095 mddev->queue->queue_lock = &conf->device_lock; 5078 mddev->queue->queue_lock = &conf->device_lock;
@@ -5119,22 +5102,21 @@ abort:
5119 free_conf(conf); 5102 free_conf(conf);
5120 } 5103 }
5121 mddev->private = NULL; 5104 mddev->private = NULL;
5122 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 5105 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
5123 return -EIO; 5106 return -EIO;
5124} 5107}
5125 5108
5126
5127
5128static int stop(mddev_t *mddev) 5109static int stop(mddev_t *mddev)
5129{ 5110{
5130 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5111 raid5_conf_t *conf = mddev->private;
5131 5112
5132 md_unregister_thread(mddev->thread); 5113 md_unregister_thread(mddev->thread);
5133 mddev->thread = NULL; 5114 mddev->thread = NULL;
5134 mddev->queue->backing_dev_info.congested_fn = NULL; 5115 mddev->queue->backing_dev_info.congested_fn = NULL;
5135 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5116 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5136 free_conf(conf); 5117 free_conf(conf);
5137 mddev->private = &raid5_attrs_group; 5118 mddev->private = NULL;
5119 mddev->to_remove = &raid5_attrs_group;
5138 return 0; 5120 return 0;
5139} 5121}
5140 5122
@@ -5175,7 +5157,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf)
5175 5157
5176static void status(struct seq_file *seq, mddev_t *mddev) 5158static void status(struct seq_file *seq, mddev_t *mddev)
5177{ 5159{
5178 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5160 raid5_conf_t *conf = mddev->private;
5179 int i; 5161 int i;
5180 5162
5181 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5163 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
@@ -5197,21 +5179,22 @@ static void print_raid5_conf (raid5_conf_t *conf)
5197 int i; 5179 int i;
5198 struct disk_info *tmp; 5180 struct disk_info *tmp;
5199 5181
5200 printk("RAID5 conf printout:\n"); 5182 printk(KERN_DEBUG "RAID conf printout:\n");
5201 if (!conf) { 5183 if (!conf) {
5202 printk("(conf==NULL)\n"); 5184 printk("(conf==NULL)\n");
5203 return; 5185 return;
5204 } 5186 }
5205 printk(" --- rd:%d wd:%d\n", conf->raid_disks, 5187 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
5206 conf->raid_disks - conf->mddev->degraded); 5188 conf->raid_disks,
5189 conf->raid_disks - conf->mddev->degraded);
5207 5190
5208 for (i = 0; i < conf->raid_disks; i++) { 5191 for (i = 0; i < conf->raid_disks; i++) {
5209 char b[BDEVNAME_SIZE]; 5192 char b[BDEVNAME_SIZE];
5210 tmp = conf->disks + i; 5193 tmp = conf->disks + i;
5211 if (tmp->rdev) 5194 if (tmp->rdev)
5212 printk(" disk %d, o:%d, dev:%s\n", 5195 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
5213 i, !test_bit(Faulty, &tmp->rdev->flags), 5196 i, !test_bit(Faulty, &tmp->rdev->flags),
5214 bdevname(tmp->rdev->bdev,b)); 5197 bdevname(tmp->rdev->bdev, b));
5215 } 5198 }
5216} 5199}
5217 5200
@@ -5334,7 +5317,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
5334 raid5_size(mddev, sectors, mddev->raid_disks)) 5317 raid5_size(mddev, sectors, mddev->raid_disks))
5335 return -EINVAL; 5318 return -EINVAL;
5336 set_capacity(mddev->gendisk, mddev->array_sectors); 5319 set_capacity(mddev->gendisk, mddev->array_sectors);
5337 mddev->changed = 1;
5338 revalidate_disk(mddev->gendisk); 5320 revalidate_disk(mddev->gendisk);
5339 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5321 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
5340 mddev->recovery_cp = mddev->dev_sectors; 5322 mddev->recovery_cp = mddev->dev_sectors;
@@ -5360,7 +5342,8 @@ static int check_stripe_cache(mddev_t *mddev)
5360 > conf->max_nr_stripes || 5342 > conf->max_nr_stripes ||
5361 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5343 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
5362 > conf->max_nr_stripes) { 5344 > conf->max_nr_stripes) {
5363 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 5345 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
5346 mdname(mddev),
5364 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5347 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
5365 / STRIPE_SIZE)*4); 5348 / STRIPE_SIZE)*4);
5366 return 0; 5349 return 0;
@@ -5431,7 +5414,7 @@ static int raid5_start_reshape(mddev_t *mddev)
5431 */ 5414 */
5432 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5415 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
5433 < mddev->array_sectors) { 5416 < mddev->array_sectors) {
5434 printk(KERN_ERR "md: %s: array size must be reduced " 5417 printk(KERN_ERR "md/raid:%s: array size must be reduced "
5435 "before number of disks\n", mdname(mddev)); 5418 "before number of disks\n", mdname(mddev));
5436 return -EINVAL; 5419 return -EINVAL;
5437 } 5420 }
@@ -5469,9 +5452,9 @@ static int raid5_start_reshape(mddev_t *mddev)
5469 if (sysfs_create_link(&mddev->kobj, 5452 if (sysfs_create_link(&mddev->kobj,
5470 &rdev->kobj, nm)) 5453 &rdev->kobj, nm))
5471 printk(KERN_WARNING 5454 printk(KERN_WARNING
5472 "raid5: failed to create " 5455 "md/raid:%s: failed to create "
5473 " link %s for %s\n", 5456 " link %s\n",
5474 nm, mdname(mddev)); 5457 mdname(mddev), nm);
5475 } else 5458 } else
5476 break; 5459 break;
5477 } 5460 }
@@ -5548,7 +5531,6 @@ static void raid5_finish_reshape(mddev_t *mddev)
5548 if (mddev->delta_disks > 0) { 5531 if (mddev->delta_disks > 0) {
5549 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5532 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5550 set_capacity(mddev->gendisk, mddev->array_sectors); 5533 set_capacity(mddev->gendisk, mddev->array_sectors);
5551 mddev->changed = 1;
5552 revalidate_disk(mddev->gendisk); 5534 revalidate_disk(mddev->gendisk);
5553 } else { 5535 } else {
5554 int d; 5536 int d;
@@ -5613,6 +5595,29 @@ static void raid5_quiesce(mddev_t *mddev, int state)
5613} 5595}
5614 5596
5615 5597
5598static void *raid45_takeover_raid0(mddev_t *mddev, int level)
5599{
5600 struct raid0_private_data *raid0_priv = mddev->private;
5601
5602 /* for raid0 takeover only one zone is supported */
5603 if (raid0_priv->nr_strip_zones > 1) {
5604 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
5605 mdname(mddev));
5606 return ERR_PTR(-EINVAL);
5607 }
5608
5609 mddev->new_level = level;
5610 mddev->new_layout = ALGORITHM_PARITY_N;
5611 mddev->new_chunk_sectors = mddev->chunk_sectors;
5612 mddev->raid_disks += 1;
5613 mddev->delta_disks = 1;
5614 /* make sure it will be not marked as dirty */
5615 mddev->recovery_cp = MaxSector;
5616
5617 return setup_conf(mddev);
5618}
5619
5620
5616static void *raid5_takeover_raid1(mddev_t *mddev) 5621static void *raid5_takeover_raid1(mddev_t *mddev)
5617{ 5622{
5618 int chunksect; 5623 int chunksect;
@@ -5737,12 +5742,13 @@ static int raid6_check_reshape(mddev_t *mddev)
5737static void *raid5_takeover(mddev_t *mddev) 5742static void *raid5_takeover(mddev_t *mddev)
5738{ 5743{
5739 /* raid5 can take over: 5744 /* raid5 can take over:
5740 * raid0 - if all devices are the same - make it a raid4 layout 5745 * raid0 - if there is only one strip zone - make it a raid4 layout
5741 * raid1 - if there are two drives. We need to know the chunk size 5746 * raid1 - if there are two drives. We need to know the chunk size
5742 * raid4 - trivial - just use a raid4 layout. 5747 * raid4 - trivial - just use a raid4 layout.
5743 * raid6 - Providing it is a *_6 layout 5748 * raid6 - Providing it is a *_6 layout
5744 */ 5749 */
5745 5750 if (mddev->level == 0)
5751 return raid45_takeover_raid0(mddev, 5);
5746 if (mddev->level == 1) 5752 if (mddev->level == 1)
5747 return raid5_takeover_raid1(mddev); 5753 return raid5_takeover_raid1(mddev);
5748 if (mddev->level == 4) { 5754 if (mddev->level == 4) {
@@ -5756,6 +5762,22 @@ static void *raid5_takeover(mddev_t *mddev)
5756 return ERR_PTR(-EINVAL); 5762 return ERR_PTR(-EINVAL);
5757} 5763}
5758 5764
5765static void *raid4_takeover(mddev_t *mddev)
5766{
5767 /* raid4 can take over:
5768 * raid0 - if there is only one strip zone
5769 * raid5 - if layout is right
5770 */
5771 if (mddev->level == 0)
5772 return raid45_takeover_raid0(mddev, 4);
5773 if (mddev->level == 5 &&
5774 mddev->layout == ALGORITHM_PARITY_N) {
5775 mddev->new_layout = 0;
5776 mddev->new_level = 4;
5777 return setup_conf(mddev);
5778 }
5779 return ERR_PTR(-EINVAL);
5780}
5759 5781
5760static struct mdk_personality raid5_personality; 5782static struct mdk_personality raid5_personality;
5761 5783
@@ -5871,6 +5893,7 @@ static struct mdk_personality raid4_personality =
5871 .start_reshape = raid5_start_reshape, 5893 .start_reshape = raid5_start_reshape,
5872 .finish_reshape = raid5_finish_reshape, 5894 .finish_reshape = raid5_finish_reshape,
5873 .quiesce = raid5_quiesce, 5895 .quiesce = raid5_quiesce,
5896 .takeover = raid4_takeover,
5874}; 5897};
5875 5898
5876static int __init raid5_init(void) 5899static int __init raid5_init(void)