about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/md/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--  drivers/md/md.c | 543
1 file changed, 332 insertions(+), 211 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cefd63daff31..46b3a044eadf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,8 +215,11 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
215 */ 215 */
216static int md_make_request(struct request_queue *q, struct bio *bio) 216static int md_make_request(struct request_queue *q, struct bio *bio)
217{ 217{
218 const int rw = bio_data_dir(bio);
218 mddev_t *mddev = q->queuedata; 219 mddev_t *mddev = q->queuedata;
219 int rv; 220 int rv;
221 int cpu;
222
220 if (mddev == NULL || mddev->pers == NULL) { 223 if (mddev == NULL || mddev->pers == NULL) {
221 bio_io_error(bio); 224 bio_io_error(bio);
222 return 0; 225 return 0;
@@ -237,13 +240,27 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
237 } 240 }
238 atomic_inc(&mddev->active_io); 241 atomic_inc(&mddev->active_io);
239 rcu_read_unlock(); 242 rcu_read_unlock();
240 rv = mddev->pers->make_request(q, bio); 243
244 rv = mddev->pers->make_request(mddev, bio);
245
246 cpu = part_stat_lock();
247 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
248 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
249 bio_sectors(bio));
250 part_stat_unlock();
251
241 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 252 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
242 wake_up(&mddev->sb_wait); 253 wake_up(&mddev->sb_wait);
243 254
244 return rv; 255 return rv;
245} 256}
246 257
258/* mddev_suspend makes sure no new requests are submitted
259 * to the device, and that any requests that have been submitted
260 * are completely handled.
261 * Once ->stop is called and completes, the module will be completely
262 * unused.
263 */
247static void mddev_suspend(mddev_t *mddev) 264static void mddev_suspend(mddev_t *mddev)
248{ 265{
249 BUG_ON(mddev->suspended); 266 BUG_ON(mddev->suspended);
@@ -251,13 +268,6 @@ static void mddev_suspend(mddev_t *mddev)
251 synchronize_rcu(); 268 synchronize_rcu();
252 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 269 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
253 mddev->pers->quiesce(mddev, 1); 270 mddev->pers->quiesce(mddev, 1);
254 md_unregister_thread(mddev->thread);
255 mddev->thread = NULL;
256 /* we now know that no code is executing in the personality module,
257 * except possibly the tail end of a ->bi_end_io function, but that
258 * is certain to complete before the module has a chance to get
259 * unloaded
260 */
261} 271}
262 272
263static void mddev_resume(mddev_t *mddev) 273static void mddev_resume(mddev_t *mddev)
@@ -344,7 +354,7 @@ static void md_submit_barrier(struct work_struct *ws)
344 bio_endio(bio, 0); 354 bio_endio(bio, 0);
345 else { 355 else {
346 bio->bi_rw &= ~(1<<BIO_RW_BARRIER); 356 bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
347 if (mddev->pers->make_request(mddev->queue, bio)) 357 if (mddev->pers->make_request(mddev, bio))
348 generic_make_request(bio); 358 generic_make_request(bio);
349 mddev->barrier = POST_REQUEST_BARRIER; 359 mddev->barrier = POST_REQUEST_BARRIER;
350 submit_barriers(mddev); 360 submit_barriers(mddev);
@@ -406,6 +416,27 @@ static void mddev_put(mddev_t *mddev)
406 spin_unlock(&all_mddevs_lock); 416 spin_unlock(&all_mddevs_lock);
407} 417}
408 418
419static void mddev_init(mddev_t *mddev)
420{
421 mutex_init(&mddev->open_mutex);
422 mutex_init(&mddev->reconfig_mutex);
423 mutex_init(&mddev->bitmap_info.mutex);
424 INIT_LIST_HEAD(&mddev->disks);
425 INIT_LIST_HEAD(&mddev->all_mddevs);
426 init_timer(&mddev->safemode_timer);
427 atomic_set(&mddev->active, 1);
428 atomic_set(&mddev->openers, 0);
429 atomic_set(&mddev->active_io, 0);
430 spin_lock_init(&mddev->write_lock);
431 atomic_set(&mddev->flush_pending, 0);
432 init_waitqueue_head(&mddev->sb_wait);
433 init_waitqueue_head(&mddev->recovery_wait);
434 mddev->reshape_position = MaxSector;
435 mddev->resync_min = 0;
436 mddev->resync_max = MaxSector;
437 mddev->level = LEVEL_NONE;
438}
439
409static mddev_t * mddev_find(dev_t unit) 440static mddev_t * mddev_find(dev_t unit)
410{ 441{
411 mddev_t *mddev, *new = NULL; 442 mddev_t *mddev, *new = NULL;
@@ -472,23 +503,7 @@ static mddev_t * mddev_find(dev_t unit)
472 else 503 else
473 new->md_minor = MINOR(unit) >> MdpMinorShift; 504 new->md_minor = MINOR(unit) >> MdpMinorShift;
474 505
475 mutex_init(&new->open_mutex); 506 mddev_init(new);
476 mutex_init(&new->reconfig_mutex);
477 mutex_init(&new->bitmap_info.mutex);
478 INIT_LIST_HEAD(&new->disks);
479 INIT_LIST_HEAD(&new->all_mddevs);
480 init_timer(&new->safemode_timer);
481 atomic_set(&new->active, 1);
482 atomic_set(&new->openers, 0);
483 atomic_set(&new->active_io, 0);
484 spin_lock_init(&new->write_lock);
485 atomic_set(&new->flush_pending, 0);
486 init_waitqueue_head(&new->sb_wait);
487 init_waitqueue_head(&new->recovery_wait);
488 new->reshape_position = MaxSector;
489 new->resync_min = 0;
490 new->resync_max = MaxSector;
491 new->level = LEVEL_NONE;
492 507
493 goto retry; 508 goto retry;
494} 509}
@@ -508,9 +523,36 @@ static inline int mddev_trylock(mddev_t * mddev)
508 return mutex_trylock(&mddev->reconfig_mutex); 523 return mutex_trylock(&mddev->reconfig_mutex);
509} 524}
510 525
511static inline void mddev_unlock(mddev_t * mddev) 526static struct attribute_group md_redundancy_group;
527
528static void mddev_unlock(mddev_t * mddev)
512{ 529{
513 mutex_unlock(&mddev->reconfig_mutex); 530 if (mddev->to_remove) {
531 /* These cannot be removed under reconfig_mutex as
532 * an access to the files will try to take reconfig_mutex
533 * while holding the file unremovable, which leads to
534 * a deadlock.
535 * So hold open_mutex instead - we are allowed to take
536 * it while holding reconfig_mutex, and md_run can
537 * use it to wait for the remove to complete.
538 */
539 struct attribute_group *to_remove = mddev->to_remove;
540 mddev->to_remove = NULL;
541 mutex_lock(&mddev->open_mutex);
542 mutex_unlock(&mddev->reconfig_mutex);
543
544 if (to_remove != &md_redundancy_group)
545 sysfs_remove_group(&mddev->kobj, to_remove);
546 if (mddev->pers == NULL ||
547 mddev->pers->sync_request == NULL) {
548 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
549 if (mddev->sysfs_action)
550 sysfs_put(mddev->sysfs_action);
551 mddev->sysfs_action = NULL;
552 }
553 mutex_unlock(&mddev->open_mutex);
554 } else
555 mutex_unlock(&mddev->reconfig_mutex);
514 556
515 md_wakeup_thread(mddev->thread); 557 md_wakeup_thread(mddev->thread);
516} 558}
@@ -1029,10 +1071,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1029 mddev->bitmap_info.default_offset; 1071 mddev->bitmap_info.default_offset;
1030 1072
1031 } else if (mddev->pers == NULL) { 1073 } else if (mddev->pers == NULL) {
1032 /* Insist on good event counter while assembling */ 1074 /* Insist on good event counter while assembling, except
1075 * for spares (which don't need an event count) */
1033 ++ev1; 1076 ++ev1;
1034 if (ev1 < mddev->events) 1077 if (sb->disks[rdev->desc_nr].state & (
1035 return -EINVAL; 1078 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1079 if (ev1 < mddev->events)
1080 return -EINVAL;
1036 } else if (mddev->bitmap) { 1081 } else if (mddev->bitmap) {
1037 /* if adding to array with a bitmap, then we can accept an 1082 /* if adding to array with a bitmap, then we can accept an
1038 * older device ... but not too old. 1083 * older device ... but not too old.
@@ -1428,10 +1473,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1428 } 1473 }
1429 1474
1430 } else if (mddev->pers == NULL) { 1475 } else if (mddev->pers == NULL) {
1431 /* Insist of good event counter while assembling */ 1476 /* Insist of good event counter while assembling, except for
1477 * spares (which don't need an event count) */
1432 ++ev1; 1478 ++ev1;
1433 if (ev1 < mddev->events) 1479 if (rdev->desc_nr >= 0 &&
1434 return -EINVAL; 1480 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1481 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1482 if (ev1 < mddev->events)
1483 return -EINVAL;
1435 } else if (mddev->bitmap) { 1484 } else if (mddev->bitmap) {
1436 /* If adding to array with a bitmap, then we can accept an 1485 /* If adding to array with a bitmap, then we can accept an
1437 * older device, but not too old. 1486 * older device, but not too old.
@@ -1766,7 +1815,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1766 kobject_del(&rdev->kobj); 1815 kobject_del(&rdev->kobj);
1767 goto fail; 1816 goto fail;
1768 } 1817 }
1769 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); 1818 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state");
1770 1819
1771 list_add_rcu(&rdev->same_set, &mddev->disks); 1820 list_add_rcu(&rdev->same_set, &mddev->disks);
1772 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1821 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
@@ -2047,7 +2096,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
2047 if (rdev->sb_events == mddev->events || 2096 if (rdev->sb_events == mddev->events ||
2048 (nospares && 2097 (nospares &&
2049 rdev->raid_disk < 0 && 2098 rdev->raid_disk < 0 &&
2050 (rdev->sb_events&1)==0 &&
2051 rdev->sb_events+1 == mddev->events)) { 2099 rdev->sb_events+1 == mddev->events)) {
2052 /* Don't update this superblock */ 2100 /* Don't update this superblock */
2053 rdev->sb_loaded = 2; 2101 rdev->sb_loaded = 2;
@@ -2100,28 +2148,14 @@ repeat:
2100 * and 'events' is odd, we can roll back to the previous clean state */ 2148 * and 'events' is odd, we can roll back to the previous clean state */
2101 if (nospares 2149 if (nospares
2102 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2150 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2103 && (mddev->events & 1) 2151 && mddev->can_decrease_events
2104 && mddev->events != 1) 2152 && mddev->events != 1) {
2105 mddev->events--; 2153 mddev->events--;
2106 else { 2154 mddev->can_decrease_events = 0;
2155 } else {
2107 /* otherwise we have to go forward and ... */ 2156 /* otherwise we have to go forward and ... */
2108 mddev->events ++; 2157 mddev->events ++;
2109 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 2158 mddev->can_decrease_events = nospares;
2110 /* .. if the array isn't clean, an 'even' event must also go
2111 * to spares. */
2112 if ((mddev->events&1)==0) {
2113 nospares = 0;
2114 sync_req = 2; /* force a second update to get the
2115 * even/odd in sync */
2116 }
2117 } else {
2118 /* otherwise an 'odd' event must go to spares */
2119 if ((mddev->events&1)) {
2120 nospares = 0;
2121 sync_req = 2; /* force a second update to get the
2122 * even/odd in sync */
2123 }
2124 }
2125 } 2159 }
2126 2160
2127 if (!mddev->events) { 2161 if (!mddev->events) {
@@ -2365,6 +2399,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2365 return err; 2399 return err;
2366 sprintf(nm, "rd%d", rdev->raid_disk); 2400 sprintf(nm, "rd%d", rdev->raid_disk);
2367 sysfs_remove_link(&rdev->mddev->kobj, nm); 2401 sysfs_remove_link(&rdev->mddev->kobj, nm);
2402 rdev->raid_disk = -1;
2368 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2403 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2369 md_wakeup_thread(rdev->mddev->thread); 2404 md_wakeup_thread(rdev->mddev->thread);
2370 } else if (rdev->mddev->pers) { 2405 } else if (rdev->mddev->pers) {
@@ -2780,8 +2815,9 @@ static void analyze_sbs(mddev_t * mddev)
2780 2815
2781 i = 0; 2816 i = 0;
2782 rdev_for_each(rdev, tmp, mddev) { 2817 rdev_for_each(rdev, tmp, mddev) {
2783 if (rdev->desc_nr >= mddev->max_disks || 2818 if (mddev->max_disks &&
2784 i > mddev->max_disks) { 2819 (rdev->desc_nr >= mddev->max_disks ||
2820 i > mddev->max_disks)) {
2785 printk(KERN_WARNING 2821 printk(KERN_WARNING
2786 "md: %s: %s: only %d devices permitted\n", 2822 "md: %s: %s: only %d devices permitted\n",
2787 mdname(mddev), bdevname(rdev->bdev, b), 2823 mdname(mddev), bdevname(rdev->bdev, b),
@@ -2897,9 +2933,10 @@ level_show(mddev_t *mddev, char *page)
2897static ssize_t 2933static ssize_t
2898level_store(mddev_t *mddev, const char *buf, size_t len) 2934level_store(mddev_t *mddev, const char *buf, size_t len)
2899{ 2935{
2900 char level[16]; 2936 char clevel[16];
2901 ssize_t rv = len; 2937 ssize_t rv = len;
2902 struct mdk_personality *pers; 2938 struct mdk_personality *pers;
2939 long level;
2903 void *priv; 2940 void *priv;
2904 mdk_rdev_t *rdev; 2941 mdk_rdev_t *rdev;
2905 2942
@@ -2932,19 +2969,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2932 } 2969 }
2933 2970
2934 /* Now find the new personality */ 2971 /* Now find the new personality */
2935 if (len == 0 || len >= sizeof(level)) 2972 if (len == 0 || len >= sizeof(clevel))
2936 return -EINVAL; 2973 return -EINVAL;
2937 strncpy(level, buf, len); 2974 strncpy(clevel, buf, len);
2938 if (level[len-1] == '\n') 2975 if (clevel[len-1] == '\n')
2939 len--; 2976 len--;
2940 level[len] = 0; 2977 clevel[len] = 0;
2978 if (strict_strtol(clevel, 10, &level))
2979 level = LEVEL_NONE;
2941 2980
2942 request_module("md-%s", level); 2981 if (request_module("md-%s", clevel) != 0)
2982 request_module("md-level-%s", clevel);
2943 spin_lock(&pers_lock); 2983 spin_lock(&pers_lock);
2944 pers = find_pers(LEVEL_NONE, level); 2984 pers = find_pers(level, clevel);
2945 if (!pers || !try_module_get(pers->owner)) { 2985 if (!pers || !try_module_get(pers->owner)) {
2946 spin_unlock(&pers_lock); 2986 spin_unlock(&pers_lock);
2947 printk(KERN_WARNING "md: personality %s not loaded\n", level); 2987 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
2948 return -EINVAL; 2988 return -EINVAL;
2949 } 2989 }
2950 spin_unlock(&pers_lock); 2990 spin_unlock(&pers_lock);
@@ -2957,7 +2997,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2957 if (!pers->takeover) { 2997 if (!pers->takeover) {
2958 module_put(pers->owner); 2998 module_put(pers->owner);
2959 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2999 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2960 mdname(mddev), level); 3000 mdname(mddev), clevel);
2961 return -EINVAL; 3001 return -EINVAL;
2962 } 3002 }
2963 3003
@@ -2973,13 +3013,44 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2973 mddev->delta_disks = 0; 3013 mddev->delta_disks = 0;
2974 module_put(pers->owner); 3014 module_put(pers->owner);
2975 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3015 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2976 mdname(mddev), level); 3016 mdname(mddev), clevel);
2977 return PTR_ERR(priv); 3017 return PTR_ERR(priv);
2978 } 3018 }
2979 3019
2980 /* Looks like we have a winner */ 3020 /* Looks like we have a winner */
2981 mddev_suspend(mddev); 3021 mddev_suspend(mddev);
2982 mddev->pers->stop(mddev); 3022 mddev->pers->stop(mddev);
3023
3024 if (mddev->pers->sync_request == NULL &&
3025 pers->sync_request != NULL) {
3026 /* need to add the md_redundancy_group */
3027 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3028 printk(KERN_WARNING
3029 "md: cannot register extra attributes for %s\n",
3030 mdname(mddev));
3031 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
3032 }
3033 if (mddev->pers->sync_request != NULL &&
3034 pers->sync_request == NULL) {
3035 /* need to remove the md_redundancy_group */
3036 if (mddev->to_remove == NULL)
3037 mddev->to_remove = &md_redundancy_group;
3038 }
3039
3040 if (mddev->pers->sync_request == NULL &&
3041 mddev->external) {
3042 /* We are converting from a no-redundancy array
3043 * to a redundancy array and metadata is managed
3044 * externally so we need to be sure that writes
3045 * won't block due to a need to transition
3046 * clean->dirty
3047 * until external management is started.
3048 */
3049 mddev->in_sync = 0;
3050 mddev->safemode_delay = 0;
3051 mddev->safemode = 0;
3052 }
3053
2983 module_put(mddev->pers->owner); 3054 module_put(mddev->pers->owner);
2984 /* Invalidate devices that are now superfluous */ 3055 /* Invalidate devices that are now superfluous */
2985 list_for_each_entry(rdev, &mddev->disks, same_set) 3056 list_for_each_entry(rdev, &mddev->disks, same_set)
@@ -2994,11 +3065,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2994 mddev->layout = mddev->new_layout; 3065 mddev->layout = mddev->new_layout;
2995 mddev->chunk_sectors = mddev->new_chunk_sectors; 3066 mddev->chunk_sectors = mddev->new_chunk_sectors;
2996 mddev->delta_disks = 0; 3067 mddev->delta_disks = 0;
3068 if (mddev->pers->sync_request == NULL) {
3069 /* this is now an array without redundancy, so
3070 * it must always be in_sync
3071 */
3072 mddev->in_sync = 1;
3073 del_timer_sync(&mddev->safemode_timer);
3074 }
2997 pers->run(mddev); 3075 pers->run(mddev);
2998 mddev_resume(mddev); 3076 mddev_resume(mddev);
2999 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3077 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3000 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3078 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3001 md_wakeup_thread(mddev->thread); 3079 md_wakeup_thread(mddev->thread);
3080 sysfs_notify(&mddev->kobj, NULL, "level");
3081 md_new_event(mddev);
3002 return rv; 3082 return rv;
3003} 3083}
3004 3084
@@ -3237,6 +3317,7 @@ array_state_show(mddev_t *mddev, char *page)
3237} 3317}
3238 3318
3239static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3319static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3320static int md_set_readonly(mddev_t * mddev, int is_open);
3240static int do_md_run(mddev_t * mddev); 3321static int do_md_run(mddev_t * mddev);
3241static int restart_array(mddev_t *mddev); 3322static int restart_array(mddev_t *mddev);
3242 3323
@@ -3267,7 +3348,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
3267 break; /* not supported yet */ 3348 break; /* not supported yet */
3268 case readonly: 3349 case readonly:
3269 if (mddev->pers) 3350 if (mddev->pers)
3270 err = do_md_stop(mddev, 1, 0); 3351 err = md_set_readonly(mddev, 0);
3271 else { 3352 else {
3272 mddev->ro = 1; 3353 mddev->ro = 1;
3273 set_disk_ro(mddev->gendisk, 1); 3354 set_disk_ro(mddev->gendisk, 1);
@@ -3277,7 +3358,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
3277 case read_auto: 3358 case read_auto:
3278 if (mddev->pers) { 3359 if (mddev->pers) {
3279 if (mddev->ro == 0) 3360 if (mddev->ro == 0)
3280 err = do_md_stop(mddev, 1, 0); 3361 err = md_set_readonly(mddev, 0);
3281 else if (mddev->ro == 1) 3362 else if (mddev->ro == 1)
3282 err = restart_array(mddev); 3363 err = restart_array(mddev);
3283 if (err == 0) { 3364 if (err == 0) {
@@ -4082,15 +4163,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
4082{ 4163{
4083 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4164 mddev_t *mddev = container_of(ws, mddev_t, del_work);
4084 4165
4085 if (mddev->private) {
4086 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
4087 if (mddev->private != (void*)1)
4088 sysfs_remove_group(&mddev->kobj, mddev->private);
4089 if (mddev->sysfs_action)
4090 sysfs_put(mddev->sysfs_action);
4091 mddev->sysfs_action = NULL;
4092 mddev->private = NULL;
4093 }
4094 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4166 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4095 kobject_del(&mddev->kobj); 4167 kobject_del(&mddev->kobj);
4096 kobject_put(&mddev->kobj); 4168 kobject_put(&mddev->kobj);
@@ -4189,7 +4261,7 @@ static int md_alloc(dev_t dev, char *name)
4189 mutex_unlock(&disks_mutex); 4261 mutex_unlock(&disks_mutex);
4190 if (!error) { 4262 if (!error) {
4191 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4263 kobject_uevent(&mddev->kobj, KOBJ_ADD);
4192 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 4264 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state");
4193 } 4265 }
4194 mddev_put(mddev); 4266 mddev_put(mddev);
4195 return error; 4267 return error;
@@ -4234,11 +4306,10 @@ static void md_safemode_timeout(unsigned long data)
4234 4306
4235static int start_dirty_degraded; 4307static int start_dirty_degraded;
4236 4308
4237static int do_md_run(mddev_t * mddev) 4309static int md_run(mddev_t *mddev)
4238{ 4310{
4239 int err; 4311 int err;
4240 mdk_rdev_t *rdev; 4312 mdk_rdev_t *rdev;
4241 struct gendisk *disk;
4242 struct mdk_personality *pers; 4313 struct mdk_personality *pers;
4243 4314
4244 if (list_empty(&mddev->disks)) 4315 if (list_empty(&mddev->disks))
@@ -4248,6 +4319,13 @@ static int do_md_run(mddev_t * mddev)
4248 if (mddev->pers) 4319 if (mddev->pers)
4249 return -EBUSY; 4320 return -EBUSY;
4250 4321
4322 /* These two calls synchronise us with the
4323 * sysfs_remove_group calls in mddev_unlock,
4324 * so they must have completed.
4325 */
4326 mutex_lock(&mddev->open_mutex);
4327 mutex_unlock(&mddev->open_mutex);
4328
4251 /* 4329 /*
4252 * Analyze all RAID superblock(s) 4330 * Analyze all RAID superblock(s)
4253 */ 4331 */
@@ -4296,8 +4374,6 @@ static int do_md_run(mddev_t * mddev)
4296 sysfs_notify_dirent(rdev->sysfs_state); 4374 sysfs_notify_dirent(rdev->sysfs_state);
4297 } 4375 }
4298 4376
4299 disk = mddev->gendisk;
4300
4301 spin_lock(&pers_lock); 4377 spin_lock(&pers_lock);
4302 pers = find_pers(mddev->level, mddev->clevel); 4378 pers = find_pers(mddev->level, mddev->clevel);
4303 if (!pers || !try_module_get(pers->owner)) { 4379 if (!pers || !try_module_get(pers->owner)) {
@@ -4398,7 +4474,7 @@ static int do_md_run(mddev_t * mddev)
4398 printk(KERN_WARNING 4474 printk(KERN_WARNING
4399 "md: cannot register extra attributes for %s\n", 4475 "md: cannot register extra attributes for %s\n",
4400 mdname(mddev)); 4476 mdname(mddev));
4401 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4477 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
4402 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4478 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4403 mddev->ro = 0; 4479 mddev->ro = 0;
4404 4480
@@ -4425,22 +4501,32 @@ static int do_md_run(mddev_t * mddev)
4425 if (mddev->flags) 4501 if (mddev->flags)
4426 md_update_sb(mddev, 0); 4502 md_update_sb(mddev, 0);
4427 4503
4428 set_capacity(disk, mddev->array_sectors);
4429
4430 md_wakeup_thread(mddev->thread); 4504 md_wakeup_thread(mddev->thread);
4431 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4505 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4432 4506
4433 revalidate_disk(mddev->gendisk);
4434 mddev->changed = 1;
4435 md_new_event(mddev); 4507 md_new_event(mddev);
4436 sysfs_notify_dirent(mddev->sysfs_state); 4508 sysfs_notify_dirent(mddev->sysfs_state);
4437 if (mddev->sysfs_action) 4509 if (mddev->sysfs_action)
4438 sysfs_notify_dirent(mddev->sysfs_action); 4510 sysfs_notify_dirent(mddev->sysfs_action);
4439 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4511 sysfs_notify(&mddev->kobj, NULL, "degraded");
4440 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4441 return 0; 4512 return 0;
4442} 4513}
4443 4514
4515static int do_md_run(mddev_t *mddev)
4516{
4517 int err;
4518
4519 err = md_run(mddev);
4520 if (err)
4521 goto out;
4522
4523 set_capacity(mddev->gendisk, mddev->array_sectors);
4524 revalidate_disk(mddev->gendisk);
4525 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4526out:
4527 return err;
4528}
4529
4444static int restart_array(mddev_t *mddev) 4530static int restart_array(mddev_t *mddev)
4445{ 4531{
4446 struct gendisk *disk = mddev->gendisk; 4532 struct gendisk *disk = mddev->gendisk;
@@ -4491,9 +4577,110 @@ void restore_bitmap_write_access(struct file *file)
4491 spin_unlock(&inode->i_lock); 4577 spin_unlock(&inode->i_lock);
4492} 4578}
4493 4579
4580static void md_clean(mddev_t *mddev)
4581{
4582 mddev->array_sectors = 0;
4583 mddev->external_size = 0;
4584 mddev->dev_sectors = 0;
4585 mddev->raid_disks = 0;
4586 mddev->recovery_cp = 0;
4587 mddev->resync_min = 0;
4588 mddev->resync_max = MaxSector;
4589 mddev->reshape_position = MaxSector;
4590 mddev->external = 0;
4591 mddev->persistent = 0;
4592 mddev->level = LEVEL_NONE;
4593 mddev->clevel[0] = 0;
4594 mddev->flags = 0;
4595 mddev->ro = 0;
4596 mddev->metadata_type[0] = 0;
4597 mddev->chunk_sectors = 0;
4598 mddev->ctime = mddev->utime = 0;
4599 mddev->layout = 0;
4600 mddev->max_disks = 0;
4601 mddev->events = 0;
4602 mddev->can_decrease_events = 0;
4603 mddev->delta_disks = 0;
4604 mddev->new_level = LEVEL_NONE;
4605 mddev->new_layout = 0;
4606 mddev->new_chunk_sectors = 0;
4607 mddev->curr_resync = 0;
4608 mddev->resync_mismatches = 0;
4609 mddev->suspend_lo = mddev->suspend_hi = 0;
4610 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4611 mddev->recovery = 0;
4612 mddev->in_sync = 0;
4613 mddev->degraded = 0;
4614 mddev->barriers_work = 0;
4615 mddev->safemode = 0;
4616 mddev->bitmap_info.offset = 0;
4617 mddev->bitmap_info.default_offset = 0;
4618 mddev->bitmap_info.chunksize = 0;
4619 mddev->bitmap_info.daemon_sleep = 0;
4620 mddev->bitmap_info.max_write_behind = 0;
4621}
4622
4623static void md_stop_writes(mddev_t *mddev)
4624{
4625 if (mddev->sync_thread) {
4626 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4627 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4628 md_unregister_thread(mddev->sync_thread);
4629 mddev->sync_thread = NULL;
4630 }
4631
4632 del_timer_sync(&mddev->safemode_timer);
4633
4634 bitmap_flush(mddev);
4635 md_super_wait(mddev);
4636
4637 if (!mddev->in_sync || mddev->flags) {
4638 /* mark array as shutdown cleanly */
4639 mddev->in_sync = 1;
4640 md_update_sb(mddev, 1);
4641 }
4642}
4643
4644static void md_stop(mddev_t *mddev)
4645{
4646 md_stop_writes(mddev);
4647
4648 mddev->pers->stop(mddev);
4649 if (mddev->pers->sync_request && mddev->to_remove == NULL)
4650 mddev->to_remove = &md_redundancy_group;
4651 module_put(mddev->pers->owner);
4652 mddev->pers = NULL;
4653 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4654}
4655
4656static int md_set_readonly(mddev_t *mddev, int is_open)
4657{
4658 int err = 0;
4659 mutex_lock(&mddev->open_mutex);
4660 if (atomic_read(&mddev->openers) > is_open) {
4661 printk("md: %s still in use.\n",mdname(mddev));
4662 err = -EBUSY;
4663 goto out;
4664 }
4665 if (mddev->pers) {
4666 md_stop_writes(mddev);
4667
4668 err = -ENXIO;
4669 if (mddev->ro==1)
4670 goto out;
4671 mddev->ro = 1;
4672 set_disk_ro(mddev->gendisk, 1);
4673 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4674 sysfs_notify_dirent(mddev->sysfs_state);
4675 err = 0;
4676 }
4677out:
4678 mutex_unlock(&mddev->open_mutex);
4679 return err;
4680}
4681
4494/* mode: 4682/* mode:
4495 * 0 - completely stop and dis-assemble array 4683 * 0 - completely stop and dis-assemble array
4496 * 1 - switch to readonly
4497 * 2 - stop but do not disassemble array 4684 * 2 - stop but do not disassemble array
4498 */ 4685 */
4499static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4686static int do_md_stop(mddev_t * mddev, int mode, int is_open)
@@ -4508,64 +4695,32 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4508 err = -EBUSY; 4695 err = -EBUSY;
4509 } else if (mddev->pers) { 4696 } else if (mddev->pers) {
4510 4697
4511 if (mddev->sync_thread) { 4698 if (mddev->ro)
4512 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4699 set_disk_ro(disk, 0);
4513 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4514 md_unregister_thread(mddev->sync_thread);
4515 mddev->sync_thread = NULL;
4516 }
4517
4518 del_timer_sync(&mddev->safemode_timer);
4519 4700
4520 switch(mode) { 4701 md_stop(mddev);
4521 case 1: /* readonly */ 4702 mddev->queue->merge_bvec_fn = NULL;
4522 err = -ENXIO; 4703 mddev->queue->unplug_fn = NULL;
4523 if (mddev->ro==1) 4704 mddev->queue->backing_dev_info.congested_fn = NULL;
4524 goto out;
4525 mddev->ro = 1;
4526 break;
4527 case 0: /* disassemble */
4528 case 2: /* stop */
4529 bitmap_flush(mddev);
4530 md_super_wait(mddev);
4531 if (mddev->ro)
4532 set_disk_ro(disk, 0);
4533 4705
4534 mddev->pers->stop(mddev); 4706 /* tell userspace to handle 'inactive' */
4535 mddev->queue->merge_bvec_fn = NULL; 4707 sysfs_notify_dirent(mddev->sysfs_state);
4536 mddev->queue->unplug_fn = NULL;
4537 mddev->queue->backing_dev_info.congested_fn = NULL;
4538 module_put(mddev->pers->owner);
4539 if (mddev->pers->sync_request && mddev->private == NULL)
4540 mddev->private = (void*)1;
4541 mddev->pers = NULL;
4542 /* tell userspace to handle 'inactive' */
4543 sysfs_notify_dirent(mddev->sysfs_state);
4544 4708
4545 list_for_each_entry(rdev, &mddev->disks, same_set) 4709 list_for_each_entry(rdev, &mddev->disks, same_set)
4546 if (rdev->raid_disk >= 0) { 4710 if (rdev->raid_disk >= 0) {
4547 char nm[20]; 4711 char nm[20];
4548 sprintf(nm, "rd%d", rdev->raid_disk); 4712 sprintf(nm, "rd%d", rdev->raid_disk);
4549 sysfs_remove_link(&mddev->kobj, nm); 4713 sysfs_remove_link(&mddev->kobj, nm);
4550 } 4714 }
4551 4715
4552 set_capacity(disk, 0); 4716 set_capacity(disk, 0);
4553 mddev->changed = 1; 4717 revalidate_disk(disk);
4554 4718
4555 if (mddev->ro) 4719 if (mddev->ro)
4556 mddev->ro = 0; 4720 mddev->ro = 0;
4557 } 4721
4558 if (!mddev->in_sync || mddev->flags) {
4559 /* mark array as shutdown cleanly */
4560 mddev->in_sync = 1;
4561 md_update_sb(mddev, 1);
4562 }
4563 if (mode == 1)
4564 set_disk_ro(disk, 1);
4565 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4566 err = 0; 4722 err = 0;
4567 } 4723 }
4568out:
4569 mutex_unlock(&mddev->open_mutex); 4724 mutex_unlock(&mddev->open_mutex);
4570 if (err) 4725 if (err)
4571 return err; 4726 return err;
@@ -4586,52 +4741,12 @@ out:
4586 4741
4587 export_array(mddev); 4742 export_array(mddev);
4588 4743
4589 mddev->array_sectors = 0; 4744 md_clean(mddev);
4590 mddev->external_size = 0;
4591 mddev->dev_sectors = 0;
4592 mddev->raid_disks = 0;
4593 mddev->recovery_cp = 0;
4594 mddev->resync_min = 0;
4595 mddev->resync_max = MaxSector;
4596 mddev->reshape_position = MaxSector;
4597 mddev->external = 0;
4598 mddev->persistent = 0;
4599 mddev->level = LEVEL_NONE;
4600 mddev->clevel[0] = 0;
4601 mddev->flags = 0;
4602 mddev->ro = 0;
4603 mddev->metadata_type[0] = 0;
4604 mddev->chunk_sectors = 0;
4605 mddev->ctime = mddev->utime = 0;
4606 mddev->layout = 0;
4607 mddev->max_disks = 0;
4608 mddev->events = 0;
4609 mddev->delta_disks = 0;
4610 mddev->new_level = LEVEL_NONE;
4611 mddev->new_layout = 0;
4612 mddev->new_chunk_sectors = 0;
4613 mddev->curr_resync = 0;
4614 mddev->resync_mismatches = 0;
4615 mddev->suspend_lo = mddev->suspend_hi = 0;
4616 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4617 mddev->recovery = 0;
4618 mddev->in_sync = 0;
4619 mddev->changed = 0;
4620 mddev->degraded = 0;
4621 mddev->barriers_work = 0;
4622 mddev->safemode = 0;
4623 mddev->bitmap_info.offset = 0;
4624 mddev->bitmap_info.default_offset = 0;
4625 mddev->bitmap_info.chunksize = 0;
4626 mddev->bitmap_info.daemon_sleep = 0;
4627 mddev->bitmap_info.max_write_behind = 0;
4628 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4745 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4629 if (mddev->hold_active == UNTIL_STOP) 4746 if (mddev->hold_active == UNTIL_STOP)
4630 mddev->hold_active = 0; 4747 mddev->hold_active = 0;
4631 4748
4632 } else if (mddev->pers) 4749 }
4633 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4634 mdname(mddev));
4635 err = 0; 4750 err = 0;
4636 blk_integrity_unregister(disk); 4751 blk_integrity_unregister(disk);
4637 md_new_event(mddev); 4752 md_new_event(mddev);
@@ -5349,7 +5464,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
5349 if (mddev->pers->check_reshape == NULL) 5464 if (mddev->pers->check_reshape == NULL)
5350 return -EINVAL; 5465 return -EINVAL;
5351 if (raid_disks <= 0 || 5466 if (raid_disks <= 0 ||
5352 raid_disks >= mddev->max_disks) 5467 (mddev->max_disks && raid_disks >= mddev->max_disks))
5353 return -EINVAL; 5468 return -EINVAL;
5354 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5469 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5355 return -EBUSY; 5470 return -EBUSY;
@@ -5486,7 +5601,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5486 5601
5487 geo->heads = 2; 5602 geo->heads = 2;
5488 geo->sectors = 4; 5603 geo->sectors = 4;
5489 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5604 geo->cylinders = mddev->array_sectors / 8;
5490 return 0; 5605 return 0;
5491} 5606}
5492 5607
@@ -5496,6 +5611,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5496 int err = 0; 5611 int err = 0;
5497 void __user *argp = (void __user *)arg; 5612 void __user *argp = (void __user *)arg;
5498 mddev_t *mddev = NULL; 5613 mddev_t *mddev = NULL;
5614 int ro;
5499 5615
5500 if (!capable(CAP_SYS_ADMIN)) 5616 if (!capable(CAP_SYS_ADMIN))
5501 return -EACCES; 5617 return -EACCES;
@@ -5628,9 +5744,37 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5628 goto done_unlock; 5744 goto done_unlock;
5629 5745
5630 case STOP_ARRAY_RO: 5746 case STOP_ARRAY_RO:
5631 err = do_md_stop(mddev, 1, 1); 5747 err = md_set_readonly(mddev, 1);
5632 goto done_unlock; 5748 goto done_unlock;
5633 5749
5750 case BLKROSET:
5751 if (get_user(ro, (int __user *)(arg))) {
5752 err = -EFAULT;
5753 goto done_unlock;
5754 }
5755 err = -EINVAL;
5756
5757 /* if the bdev is going readonly the value of mddev->ro
5758 * does not matter, no writes are coming
5759 */
5760 if (ro)
5761 goto done_unlock;
5762
5763 /* are we are already prepared for writes? */
5764 if (mddev->ro != 1)
5765 goto done_unlock;
5766
5767 /* transitioning to readauto need only happen for
5768 * arrays that call md_write_start
5769 */
5770 if (mddev->pers) {
5771 err = restart_array(mddev);
5772 if (err == 0) {
5773 mddev->ro = 2;
5774 set_disk_ro(mddev->gendisk, 0);
5775 }
5776 }
5777 goto done_unlock;
5634 } 5778 }
5635 5779
5636 /* 5780 /*
@@ -5751,7 +5895,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5751 atomic_inc(&mddev->openers); 5895 atomic_inc(&mddev->openers);
5752 mutex_unlock(&mddev->open_mutex); 5896 mutex_unlock(&mddev->open_mutex);
5753 5897
5754 check_disk_change(bdev);
5755 out: 5898 out:
5756 return err; 5899 return err;
5757} 5900}
@@ -5766,21 +5909,6 @@ static int md_release(struct gendisk *disk, fmode_t mode)
5766 5909
5767 return 0; 5910 return 0;
5768} 5911}
5769
5770static int md_media_changed(struct gendisk *disk)
5771{
5772 mddev_t *mddev = disk->private_data;
5773
5774 return mddev->changed;
5775}
5776
5777static int md_revalidate(struct gendisk *disk)
5778{
5779 mddev_t *mddev = disk->private_data;
5780
5781 mddev->changed = 0;
5782 return 0;
5783}
5784static const struct block_device_operations md_fops = 5912static const struct block_device_operations md_fops =
5785{ 5913{
5786 .owner = THIS_MODULE, 5914 .owner = THIS_MODULE,
@@ -5791,8 +5919,6 @@ static const struct block_device_operations md_fops =
5791 .compat_ioctl = md_compat_ioctl, 5919 .compat_ioctl = md_compat_ioctl,
5792#endif 5920#endif
5793 .getgeo = md_getgeo, 5921 .getgeo = md_getgeo,
5794 .media_changed = md_media_changed,
5795 .revalidate_disk= md_revalidate,
5796}; 5922};
5797 5923
5798static int md_thread(void * arg) 5924static int md_thread(void * arg)
@@ -5906,7 +6032,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5906 mddev->pers->error_handler(mddev,rdev); 6032 mddev->pers->error_handler(mddev,rdev);
5907 if (mddev->degraded) 6033 if (mddev->degraded)
5908 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6034 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5909 set_bit(StateChanged, &rdev->flags); 6035 sysfs_notify_dirent(rdev->sysfs_state);
5910 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6036 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5911 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6037 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5912 md_wakeup_thread(mddev->thread); 6038 md_wakeup_thread(mddev->thread);
@@ -6898,11 +7024,6 @@ void md_check_recovery(mddev_t *mddev)
6898 if (mddev->flags) 7024 if (mddev->flags)
6899 md_update_sb(mddev, 0); 7025 md_update_sb(mddev, 0);
6900 7026
6901 list_for_each_entry(rdev, &mddev->disks, same_set)
6902 if (test_and_clear_bit(StateChanged, &rdev->flags))
6903 sysfs_notify_dirent(rdev->sysfs_state);
6904
6905
6906 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7027 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6907 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7028 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6908 /* resync/recovery still happening */ 7029 /* resync/recovery still happening */
@@ -7039,7 +7160,7 @@ static int md_notify_reboot(struct notifier_block *this,
7039 * appears to still be in use. Hence 7160 * appears to still be in use. Hence
7040 * the '100'. 7161 * the '100'.
7041 */ 7162 */
7042 do_md_stop(mddev, 1, 100); 7163 md_set_readonly(mddev, 100);
7043 mddev_unlock(mddev); 7164 mddev_unlock(mddev);
7044 } 7165 }
7045 /* 7166 /*