author     NeilBrown <neilb@suse.de>    2009-03-30 23:39:39 -0400
committer  NeilBrown <neilb@suse.de>    2009-03-30 23:39:39 -0400
commit     409c57f3801701dfee27a28103dda4831306cb20 (patch)
tree       430c8e4ebe879b27250e061dc1a1171b12aaadf0
parent     e0cf8f045b2023b0b3f919ee93eb94345f648434 (diff)
md: enable suspend/resume of md devices.
To be able to change the 'level' of an md/raid array, we need to suspend the device so that no requests are active - then move some pointers around etc.

The code already keeps counts of active requests and the ->quiesce function can be used to wait until those counts hit zero. However the quiesce function blocks new requests once they are already 'inside' the personality module, and that is too late if we want to replace the personality modules.

So make all md requests come in through a common md_make_request function that keeps track of how many requests have entered the modules but may not yet be on the internal reference counts. Allow md_make_request to be blocked when we want to suspend the device, and make it possible to wait for all those in-transit requests to be added to internal lists so that ->quiesce can wait for them.

There is still a problem that when a request completes, we drop the ref count inside the personality code so there is a short time between when the refcount hits zero, and when the personality code is no longer being used. The personality code never blocks (schedule or spinlock) between dropping the refcount and exiting the routine, so this should be safe (as put_module calls synchronize_sched() before unmapping the module code).

Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--   drivers/md/md.c       79
-rw-r--r--   drivers/md/md.h        2
-rw-r--r--   drivers/md/raid1.c     3
-rw-r--r--   drivers/md/raid10.c    3
4 files changed, 72 insertions, 15 deletions
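For context on how the new helpers are meant to be used: the sketch below shows the kind of level-change sequence this patch is preparing for. It is an illustration only, not code from this series. It assumes the drivers/md/md.c context (mddev_t, struct mdk_personality, and the mddev_suspend()/mddev_resume() helpers added below); setup_new_personality() is a hypothetical placeholder for the conversion work that later patches would add, and a real caller would also need to recreate mddev->thread, which mddev_suspend() tears down.

/* Sketch only (not from this patch): how a later "change the level"
 * path might bracket its pointer-swapping with the new helpers.
 */
static int level_change_sketch(mddev_t *mddev, struct mdk_personality *new_pers)
{
        int err;

        /* Blocks md_make_request(), waits for active_io to reach zero,
         * calls ->quiesce(mddev, 1) and stops mddev->thread: after this,
         * no requests are active inside the old personality module. */
        mddev_suspend(mddev);

        /* "Move some pointers around etc." -- hypothetical helper standing
         * in for the real conversion logic added by follow-up patches. */
        err = setup_new_personality(mddev, new_pers);
        if (!err)
                mddev->pers = new_pers;

        /* Clears ->suspended, wakes waiters in md_make_request() and calls
         * ->quiesce(mddev, 0); real code would also recreate mddev->thread
         * before resuming. */
        mddev_resume(mddev);
        return err;
}

The ordering inside mddev_suspend() is what makes this safe: ->suspended is set before synchronize_rcu(), so any md_make_request() caller that did not see the flag has already taken its active_io reference and will be drained by the wait_event() that follows.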
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f30f09cb08e8..6cb31f8da14c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -201,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
                 )
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-        bio_io_error(bio);
-        return 0;
+        mddev_t *mddev = q->queuedata;
+        int rv;
+        if (mddev == NULL || mddev->pers == NULL) {
+                bio_io_error(bio);
+                return 0;
+        }
+        rcu_read_lock();
+        if (mddev->suspended) {
+                DEFINE_WAIT(__wait);
+                for (;;) {
+                        prepare_to_wait(&mddev->sb_wait, &__wait,
+                                        TASK_UNINTERRUPTIBLE);
+                        if (!mddev->suspended)
+                                break;
+                        rcu_read_unlock();
+                        schedule();
+                        rcu_read_lock();
+                }
+                finish_wait(&mddev->sb_wait, &__wait);
+        }
+        atomic_inc(&mddev->active_io);
+        rcu_read_unlock();
+        rv = mddev->pers->make_request(q, bio);
+        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+                wake_up(&mddev->sb_wait);
+
+        return rv;
 }
 
+static void mddev_suspend(mddev_t *mddev)
+{
+        BUG_ON(mddev->suspended);
+        mddev->suspended = 1;
+        synchronize_rcu();
+        wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+        mddev->pers->quiesce(mddev, 1);
+        md_unregister_thread(mddev->thread);
+        mddev->thread = NULL;
+        /* we now know that no code is executing in the personality module,
+         * except possibly the tail end of a ->bi_end_io function, but that
+         * is certain to complete before the module has a chance to get
+         * unloaded
+         */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+        mddev->suspended = 0;
+        wake_up(&mddev->sb_wait);
+        mddev->pers->quiesce(mddev, 0);
+}
+
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
         atomic_inc(&mddev->active);
@@ -314,6 +370,7 @@ static mddev_t * mddev_find(dev_t unit)
         init_timer(&new->safemode_timer);
         atomic_set(&new->active, 1);
         atomic_set(&new->openers, 0);
+        atomic_set(&new->active_io, 0);
         spin_lock_init(&new->write_lock);
         init_waitqueue_head(&new->sb_wait);
         init_waitqueue_head(&new->recovery_wait);
@@ -3632,10 +3689,12 @@ static int md_alloc(dev_t dev, char *name)
                 mddev_put(mddev);
                 return -ENOMEM;
         }
+        mddev->queue->queuedata = mddev;
+
         /* Can be unlocked because the queue is new: no concurrency */
         queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
 
-        blk_queue_make_request(mddev->queue, md_fail_request);
+        blk_queue_make_request(mddev->queue, md_make_request);
 
         disk = alloc_disk(1 << shift);
         if (!disk) {
@@ -3938,16 +3997,6 @@ static int do_md_run(mddev_t * mddev)
 
         set_capacity(disk, mddev->array_sectors);
 
-        /* If we call blk_queue_make_request here, it will
-         * re-initialise max_sectors etc which may have been
-         * refined inside -> run.  So just set the bits we need to set.
-         * Most initialisation happended when we called
-         * blk_queue_make_request(..., md_fail_request)
-         * earlier.
-         */
-        mddev->queue->queuedata = mddev;
-        mddev->queue->make_request_fn = mddev->pers->make_request;
-
         /* If there is a partially-recovered drive we need to
          * start recovery here.  If we leave it to md_check_recovery,
          * it will remove the drives and not do the right thing
@@ -4077,7 +4126,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                         md_super_wait(mddev);
                         if (mddev->ro)
                                 set_disk_ro(disk, 0);
-                        blk_queue_make_request(mddev->queue, md_fail_request);
+
                         mddev->pers->stop(mddev);
                         mddev->queue->merge_bvec_fn = NULL;
                         mddev->queue->unplug_fn = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c07ea9118063..84b22d67ba14 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -119,6 +119,8 @@ struct mddev_s
 #define MD_CHANGE_CLEAN 1       /* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2     /* superblock update in progress */
 
+        int                             suspended;
+        atomic_t                        active_io;
         int                             ro;
 
         struct gendisk                  *gendisk;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 779958705abf..7eaca3209364 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2092,6 +2092,9 @@ static int stop(mddev_t *mddev)
                 /* need to kick something here to make sure I/O goes? */
         }
 
+        raise_barrier(conf);
+        lower_barrier(conf);
+
         md_unregister_thread(mddev->thread);
         mddev->thread = NULL;
         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d56cb2ae515f..c2059e25d03f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2211,6 +2211,9 @@ static int stop(mddev_t *mddev)
 {
         conf_t *conf = mddev_to_conf(mddev);
 
+        raise_barrier(conf, 0);
+        lower_barrier(conf);
+
         md_unregister_thread(mddev->thread);
         mddev->thread = NULL;
         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/