Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--	drivers/md/md.c	787
1 file changed, 461 insertions, 326 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f20d13e717d5..91e31e260b4a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,7 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
@@ -57,7 +57,6 @@
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
 
-
 #ifndef MODULE
 static void autostart_arrays(int part);
 #endif
@@ -68,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock);
 static void md_print_devices(void);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
 
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
@@ -148,6 +149,72 @@ static const struct block_device_operations md_fops;
 
 static int start_readonly;
 
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ */
+
+static void mddev_bio_destructor(struct bio *bio)
+{
+	mddev_t *mddev, **mddevp;
+
+	mddevp = (void*)bio;
+	mddev = mddevp[-1];
+
+	bio_free(bio, mddev->bio_set);
+}
+
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+			    mddev_t *mddev)
+{
+	struct bio *b;
+	mddev_t **mddevp;
+
+	if (!mddev || !mddev->bio_set)
+		return bio_alloc(gfp_mask, nr_iovecs);
+
+	b = bio_alloc_bioset(gfp_mask, nr_iovecs,
+			     mddev->bio_set);
+	if (!b)
+		return NULL;
+	mddevp = (void*)b;
+	mddevp[-1] = mddev;
+	b->bi_destructor = mddev_bio_destructor;
+	return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+			    mddev_t *mddev)
+{
+	struct bio *b;
+	mddev_t **mddevp;
+
+	if (!mddev || !mddev->bio_set)
+		return bio_clone(bio, gfp_mask);
+
+	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+			     mddev->bio_set);
+	if (!b)
+		return NULL;
+	mddevp = (void*)b;
+	mddevp[-1] = mddev;
+	b->bi_destructor = mddev_bio_destructor;
+	__bio_clone(b, bio);
+	if (bio_integrity(bio)) {
+		int ret;
+
+		ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
+
+		if (ret < 0) {
+			bio_put(b);
+			return NULL;
+		}
+	}
+
+	return b;
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
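The mddevp[-1] trick in the two helpers above works because the array's bio set is created with a pointer-sized front pad (later in this diff, md_run() calls bioset_create(BIO_POOL_SIZE, sizeof(mddev))), so every bio handed out is preceded by one spare pointer slot in the same allocation; the destructor can then find the right per-array bio_set with no extra field in the bio. A minimal user-space sketch of the same pattern, with plain malloc/free standing in for the bioset allocator (obj_alloc_owned and friends are illustrative names, not kernel API):

#include <stdlib.h>

struct obj { int payload; };

/* Allocate one pointer of padding in front of the object and stash
 * the owner there - the same layout bio_alloc_mddev sets up. */
static struct obj *obj_alloc_owned(void *owner)
{
	void **block = malloc(sizeof(void *) + sizeof(struct obj));
	struct obj *o;

	if (!block)
		return NULL;
	o = (struct obj *)(block + 1);	/* object starts after the pad */
	((void **)o)[-1] = owner;	/* cf. mddevp[-1] = mddev */
	return o;
}

/* The destructor recovers the owner from the pad - cf. mddev_bio_destructor. */
static void *obj_owner(struct obj *o)
{
	return ((void **)o)[-1];
}

static void obj_free(struct obj *o)
{
	free((void **)o - 1);		/* free the original block */
}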
@@ -220,18 +287,21 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	mddev_t *mddev = q->queuedata;
 	int rv;
 	int cpu;
+	unsigned int sectors;
 
-	if (mddev == NULL || mddev->pers == NULL) {
+	if (mddev == NULL || mddev->pers == NULL
+	    || !mddev->ready) {
 		bio_io_error(bio);
 		return 0;
 	}
+	smp_rmb(); /* Ensure implications of 'active' are visible */
 	rcu_read_lock();
-	if (mddev->suspended || mddev->barrier) {
+	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended && !mddev->barrier)
+			if (!mddev->suspended)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -242,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	atomic_inc(&mddev->active_io);
 	rcu_read_unlock();
 
+	/*
+	 * save the sectors now since our bio can
+	 * go away inside make_request
+	 */
+	sectors = bio_sectors(bio);
 	rv = mddev->pers->make_request(mddev, bio);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
 	part_stat_unlock();
 
 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
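The new sectors local fixes a subtle use-after-free in the I/O accounting. An annotation of the hunk above, not additional code:

/* Anything still needed for accounting must be copied out of the bio
 * *before* it is handed to ->make_request(), because the personality
 * may end the bio (and free it) before returning:
 *
 *	sectors = bio_sectors(bio);			// safe: bio still ours
 *	rv = mddev->pers->make_request(mddev, bio);	// ownership transferred
 *	part_stat_add(..., sectors[rw], sectors);	// uses the saved copy
 */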
@@ -277,48 +351,45 @@ void mddev_resume(mddev_t *mddev)
 	mddev->suspended = 0;
 	wake_up(&mddev->sb_wait);
 	mddev->pers->quiesce(mddev, 0);
+
+	md_wakeup_thread(mddev->thread);
+	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 }
 EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-	if (mddev->barrier)
-		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
 	mddev_t *mddev = rdev->mddev;
-	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		if (mddev->barrier == POST_REQUEST_BARRIER) {
-			/* This was a post-request barrier */
-			mddev->barrier = NULL;
-			wake_up(&mddev->sb_wait);
-		} else
-			/* The pre-request barrier has finished */
-			schedule_work(&mddev->barrier_work);
+		/* The pre-request flush has finished */
+		queue_work(md_wq, &mddev->flush_work);
 	}
 	bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
 {
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
 	mdk_rdev_t *rdev;
 
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
@@ -331,106 +402,107 @@ static void submit_barriers(mddev_t *mddev)
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
-			bi = bio_alloc(GFP_KERNEL, 0);
-			bi->bi_end_io = md_end_barrier;
+			bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
+			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_BARRIER, bi);
+			submit_bio(WRITE_FLUSH, bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		queue_work(md_wq, &mddev->flush_work);
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-	struct bio *bio = mddev->barrier;
-
-	atomic_set(&mddev->flush_pending, 1);
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct bio *bio = mddev->flush_bio;
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		bio_endio(bio, -EOPNOTSUPP);
-	else if (bio->bi_size == 0)
+	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~REQ_HARDBARRIER;
+		bio->bi_rw &= ~REQ_FLUSH;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
-		mddev->barrier = POST_REQUEST_BARRIER;
-		submit_barriers(mddev);
-	}
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->barrier = NULL;
-		wake_up(&mddev->sb_wait);
 	}
+
+	mddev->flush_bio = NULL;
+	wake_up(&mddev->sb_wait);
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->barrier,
+			    !mddev->flush_bio,
 			    mddev->write_lock, /*nothing*/);
-	mddev->barrier = bio;
+	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
-	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
-
-	submit_barriers(mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		schedule_work(&mddev->barrier_work);
+	INIT_WORK(&mddev->flush_work, submit_flushes);
+	queue_work(md_wq, &mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
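Taken together, the functions above replace the old barrier state machine (POST_REQUEST_BARRIER, EOPNOTSUPP retries) with a simple workqueue pipeline. A summary of the control flow, as an annotation of the new code rather than additional API:

/*   md_flush_request(mddev, bio)
 *     -> stash bio in mddev->flush_bio (one flush at a time, serialised
 *        on write_lock/sb_wait)
 *     -> queue submit_flushes() on md_wq
 *
 *   submit_flushes()
 *     -> send an empty WRITE_FLUSH bio to every active rdev
 *     -> each completion runs md_end_flush(); the last one to drop
 *        flush_pending queues md_submit_flush_data()
 *
 *   md_submit_flush_data()
 *     -> empty flush: complete the original bio
 *     -> otherwise: clear REQ_FLUSH and feed the data part to the
 *        personality's ->make_request()
 *     -> clear mddev->flush_bio and wake sb_wait for the next flush
 */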
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
- * require having a whole queue
+ * require having a whole queue or request structures.
+ * We allocate an md_plug_cb for each md device and each thread it gets
+ * plugged on. This links to the private plug_handle structure in the
+ * personality data where we keep a count of the number of outstanding
+ * plugs so other code can see if a plug is active.
  */
-static void plugger_work(struct work_struct *work)
-{
-	struct plug_handle *plug =
-		container_of(work, struct plug_handle, unplug_work);
-	plug->unplug_fn(plug);
-}
-static void plugger_timeout(unsigned long data)
-{
-	struct plug_handle *plug = (void *)data;
-	kblockd_schedule_work(NULL, &plug->unplug_work);
-}
-void plugger_init(struct plug_handle *plug,
-		  void (*unplug_fn)(struct plug_handle *))
-{
-	plug->unplug_flag = 0;
-	plug->unplug_fn = unplug_fn;
-	init_timer(&plug->unplug_timer);
-	plug->unplug_timer.function = plugger_timeout;
-	plug->unplug_timer.data = (unsigned long)plug;
-	INIT_WORK(&plug->unplug_work, plugger_work);
-}
-EXPORT_SYMBOL_GPL(plugger_init);
+struct md_plug_cb {
+	struct blk_plug_cb cb;
+	mddev_t *mddev;
+};
 
-void plugger_set_plug(struct plug_handle *plug)
+static void plugger_unplug(struct blk_plug_cb *cb)
 {
-	if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag))
-		mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1);
+	struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
+	if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
+		md_wakeup_thread(mdcb->mddev->thread);
+	kfree(mdcb);
 }
-EXPORT_SYMBOL_GPL(plugger_set_plug);
 
-int plugger_remove_plug(struct plug_handle *plug)
+/* Check that an unplug wakeup will come shortly.
+ * If not, wakeup the md thread immediately
+ */
+int mddev_check_plugged(mddev_t *mddev)
 {
-	if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) {
-		del_timer(&plug->unplug_timer);
-		return 1;
-	} else
+	struct blk_plug *plug = current->plug;
+	struct md_plug_cb *mdcb;
+
+	if (!plug)
 		return 0;
-}
-EXPORT_SYMBOL_GPL(plugger_remove_plug);
 
+	list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
+		if (mdcb->cb.callback == plugger_unplug &&
+		    mdcb->mddev == mddev) {
+			/* Already on the list, move to top */
+			if (mdcb != list_first_entry(&plug->cb_list,
+						     struct md_plug_cb,
+						     cb.list))
+				list_move(&mdcb->cb.list, &plug->cb_list);
+			return 1;
+		}
+	}
+	/* Not currently on the callback list */
+	mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
+	if (!mdcb)
+		return 0;
+
+	mdcb->mddev = mddev;
+	mdcb->cb.callback = plugger_unplug;
+	atomic_inc(&mddev->plug_cnt);
+	list_add(&mdcb->cb.list, &plug->cb_list);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(mddev_check_plugged);
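A hedged sketch of how a personality would typically call this helper, modeled on the raid1/raid5 call sites of the same era (example_defer_io and the queueing step are hypothetical, only mddev_check_plugged and md_wakeup_thread come from the diff):

static void example_defer_io(mddev_t *mddev, struct bio *bio)
{
	/* Take the plug if the current task has one active... */
	int plugged = mddev_check_plugged(mddev);

	/* ... queue bio on an internal pending list here ... */

	/* ...otherwise kick the md thread now, since no unplug
	 * callback will arrive to do it for us. */
	if (!plugged)
		md_wakeup_thread(mddev->thread);
}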
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -442,6 +514,8 @@ static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(mddev_t *mddev)
 {
+	struct bio_set *bs = NULL;
+
 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -449,19 +523,22 @@ static void mddev_put(mddev_t *mddev)
 		/* Array is not configured at all, and not held active,
 		 * so destroy it */
 		list_del(&mddev->all_mddevs);
+		bs = mddev->bio_set;
+		mddev->bio_set = NULL;
 		if (mddev->gendisk) {
-			/* we did a probe so need to clean up.
-			 * Call schedule_work inside the spinlock
-			 * so that flush_scheduled_work() after
-			 * mddev_find will succeed in waiting for the
-			 * work to be done.
+			/* We did a probe so need to clean up. Call
+			 * queue_work inside the spinlock so that
+			 * flush_workqueue() after mddev_find will
+			 * succeed in waiting for the work to be done.
 			 */
 			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
-			schedule_work(&mddev->del_work);
+			queue_work(md_misc_wq, &mddev->del_work);
 		} else
 			kfree(mddev);
 	}
 	spin_unlock(&all_mddevs_lock);
+	if (bs)
+		bioset_free(bs);
 }
 
 void mddev_init(mddev_t *mddev)
@@ -475,6 +552,7 @@ void mddev_init(mddev_t *mddev)
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
+	atomic_set(&mddev->plug_cnt, 0);
 	spin_lock_init(&mddev->write_lock);
 	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
@@ -490,6 +568,9 @@ static mddev_t * mddev_find(dev_t unit)
 {
 	mddev_t *mddev, *new = NULL;
 
+	if (unit && MAJOR(unit) != MD_MAJOR)
+		unit &= ~((1<<MdpMinorShift)-1);
+
  retry:
 	spin_lock(&all_mddevs_lock);
 
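The new masking canonicalises the dev_t for partitionable (non-MD_MAJOR) arrays before the lookup, so opening a partition resolves to the whole-device mddev. A worked example, assuming MdpMinorShift is 6 as in the md headers of this era (64 minors per mdp device):

/*   minor  0..63  -> base  0   (md_d0 and its partitions)
 *   minor 64..127 -> base 64   (md_d1 and its partitions)
 *
 *   unit &= ~((1 << 6) - 1);   // e.g. minor 67 & ~63 == 64,
 *                              // so md_d1p3 finds md_d1's mddev
 */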
@@ -647,9 +728,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
 }
 
 /* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
 {
-	sector_t num_sectors = bdev->bd_inode->i_size / 512;
+	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
 	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
@@ -696,31 +777,6 @@ static void super_written(struct bio *bio, int error)
 	bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-	struct bio *bio2 = bio->bi_private;
-	mdk_rdev_t *rdev = bio2->bi_private;
-	mddev_t *mddev = rdev->mddev;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-	    error == -EOPNOTSUPP) {
-		unsigned long flags;
-		/* barriers don't appear to be supported :-( */
-		set_bit(BarriersNotsupp, &rdev->flags);
-		mddev->barriers_work = 0;
-		spin_lock_irqsave(&mddev->write_lock, flags);
-		bio2->bi_next = mddev->biolist;
-		mddev->biolist = bio2;
-		spin_unlock_irqrestore(&mddev->write_lock, flags);
-		wake_up(&mddev->sb_wait);
-		bio_put(bio);
-	} else {
-		bio_put(bio2);
-		bio->bi_private = rdev;
-		super_written(bio, error);
-	}
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		    sector_t sector, int size, struct page *page)
 {
@@ -729,51 +785,27 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
-	 *
-	 * As we might need to resubmit the request if REQ_HARDBARRIER
-	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
-	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
+	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
-	bio->bi_bdev = rdev->bdev;
+	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_rw = rw;
 
 	atomic_inc(&mddev->pending_writes);
-	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-		struct bio *rbio;
-		rw |= REQ_HARDBARRIER;
-		rbio = bio_clone(bio, GFP_NOIO);
-		rbio->bi_private = bio;
-		rbio->bi_end_io = super_written_barrier;
-		submit_bio(rw, rbio);
-	} else
-		submit_bio(rw, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-	/* wait for all superblock writes that were scheduled to complete.
-	 * if any had to be retried (due to BARRIER problems), retry them
-	 */
+	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
 	for(;;) {
 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&mddev->pending_writes)==0)
 			break;
-		while (mddev->biolist) {
-			struct bio *bio;
-			spin_lock_irq(&mddev->write_lock);
-			bio = mddev->biolist;
-			mddev->biolist = bio->bi_next ;
-			bio->bi_next = NULL;
-			spin_unlock_irq(&mddev->write_lock);
-			submit_bio(bio->bi_rw, bio);
-		}
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
@@ -784,17 +816,21 @@ static void bi_complete(struct bio *bio, int error)
 	complete((struct completion*)bio->bi_private);
 }
 
-int sync_page_io(struct block_device *bdev, sector_t sector, int size,
-		 struct page *page, int rw)
+int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+		 struct page *page, int rw, bool metadata_op)
 {
-	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
 	struct completion event;
 	int ret;
 
-	rw |= REQ_SYNC | REQ_UNPLUG;
+	rw |= REQ_SYNC;
 
-	bio->bi_bdev = bdev;
-	bio->bi_sector = sector;
+	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+		rdev->meta_bdev : rdev->bdev;
+	if (metadata_op)
+		bio->bi_sector = sector + rdev->sb_start;
+	else
+		bio->bi_sector = sector + rdev->data_offset;
 	bio_add_page(bio, page, size, 0);
 	init_completion(&event);
 	bio->bi_private = &event;
@@ -819,7 +855,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
 		return 0;
 
 
-	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
+	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
 		goto fail;
 	rdev->sb_loaded = 1;
 	return 0;
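This caller shows the effect of the sync_page_io() signature change: sector offsets are now relative, and the helper picks both the base offset and the device itself. Old versus new call convention, taken from the hunk above (comments are annotation only):

/* Before: the caller computed the absolute sector and chose the bdev.
 *	sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ);
 *
 * After: the offset is relative; metadata_op selects the base
 * (sb_start vs data_offset) and prefers meta_bdev when one exists.
 *	sync_page_io(rdev, 0, size, rdev->sb_page, READ, true);
 */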
@@ -981,7 +1017,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	 *
 	 * It also happens to be a multiple of 4Kb.
 	 */
-	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	rdev->sb_start = calc_dev_sboffset(rdev);
 
 	ret = read_disk_sb(rdev, MD_SB_BYTES);
 	if (ret) return ret;
@@ -1070,7 +1106,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
@@ -1323,13 +1358,13 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		return 0; /* component must fit device */
 	if (rdev->mddev->bitmap_info.offset)
 		return 0; /* can't move bitmap */
-	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	rdev->sb_start = calc_dev_sboffset(rdev);
 	if (!num_sectors || num_sectors > rdev->sb_start)
 		num_sectors = rdev->sb_start;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return num_sectors / 2; /* kB for sysfs */
+	return num_sectors;
 }
 
 
@@ -1378,7 +1413,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	 */
 	switch(minor_version) {
 	case 0:
-		sb_start = rdev->bdev->bd_inode->i_size >> 9;
+		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
 		sb_start -= 8*2;
 		sb_start &= ~(sector_t)(4*2-1);
 		break;
@@ -1464,7 +1499,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		ret = 0;
 	}
 	if (minor_version)
-		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+		rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 			le64_to_cpu(sb->data_offset);
 	else
 		rdev->sectors = rdev->sb_start;
@@ -1485,7 +1520,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
@@ -1673,7 +1707,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
-		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
 		max_sectors -= rdev->data_offset;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
@@ -1683,7 +1717,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	} else {
 		/* minor version 0; superblock after data */
 		sector_t sb_start;
-		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
 		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
@@ -1697,7 +1731,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return num_sectors / 2; /* kB for sysfs */
+	return num_sectors;
 }
 
 static struct super_type super_types[] = {
@@ -1719,6 +1753,18 @@ static struct super_type super_types[] = {
 	},
 };
 
+static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	if (mddev->sync_super) {
+		mddev->sync_super(mddev, rdev);
+		return;
+	}
+
+	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
+
+	super_types[mddev->major_version].sync_super(mddev, rdev);
+}
+
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
 	mdk_rdev_t *rdev, *rdev2;
@@ -1750,20 +1796,14 @@ int md_integrity_register(mddev_t *mddev)
 
 	if (list_empty(&mddev->disks))
 		return 0; /* nothing to do */
-	if (blk_get_integrity(mddev->gendisk))
-		return 0; /* already registered */
+	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
+		return 0; /* shouldn't register, or already is */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		/* skip spares and non-functional disks */
 		if (test_bit(Faulty, &rdev->flags))
 			continue;
 		if (rdev->raid_disk < 0)
 			continue;
-		/*
-		 * If at least one rdev is not integrity capable, we can not
-		 * enable data integrity for the md device.
-		 */
-		if (!bdev_get_integrity(rdev->bdev))
-			return -EINVAL;
 		if (!reference) {
 			/* Use the first rdev as the reference */
 			reference = rdev;
@@ -1774,6 +1814,8 @@ int md_integrity_register(mddev_t *mddev)
 				rdev->bdev->bd_disk) < 0)
 			return -EINVAL;
 	}
+	if (!reference || !bdev_get_integrity(reference->bdev))
+		return 0;
 	/*
 	 * All component devices are integrity capable and have matching
 	 * profiles, register the common profile for the md device.
@@ -1784,8 +1826,12 @@ int md_integrity_register(mddev_t *mddev)
 			       mdname(mddev));
 		return -EINVAL;
 	}
-	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
-	       mdname(mddev));
+	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(md_integrity_register);
@@ -1873,7 +1919,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
 
 	list_add_rcu(&rdev->same_set, &mddev->disks);
-	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
+	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
@@ -1900,7 +1946,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		MD_BUG();
 		return;
 	}
-	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
+	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
 	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
@@ -1914,7 +1960,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
-	schedule_work(&rdev->del_work);
+	queue_work(md_misc_wq, &rdev->del_work);
 }
 
 /*
@@ -1928,21 +1974,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				 shared ? (mdk_rdev_t *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
 			__bdevname(dev, b));
 		return PTR_ERR(bdev);
 	}
-	err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
-	if (err) {
-		printk(KERN_ERR "md: could not bd_claim %s.\n",
-			bdevname(bdev, b));
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return err;
-	}
-	if (!shared)
-		set_bit(AllReserved, &rdev->flags);
 	rdev->bdev = bdev;
 	return err;
 }
@@ -1953,8 +1991,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
 	rdev->bdev = NULL;
 	if (!bdev)
 		MD_BUG();
-	bd_release(bdev);
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 void md_autodetect_dev(dev_t dev);
@@ -2146,8 +2183,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 			/* Don't update this superblock */
 			rdev->sb_loaded = 2;
 		} else {
-			super_types[mddev->major_version].
-				sync_super(mddev, rdev);
+			sync_super(mddev, rdev);
 			rdev->sb_loaded = 1;
 		}
 	}
@@ -2172,6 +2208,8 @@ repeat:
 	if (!mddev->persistent) {
 		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
 		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+		if (!mddev->external)
+			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
 		wake_up(&mddev->sb_wait);
 		return;
 	}
@@ -2438,7 +2476,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (rdev->raid_disk == -1)
 			return -EEXIST;
 		/* personality does all needed checks */
-		if (rdev->mddev->pers->hot_add_disk == NULL)
+		if (rdev->mddev->pers->hot_remove_disk == NULL)
 			return -EINVAL;
 		err = rdev->mddev->pers->
 			hot_remove_disk(rdev->mddev, rdev->raid_disk);
@@ -2458,6 +2496,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (rdev->raid_disk != -1)
 			return -EBUSY;
 
+		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
+			return -EBUSY;
+
 		if (rdev->mddev->pers->hot_add_disk == NULL)
 			return -EINVAL;
 
@@ -2465,6 +2506,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (rdev2->raid_disk == slot)
 				return -EEXIST;
 
+		if (slot >= rdev->mddev->raid_disks &&
+		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+			return -ENOSPC;
+
 		rdev->raid_disk = slot;
 		if (test_bit(In_sync, &rdev->flags))
 			rdev->saved_raid_disk = slot;
@@ -2482,7 +2527,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			/* failure here is OK */;
 		/* don't wakeup anyone, leave that to userspace. */
 	} else {
-		if (slot >= rdev->mddev->raid_disks)
+		if (slot >= rdev->mddev->raid_disks &&
+		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
 			return -ENOSPC;
 		rdev->raid_disk = slot;
 		/* assume it is working */
@@ -2575,7 +2621,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (!sectors)
 				return -EBUSY;
 		} else if (!sectors)
-			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 				rdev->data_offset;
 	}
 	if (sectors < my_mddev->dev_sectors)
@@ -2598,12 +2644,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		mddev_lock(mddev);
 		list_for_each_entry(rdev2, &mddev->disks, same_set)
-			if (test_bit(AllReserved, &rdev2->flags) ||
-			    (rdev->bdev == rdev2->bdev &&
-			     rdev != rdev2 &&
-			     overlaps(rdev->data_offset, rdev->sectors,
-				      rdev2->data_offset,
-				      rdev2->sectors))) {
+			if (rdev->bdev == rdev2->bdev &&
+			    rdev != rdev2 &&
+			    overlaps(rdev->data_offset, rdev->sectors,
+				     rdev2->data_offset,
+				     rdev2->sectors)) {
 				overlap = 1;
 				break;
 			}
@@ -2788,7 +2833,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 
 	kobject_init(&rdev->kobj, &rdev_ktype);
 
-	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
 	if (!size) {
 		printk(KERN_WARNING
 		       "md: %s has zero or unknown size, marking faulty!\n",
@@ -3107,7 +3152,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		char nm[20];
 		if (rdev->raid_disk < 0)
 			continue;
-		if (rdev->new_raid_disk > mddev->raid_disks)
+		if (rdev->new_raid_disk >= mddev->raid_disks)
 			rdev->new_raid_disk = -1;
 		if (rdev->new_raid_disk == rdev->raid_disk)
 			continue;
@@ -3139,6 +3184,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
+	mddev->degraded = 0;
 	if (mddev->pers->sync_request == NULL) {
 		/* this is now an array without redundancy, so
 		 * it must always be in_sync
@@ -3292,7 +3338,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 	char *e;
 	unsigned long long n = simple_strtoull(buf, &e, 10);
 
-	if (mddev->pers)
+	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
 		return -EBUSY;
 	if (cmd_match(buf, "none"))
 		n = MaxSector;
@@ -3736,6 +3782,8 @@ action_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%s\n", type);
 }
 
+static void reap_sync_thread(mddev_t *mddev);
+
 static ssize_t
 action_store(mddev_t *mddev, const char *page, size_t len)
 {
@@ -3750,9 +3798,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			md_unregister_thread(mddev->sync_thread);
-			mddev->sync_thread = NULL;
-			mddev->recovery = 0;
+			reap_sync_thread(mddev);
 		}
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3950,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-	unsigned long max_sectors, resync;
+	unsigned long long max_sectors, resync;
 
 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return sprintf(page, "none\n");
@@ -3915,7 +3961,7 @@ sync_completed_show(mddev_t *mddev, char *page)
 		max_sectors = mddev->dev_sectors;
 
 	resync = mddev->curr_resync_completed;
-	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
 }
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4048,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
+	unsigned long long old = mddev->suspend_lo;
 
 	if (mddev->pers == NULL ||
 	    mddev->pers->quiesce == NULL)
 		return -EINVAL;
 	if (buf == e || (*e && *e != '\n'))
 		return -EINVAL;
-	if (new >= mddev->suspend_hi ||
-	    (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
-		mddev->suspend_lo = new;
+
+	mddev->suspend_lo = new;
+	if (new >= old)
+		/* Shrinking suspended region */
 		mddev->pers->quiesce(mddev, 2);
-		return len;
-	} else
-		return -EINVAL;
+	else {
+		/* Expanding suspended region - need to wait */
+		mddev->pers->quiesce(mddev, 1);
+		mddev->pers->quiesce(mddev, 0);
+	}
+	return len;
 }
 static struct md_sysfs_entry md_suspend_lo =
 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4082,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
+	unsigned long long old = mddev->suspend_hi;
 
 	if (mddev->pers == NULL ||
 	    mddev->pers->quiesce == NULL)
 		return -EINVAL;
 	if (buf == e || (*e && *e != '\n'))
 		return -EINVAL;
-	if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
-	    (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
-		mddev->suspend_hi = new;
+
+	mddev->suspend_hi = new;
+	if (new <= old)
+		/* Shrinking suspended region */
+		mddev->pers->quiesce(mddev, 2);
+	else {
+		/* Expanding suspended region - need to wait */
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
-		return len;
-	} else
-		return -EINVAL;
+	}
+	return len;
 }
 static struct md_sysfs_entry md_suspend_hi =
 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
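Both suspend_{lo,hi} stores now follow the same shape: update the bound first, then notify the personality. The ->quiesce() contract they rely on, paraphrased from how raid5 implements it around this time (a hedged summary, not a new API):

/*   quiesce(mddev, 1): block new requests and drain in-flight ones
 *   quiesce(mddev, 0): resume normal operation
 *   quiesce(mddev, 2): the suspend window changed - just re-check any
 *                      waiters, no draining needed
 *
 * Shrinking the window can never strand a request inside it, so the
 * cheap state-2 call suffices; growing it must drain, hence the full
 * quiesce(1)/quiesce(0) cycle in the "expanding" branch.
 */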
@@ -4112,10 +4167,10 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 
 	mddev->array_sectors = sectors;
-	set_capacity(mddev->gendisk, mddev->array_sectors);
-	if (mddev->pers)
+	if (mddev->pers) {
+		set_capacity(mddev->gendisk, mddev->array_sectors);
 		revalidate_disk(mddev->gendisk);
-
+	}
 	return len;
 }
 
@@ -4256,10 +4311,10 @@ static int md_alloc(dev_t dev, char *name)
 	shift = partitioned ? MdpMinorShift : 0;
 	unit = MINOR(mddev->unit) >> shift;
 
-	/* wait for any previous instance if this device
-	 * to be completed removed (mddev_delayed_delete).
+	/* wait for any previous instance of this device to be
+	 * completely removed (mddev_delayed_delete).
 	 */
-	flush_scheduled_work();
+	flush_workqueue(md_misc_wq);
 
 	mutex_lock(&disks_mutex);
 	error = -EEXIST;
@@ -4287,9 +4342,6 @@ static int md_alloc(dev_t dev, char *name)
 		goto abort;
 	mddev->queue->queuedata = mddev;
 
-	/* Can be unlocked because the queue is new: no concurrency */
-	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
-
 	blk_queue_make_request(mddev->queue, md_make_request);
 
 	disk = alloc_disk(1 << shift);
@@ -4309,13 +4361,19 @@ static int md_alloc(dev_t dev, char *name)
 	disk->fops = &md_fops;
 	disk->private_data = mddev;
 	disk->queue = mddev->queue;
+	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
 	/* Allow extended partitions. This makes the
 	 * 'mdp' device redundant, but we can't really
 	 * remove it now.
 	 */
 	disk->flags |= GENHD_FL_EXT_DEVT;
-	add_disk(disk);
 	mddev->gendisk = disk;
+	/* As soon as we call add_disk(), another thread could get
+	 * through to md_open, so make sure it doesn't get too far
+	 */
+	mutex_lock(&mddev->open_mutex);
+	add_disk(disk);
+
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
 	if (error) {
@@ -4329,6 +4387,7 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
+	mutex_unlock(&mddev->open_mutex);
  abort:
 	mutex_unlock(&disks_mutex);
 	if (!error && mddev->kobj.sd) {
@@ -4423,7 +4482,9 @@ int md_run(mddev_t *mddev)
 		 * We don't want the data to overlap the metadata,
 		 * Internal Bitmap issues have been handled elsewhere.
 		 */
-		if (rdev->data_offset < rdev->sb_start) {
+		if (rdev->meta_bdev) {
+			/* Nothing to check */;
+		} else if (rdev->data_offset < rdev->sb_start) {
 			if (mddev->dev_sectors &&
 			    rdev->data_offset + mddev->dev_sectors
 			    > rdev->sb_start) {
@@ -4442,6 +4503,9 @@ int md_run(mddev_t *mddev) | |||
4442 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 4503 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
4443 | } | 4504 | } |
4444 | 4505 | ||
4506 | if (mddev->bio_set == NULL) | ||
4507 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); | ||
4508 | |||
4445 | spin_lock(&pers_lock); | 4509 | spin_lock(&pers_lock); |
4446 | pers = find_pers(mddev->level, mddev->clevel); | 4510 | pers = find_pers(mddev->level, mddev->clevel); |
4447 | if (!pers || !try_module_get(pers->owner)) { | 4511 | if (!pers || !try_module_get(pers->owner)) { |
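mddev->bio_set is created lazily here with a front pad of sizeof(mddev), i.e. one pointer, so every bio allocated from it carries a hidden slot just ahead of the struct bio for a back-pointer to its array. A hedged sketch of creating and tearing down such a bioset; the pool depth and names are illustrative:

#include <linux/bio.h>
#include <linux/errno.h>

static struct bio_set *demo_bs;

static int demo_bioset_up(void)
{
        /* 16 pool entries is illustrative; the second argument is
         * front_pad: bytes reserved immediately before each bio,
         * here enough for one back-pointer. */
        demo_bs = bioset_create(16, sizeof(void *));
        if (!demo_bs)
                return -ENOMEM;
        return 0;
}

static void demo_bioset_down(void)
{
        if (demo_bs)
                bioset_free(demo_bs);
}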
@@ -4504,7 +4568,6 @@ int md_run(mddev_t *mddev) | |||
4504 | /* may be over-ridden by personality */ | 4568 | /* may be over-ridden by personality */ |
4505 | mddev->resync_max_sectors = mddev->dev_sectors; | 4569 | mddev->resync_max_sectors = mddev->dev_sectors; |
4506 | 4570 | ||
4507 | mddev->barriers_work = 1; | ||
4508 | mddev->ok_start_degraded = start_dirty_degraded; | 4571 | mddev->ok_start_degraded = start_dirty_degraded; |
4509 | 4572 | ||
4510 | if (start_readonly && mddev->ro == 0) | 4573 | if (start_readonly && mddev->ro == 0) |
@@ -4555,7 +4618,8 @@ int md_run(mddev_t *mddev) | |||
4555 | mddev->safemode_timer.data = (unsigned long) mddev; | 4618 | mddev->safemode_timer.data = (unsigned long) mddev; |
4556 | mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ | 4619 | mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ |
4557 | mddev->in_sync = 1; | 4620 | mddev->in_sync = 1; |
4558 | 4621 | smp_wmb(); | |
4622 | mddev->ready = 1; | ||
4559 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4623 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4560 | if (rdev->raid_disk >= 0) { | 4624 | if (rdev->raid_disk >= 0) { |
4561 | char nm[20]; | 4625 | char nm[20]; |
@@ -4569,9 +4633,6 @@ int md_run(mddev_t *mddev) | |||
4569 | if (mddev->flags) | 4633 | if (mddev->flags) |
4570 | md_update_sb(mddev, 0); | 4634 | md_update_sb(mddev, 0); |
4571 | 4635 | ||
4572 | md_wakeup_thread(mddev->thread); | ||
4573 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
4574 | |||
4575 | md_new_event(mddev); | 4636 | md_new_event(mddev); |
4576 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 4637 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
4577 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 4638 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
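The smp_wmb() between the last initializing store (mddev->in_sync) and mddev->ready = 1 is the writer half of a barrier pair: a consumer that observes ready != 0 (md_stop() clears it again before tearing the personality down) must issue the matching read barrier before trusting the earlier stores. A generic sketch of the pairing, assuming a reader side of roughly this shape:

#include <linux/errno.h>
#include <linux/kernel.h>   /* pulls in the arch barrier definitions */

static int demo_state;      /* stands in for in_sync etc. */
static int demo_ready;      /* stands in for mddev->ready  */

static void demo_publish(void)
{
        demo_state = 42;    /* all initializing stores...       */
        smp_wmb();          /* ...ordered before the ready flag */
        demo_ready = 1;
}

static int demo_consume(void)
{
        if (!demo_ready)
                return -EAGAIN;
        smp_rmb();          /* pairs with smp_wmb() in the writer */
        return demo_state;  /* guaranteed to observe 42 */
}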
@@ -4592,8 +4653,13 @@ static int do_md_run(mddev_t *mddev) | |||
4592 | bitmap_destroy(mddev); | 4653 | bitmap_destroy(mddev); |
4593 | goto out; | 4654 | goto out; |
4594 | } | 4655 | } |
4656 | |||
4657 | md_wakeup_thread(mddev->thread); | ||
4658 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
4659 | |||
4595 | set_capacity(mddev->gendisk, mddev->array_sectors); | 4660 | set_capacity(mddev->gendisk, mddev->array_sectors); |
4596 | revalidate_disk(mddev->gendisk); | 4661 | revalidate_disk(mddev->gendisk); |
4662 | mddev->changed = 1; | ||
4597 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); | 4663 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); |
4598 | out: | 4664 | out: |
4599 | return err; | 4665 | return err; |
@@ -4682,24 +4748,22 @@ static void md_clean(mddev_t *mddev) | |||
4682 | mddev->sync_speed_min = mddev->sync_speed_max = 0; | 4748 | mddev->sync_speed_min = mddev->sync_speed_max = 0; |
4683 | mddev->recovery = 0; | 4749 | mddev->recovery = 0; |
4684 | mddev->in_sync = 0; | 4750 | mddev->in_sync = 0; |
4751 | mddev->changed = 0; | ||
4685 | mddev->degraded = 0; | 4752 | mddev->degraded = 0; |
4686 | mddev->barriers_work = 0; | ||
4687 | mddev->safemode = 0; | 4753 | mddev->safemode = 0; |
4688 | mddev->bitmap_info.offset = 0; | 4754 | mddev->bitmap_info.offset = 0; |
4689 | mddev->bitmap_info.default_offset = 0; | 4755 | mddev->bitmap_info.default_offset = 0; |
4690 | mddev->bitmap_info.chunksize = 0; | 4756 | mddev->bitmap_info.chunksize = 0; |
4691 | mddev->bitmap_info.daemon_sleep = 0; | 4757 | mddev->bitmap_info.daemon_sleep = 0; |
4692 | mddev->bitmap_info.max_write_behind = 0; | 4758 | mddev->bitmap_info.max_write_behind = 0; |
4693 | mddev->plug = NULL; | ||
4694 | } | 4759 | } |
4695 | 4760 | ||
4696 | void md_stop_writes(mddev_t *mddev) | 4761 | static void __md_stop_writes(mddev_t *mddev) |
4697 | { | 4762 | { |
4698 | if (mddev->sync_thread) { | 4763 | if (mddev->sync_thread) { |
4699 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 4764 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
4700 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 4765 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
4701 | md_unregister_thread(mddev->sync_thread); | 4766 | reap_sync_thread(mddev); |
4702 | mddev->sync_thread = NULL; | ||
4703 | } | 4767 | } |
4704 | 4768 | ||
4705 | del_timer_sync(&mddev->safemode_timer); | 4769 | del_timer_sync(&mddev->safemode_timer); |
@@ -4713,10 +4777,18 @@ void md_stop_writes(mddev_t *mddev) | |||
4713 | md_update_sb(mddev, 1); | 4777 | md_update_sb(mddev, 1); |
4714 | } | 4778 | } |
4715 | } | 4779 | } |
4780 | |||
4781 | void md_stop_writes(mddev_t *mddev) | ||
4782 | { | ||
4783 | mddev_lock(mddev); | ||
4784 | __md_stop_writes(mddev); | ||
4785 | mddev_unlock(mddev); | ||
4786 | } | ||
4716 | EXPORT_SYMBOL_GPL(md_stop_writes); | 4787 | EXPORT_SYMBOL_GPL(md_stop_writes); |
4717 | 4788 | ||
4718 | void md_stop(mddev_t *mddev) | 4789 | void md_stop(mddev_t *mddev) |
4719 | { | 4790 | { |
4791 | mddev->ready = 0; | ||
4720 | mddev->pers->stop(mddev); | 4792 | mddev->pers->stop(mddev); |
4721 | if (mddev->pers->sync_request && mddev->to_remove == NULL) | 4793 | if (mddev->pers->sync_request && mddev->to_remove == NULL) |
4722 | mddev->to_remove = &md_redundancy_group; | 4794 | mddev->to_remove = &md_redundancy_group; |
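The __md_stop_writes()/md_stop_writes() split is the kernel's usual locked-wrapper convention: the double-underscore function assumes the caller already holds the mddev lock (as md_set_readonly() and do_md_stop() do below), while the exported wrapper takes and drops it itself. A sketch of the convention with hypothetical names:

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);

/* Double-underscore variant: caller must already hold demo_lock. */
static void __demo_stop_writes(void)
{
        /* freeze sync thread, flush, update metadata, ... */
}

/* Exported variant: self-locking, for callers outside the
 * subsystem that do not hold the lock. */
void demo_stop_writes(void)
{
        mutex_lock(&demo_lock);
        __demo_stop_writes();
        mutex_unlock(&demo_lock);
}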
@@ -4736,7 +4808,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) | |||
4736 | goto out; | 4808 | goto out; |
4737 | } | 4809 | } |
4738 | if (mddev->pers) { | 4810 | if (mddev->pers) { |
4739 | md_stop_writes(mddev); | 4811 | __md_stop_writes(mddev); |
4740 | 4812 | ||
4741 | err = -ENXIO; | 4813 | err = -ENXIO; |
4742 | if (mddev->ro==1) | 4814 | if (mddev->ro==1) |
@@ -4773,10 +4845,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4773 | if (mddev->ro) | 4845 | if (mddev->ro) |
4774 | set_disk_ro(disk, 0); | 4846 | set_disk_ro(disk, 0); |
4775 | 4847 | ||
4776 | md_stop_writes(mddev); | 4848 | __md_stop_writes(mddev); |
4777 | md_stop(mddev); | 4849 | md_stop(mddev); |
4778 | mddev->queue->merge_bvec_fn = NULL; | 4850 | mddev->queue->merge_bvec_fn = NULL; |
4779 | mddev->queue->unplug_fn = NULL; | ||
4780 | mddev->queue->backing_dev_info.congested_fn = NULL; | 4851 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4781 | 4852 | ||
4782 | /* tell userspace to handle 'inactive' */ | 4853 | /* tell userspace to handle 'inactive' */ |
@@ -4791,6 +4862,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4791 | 4862 | ||
4792 | set_capacity(disk, 0); | 4863 | set_capacity(disk, 0); |
4793 | mutex_unlock(&mddev->open_mutex); | 4864 | mutex_unlock(&mddev->open_mutex); |
4865 | mddev->changed = 1; | ||
4794 | revalidate_disk(disk); | 4866 | revalidate_disk(disk); |
4795 | 4867 | ||
4796 | if (mddev->ro) | 4868 | if (mddev->ro) |
@@ -5148,17 +5220,31 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5148 | PTR_ERR(rdev)); | 5220 | PTR_ERR(rdev)); |
5149 | return PTR_ERR(rdev); | 5221 | return PTR_ERR(rdev); |
5150 | } | 5222 | } |
5151 | /* set save_raid_disk if appropriate */ | 5223 | /* set saved_raid_disk if appropriate */ |
5152 | if (!mddev->persistent) { | 5224 | if (!mddev->persistent) { |
5153 | if (info->state & (1<<MD_DISK_SYNC) && | 5225 | if (info->state & (1<<MD_DISK_SYNC) && |
5154 | info->raid_disk < mddev->raid_disks) | 5226 | info->raid_disk < mddev->raid_disks) { |
5155 | rdev->raid_disk = info->raid_disk; | 5227 | rdev->raid_disk = info->raid_disk; |
5156 | else | 5228 | set_bit(In_sync, &rdev->flags); |
5229 | } else | ||
5157 | rdev->raid_disk = -1; | 5230 | rdev->raid_disk = -1; |
5158 | } else | 5231 | } else |
5159 | super_types[mddev->major_version]. | 5232 | super_types[mddev->major_version]. |
5160 | validate_super(mddev, rdev); | 5233 | validate_super(mddev, rdev); |
5161 | rdev->saved_raid_disk = rdev->raid_disk; | 5234 | if ((info->state & (1<<MD_DISK_SYNC)) && |
5235 | (!test_bit(In_sync, &rdev->flags) || | ||
5236 | rdev->raid_disk != info->raid_disk)) { | ||
5237 | /* This was a hot-add request, but the event | ||
5238 | * counts don't match, so reject it. | ||
5239 | */ | ||
5240 | export_rdev(rdev); | ||
5241 | return -EINVAL; | ||
5242 | } | ||
5243 | |||
5244 | if (test_bit(In_sync, &rdev->flags)) | ||
5245 | rdev->saved_raid_disk = rdev->raid_disk; | ||
5246 | else | ||
5247 | rdev->saved_raid_disk = -1; | ||
5162 | 5248 | ||
5163 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ | 5249 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ |
5164 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | 5250 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
@@ -5188,6 +5274,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5188 | if (mddev->degraded) | 5274 | if (mddev->degraded) |
5189 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 5275 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
5190 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5276 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5277 | if (!err) | ||
5278 | md_new_event(mddev); | ||
5191 | md_wakeup_thread(mddev->thread); | 5279 | md_wakeup_thread(mddev->thread); |
5192 | return err; | 5280 | return err; |
5193 | } | 5281 | } |
@@ -5225,9 +5313,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5225 | 5313 | ||
5226 | if (!mddev->persistent) { | 5314 | if (!mddev->persistent) { |
5227 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | 5315 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); |
5228 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 5316 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
5229 | } else | 5317 | } else |
5230 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 5318 | rdev->sb_start = calc_dev_sboffset(rdev); |
5231 | rdev->sectors = rdev->sb_start; | 5319 | rdev->sectors = rdev->sb_start; |
5232 | 5320 | ||
5233 | err = bind_rdev_to_array(rdev, mddev); | 5321 | err = bind_rdev_to_array(rdev, mddev); |
@@ -5294,9 +5382,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
5294 | } | 5382 | } |
5295 | 5383 | ||
5296 | if (mddev->persistent) | 5384 | if (mddev->persistent) |
5297 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 5385 | rdev->sb_start = calc_dev_sboffset(rdev); |
5298 | else | 5386 | else |
5299 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 5387 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
5300 | 5388 | ||
5301 | rdev->sectors = rdev->sb_start; | 5389 | rdev->sectors = rdev->sb_start; |
5302 | 5390 | ||
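Both superblock-placement hunks replace a bare bd_inode->i_size read with i_size_read(), which uses a seqcount on 32-bit SMP kernels so the 64-bit size can never be observed half-updated. A short sketch, assuming only that the caller holds a reference to the block device:

#include <linux/fs.h>
#include <linux/types.h>

/* 512-byte sectors from the device inode, tear-free on 32-bit. */
static sector_t demo_dev_sectors(struct block_device *bdev)
{
        return i_size_read(bdev->bd_inode) >> 9;
}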
@@ -5507,7 +5595,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) | |||
5507 | * sb_start or, if that is <data_offset, it must fit before the size | 5595 | * sb_start or, if that is <data_offset, it must fit before the size |
5508 | * of each device. If num_sectors is zero, we find the largest size | 5596 | * of each device. If num_sectors is zero, we find the largest size |
5509 | * that fits. | 5597 | * that fits. |
5510 | |||
5511 | */ | 5598 | */ |
5512 | if (mddev->sync_thread) | 5599 | if (mddev->sync_thread) |
5513 | return -EBUSY; | 5600 | return -EBUSY; |
@@ -5544,6 +5631,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) | |||
5544 | mddev->delta_disks = raid_disks - mddev->raid_disks; | 5631 | mddev->delta_disks = raid_disks - mddev->raid_disks; |
5545 | 5632 | ||
5546 | rv = mddev->pers->check_reshape(mddev); | 5633 | rv = mddev->pers->check_reshape(mddev); |
5634 | if (rv < 0) | ||
5635 | mddev->delta_disks = 0; | ||
5547 | return rv; | 5636 | return rv; |
5548 | } | 5637 | } |
5549 | 5638 | ||
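Resetting delta_disks when check_reshape() fails is a stage-then-rollback idiom: the tentative value is written so the validator can inspect it, then undone if the validator vetoes, so a rejected request leaves no stale delta behind. An illustrative sketch (demo_conf and demo_validate are invented stand-ins, not md API):

#include <linux/errno.h>

struct demo_conf {
        int disks;
        int delta_disks;
};

static int demo_validate(struct demo_conf *c)
{
        return (c->disks + c->delta_disks > 0) ? 0 : -EINVAL;
}

static int demo_set_disks(struct demo_conf *c, int new_disks)
{
        int rv;

        c->delta_disks = new_disks - c->disks;   /* stage */
        rv = demo_validate(c);
        if (rv < 0)
                c->delta_disks = 0;              /* roll back */
        return rv;
}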
@@ -5951,16 +6040,14 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5951 | mddev_t *mddev = mddev_find(bdev->bd_dev); | 6040 | mddev_t *mddev = mddev_find(bdev->bd_dev); |
5952 | int err; | 6041 | int err; |
5953 | 6042 | ||
5954 | lock_kernel(); | ||
5955 | if (mddev->gendisk != bdev->bd_disk) { | 6043 | if (mddev->gendisk != bdev->bd_disk) { |
5956 | /* we are racing with mddev_put which is discarding this | 6044 | /* we are racing with mddev_put which is discarding this |
5957 | * bd_disk. | 6045 | * bd_disk. |
5958 | */ | 6046 | */ |
5959 | mddev_put(mddev); | 6047 | mddev_put(mddev); |
5960 | /* Wait until bdev->bd_disk is definitely gone */ | 6048 | /* Wait until bdev->bd_disk is definitely gone */ |
5961 | flush_scheduled_work(); | 6049 | flush_workqueue(md_misc_wq); |
5962 | /* Then retry the open from the top */ | 6050 | /* Then retry the open from the top */ |
5963 | unlock_kernel(); | ||
5964 | return -ERESTARTSYS; | 6051 | return -ERESTARTSYS; |
5965 | } | 6052 | } |
5966 | BUG_ON(mddev != bdev->bd_disk->private_data); | 6053 | BUG_ON(mddev != bdev->bd_disk->private_data); |
@@ -5972,9 +6059,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5972 | atomic_inc(&mddev->openers); | 6059 | atomic_inc(&mddev->openers); |
5973 | mutex_unlock(&mddev->open_mutex); | 6060 | mutex_unlock(&mddev->open_mutex); |
5974 | 6061 | ||
5975 | check_disk_size_change(mddev->gendisk, bdev); | 6062 | check_disk_change(bdev); |
5976 | out: | 6063 | out: |
5977 | unlock_kernel(); | ||
5978 | return err; | 6064 | return err; |
5979 | } | 6065 | } |
5980 | 6066 | ||
@@ -5983,13 +6069,26 @@ static int md_release(struct gendisk *disk, fmode_t mode) | |||
5983 | mddev_t *mddev = disk->private_data; | 6069 | mddev_t *mddev = disk->private_data; |
5984 | 6070 | ||
5985 | BUG_ON(!mddev); | 6071 | BUG_ON(!mddev); |
5986 | lock_kernel(); | ||
5987 | atomic_dec(&mddev->openers); | 6072 | atomic_dec(&mddev->openers); |
5988 | mddev_put(mddev); | 6073 | mddev_put(mddev); |
5989 | unlock_kernel(); | ||
5990 | 6074 | ||
5991 | return 0; | 6075 | return 0; |
5992 | } | 6076 | } |
6077 | |||
6078 | static int md_media_changed(struct gendisk *disk) | ||
6079 | { | ||
6080 | mddev_t *mddev = disk->private_data; | ||
6081 | |||
6082 | return mddev->changed; | ||
6083 | } | ||
6084 | |||
6085 | static int md_revalidate(struct gendisk *disk) | ||
6086 | { | ||
6087 | mddev_t *mddev = disk->private_data; | ||
6088 | |||
6089 | mddev->changed = 0; | ||
6090 | return 0; | ||
6091 | } | ||
5993 | static const struct block_device_operations md_fops = | 6092 | static const struct block_device_operations md_fops = |
5994 | { | 6093 | { |
5995 | .owner = THIS_MODULE, | 6094 | .owner = THIS_MODULE, |
@@ -6000,6 +6099,8 @@ static const struct block_device_operations md_fops = | |||
6000 | .compat_ioctl = md_compat_ioctl, | 6099 | .compat_ioctl = md_compat_ioctl, |
6001 | #endif | 6100 | #endif |
6002 | .getgeo = md_getgeo, | 6101 | .getgeo = md_getgeo, |
6102 | .media_changed = md_media_changed, | ||
6103 | .revalidate_disk= md_revalidate, | ||
6003 | }; | 6104 | }; |
6004 | 6105 | ||
6005 | static int md_thread(void * arg) | 6106 | static int md_thread(void * arg) |
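md_media_changed()/md_revalidate() plus the two new md_fops entries give md the media-change plumbing removable-media drivers use: check_disk_change() in md_open() calls ->media_changed, and if it returns true, ->revalidate_disk runs and clears the flag that do_md_run() and do_md_stop() now set. A condensed sketch of that wiring for a hypothetical driver (these ops exist in this era's block_device_operations; later kernels replaced media_changed with check_events):

#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/module.h>

struct demo_dev {
        int changed;
};

static int demo_media_changed(struct gendisk *disk)
{
        struct demo_dev *d = disk->private_data;
        return d->changed;
}

static int demo_revalidate(struct gendisk *disk)
{
        struct demo_dev *d = disk->private_data;
        d->changed = 0;          /* change acknowledged */
        return 0;
}

static const struct block_device_operations demo_fops = {
        .owner           = THIS_MODULE,
        .media_changed   = demo_media_changed,
        .revalidate_disk = demo_revalidate,
};

Setting d->changed = 1 after a capacity change and letting the next open call check_disk_change() is what prompts userspace to re-read the size.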
@@ -6036,8 +6137,8 @@ static int md_thread(void * arg) | |||
6036 | thread->timeout); | 6137 | thread->timeout); |
6037 | 6138 | ||
6038 | clear_bit(THREAD_WAKEUP, &thread->flags); | 6139 | clear_bit(THREAD_WAKEUP, &thread->flags); |
6039 | 6140 | if (!kthread_should_stop()) | |
6040 | thread->run(thread->mddev); | 6141 | thread->run(thread->mddev); |
6041 | } | 6142 | } |
6042 | 6143 | ||
6043 | return 0; | 6144 | return 0; |
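The added kthread_should_stop() test matters because kthread_stop() wakes the thread: without the re-check, a thread woken for termination would call thread->run() one last time against an mddev that may already be mid-teardown. A generic sketch of the guarded worker loop (demo_* names and the payload stub are illustrative):

#include <linux/bitops.h>
#include <linux/kthread.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static unsigned long demo_flags;
#define DEMO_WAKEUP 0

static void demo_payload(void)
{
        /* the thread->run() equivalent */
}

static int demo_thread(void *arg)
{
        while (!kthread_should_stop()) {
                /* a waker does set_bit(DEMO_WAKEUP, &demo_flags) and
                 * wake_up(&demo_wait); kthread_stop() also wakes us,
                 * with no work bit set */
                wait_event_interruptible(demo_wait,
                        test_bit(DEMO_WAKEUP, &demo_flags) ||
                        kthread_should_stop());
                clear_bit(DEMO_WAKEUP, &demo_flags);
                if (!kthread_should_stop())
                        demo_payload();   /* the added guard */
        }
        return 0;
}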
@@ -6118,7 +6219,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
6118 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6219 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
6119 | md_wakeup_thread(mddev->thread); | 6220 | md_wakeup_thread(mddev->thread); |
6120 | if (mddev->event_work.func) | 6221 | if (mddev->event_work.func) |
6121 | schedule_work(&mddev->event_work); | 6222 | queue_work(md_misc_wq, &mddev->event_work); |
6122 | md_new_event_inintr(mddev); | 6223 | md_new_event_inintr(mddev); |
6123 | } | 6224 | } |
6124 | 6225 | ||
@@ -6209,7 +6310,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) | |||
6209 | * rt is a sector_t, so could be 32bit or 64bit. | 6310 | * rt is a sector_t, so could be 32bit or 64bit. |
6210 | * So we divide before multiply in case it is 32bit and close | 6311 | * So we divide before multiply in case it is 32bit and close |
6211 | * to the limit. | 6312 | * to the limit. |
6212 | * We scale the divisor (db) by 32 to avoid loosing precision | 6313 | * We scale the divisor (db) by 32 to avoid losing precision |
6213 | * near the end of resync when the number of remaining sectors | 6314 | * near the end of resync when the number of remaining sectors |
6214 | * is close to 'db'. | 6315 | * is close to 'db'. |
6215 | * We then divide rt by 32 after multiplying by db to compensate. | 6316 | * We then divide rt by 32 after multiplying by db to compensate. |
@@ -6631,14 +6732,6 @@ int md_allow_write(mddev_t *mddev) | |||
6631 | } | 6732 | } |
6632 | EXPORT_SYMBOL_GPL(md_allow_write); | 6733 | EXPORT_SYMBOL_GPL(md_allow_write); |
6633 | 6734 | ||
6634 | void md_unplug(mddev_t *mddev) | ||
6635 | { | ||
6636 | if (mddev->queue) | ||
6637 | blk_unplug(mddev->queue); | ||
6638 | if (mddev->plug) | ||
6639 | mddev->plug->unplug_fn(mddev->plug); | ||
6640 | } | ||
6641 | |||
6642 | #define SYNC_MARKS 10 | 6735 | #define SYNC_MARKS 10 |
6643 | #define SYNC_MARK_STEP (3*HZ) | 6736 | #define SYNC_MARK_STEP (3*HZ) |
6644 | void md_do_sync(mddev_t *mddev) | 6737 | void md_do_sync(mddev_t *mddev) |
@@ -6790,8 +6883,8 @@ void md_do_sync(mddev_t *mddev) | |||
6790 | * Tune reconstruction: | 6883 | * Tune reconstruction: |
6791 | */ | 6884 | */ |
6792 | window = 32*(PAGE_SIZE/512); | 6885 | window = 32*(PAGE_SIZE/512); |
6793 | printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", | 6886 | printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", |
6794 | window/2,(unsigned long long) max_sectors/2); | 6887 | window/2, (unsigned long long)max_sectors/2); |
6795 | 6888 | ||
6796 | atomic_set(&mddev->recovery_active, 0); | 6889 | atomic_set(&mddev->recovery_active, 0); |
6797 | last_check = 0; | 6890 | last_check = 0; |
@@ -6802,7 +6895,7 @@ void md_do_sync(mddev_t *mddev) | |||
6802 | desc, mdname(mddev)); | 6895 | desc, mdname(mddev)); |
6803 | mddev->curr_resync = j; | 6896 | mddev->curr_resync = j; |
6804 | } | 6897 | } |
6805 | mddev->curr_resync_completed = mddev->curr_resync; | 6898 | mddev->curr_resync_completed = j; |
6806 | 6899 | ||
6807 | while (j < max_sectors) { | 6900 | while (j < max_sectors) { |
6808 | sector_t sectors; | 6901 | sector_t sectors; |
@@ -6817,11 +6910,9 @@ void md_do_sync(mddev_t *mddev) | |||
6817 | >= mddev->resync_max - mddev->curr_resync_completed | 6910 | >= mddev->resync_max - mddev->curr_resync_completed |
6818 | )) { | 6911 | )) { |
6819 | /* time to update curr_resync_completed */ | 6912 | /* time to update curr_resync_completed */ |
6820 | md_unplug(mddev); | ||
6821 | wait_event(mddev->recovery_wait, | 6913 | wait_event(mddev->recovery_wait, |
6822 | atomic_read(&mddev->recovery_active) == 0); | 6914 | atomic_read(&mddev->recovery_active) == 0); |
6823 | mddev->curr_resync_completed = | 6915 | mddev->curr_resync_completed = j; |
6824 | mddev->curr_resync; | ||
6825 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 6916 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); |
6826 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 6917 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
6827 | } | 6918 | } |
@@ -6894,7 +6985,6 @@ void md_do_sync(mddev_t *mddev) | |||
6894 | * about not overloading the IO subsystem. (things like an | 6985 | * about not overloading the IO subsystem. (things like an |
6895 | * e2fsck being done on the RAID array should execute fast) | 6986 | * e2fsck being done on the RAID array should execute fast) |
6896 | */ | 6987 | */ |
6897 | md_unplug(mddev); | ||
6898 | cond_resched(); | 6988 | cond_resched(); |
6899 | 6989 | ||
6900 | currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 | 6990 | currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 |
@@ -6913,8 +7003,6 @@ void md_do_sync(mddev_t *mddev) | |||
6913 | * this also signals 'finished resyncing' to md_stop | 7003 | * this also signals 'finished resyncing' to md_stop |
6914 | */ | 7004 | */ |
6915 | out: | 7005 | out: |
6916 | md_unplug(mddev); | ||
6917 | |||
6918 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 7006 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
6919 | 7007 | ||
6920 | /* tell personality that we are finished */ | 7008 | /* tell personality that we are finished */ |
@@ -6957,9 +7045,6 @@ void md_do_sync(mddev_t *mddev) | |||
6957 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | 7045 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
6958 | mddev->resync_min = mddev->curr_resync_completed; | 7046 | mddev->resync_min = mddev->curr_resync_completed; |
6959 | mddev->curr_resync = 0; | 7047 | mddev->curr_resync = 0; |
6960 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6961 | mddev->curr_resync_completed = 0; | ||
6962 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||
6963 | wake_up(&resync_wait); | 7048 | wake_up(&resync_wait); |
6964 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 7049 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
6965 | md_wakeup_thread(mddev->thread); | 7050 | md_wakeup_thread(mddev->thread); |
@@ -6977,7 +7062,6 @@ void md_do_sync(mddev_t *mddev) | |||
6977 | } | 7062 | } |
6978 | EXPORT_SYMBOL_GPL(md_do_sync); | 7063 | EXPORT_SYMBOL_GPL(md_do_sync); |
6979 | 7064 | ||
6980 | |||
6981 | static int remove_and_add_spares(mddev_t *mddev) | 7065 | static int remove_and_add_spares(mddev_t *mddev) |
6982 | { | 7066 | { |
6983 | mdk_rdev_t *rdev; | 7067 | mdk_rdev_t *rdev; |
@@ -7000,10 +7084,11 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7000 | } | 7084 | } |
7001 | } | 7085 | } |
7002 | 7086 | ||
7003 | if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { | 7087 | if (mddev->degraded && !mddev->recovery_disabled) { |
7004 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7088 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7005 | if (rdev->raid_disk >= 0 && | 7089 | if (rdev->raid_disk >= 0 && |
7006 | !test_bit(In_sync, &rdev->flags) && | 7090 | !test_bit(In_sync, &rdev->flags) && |
7091 | !test_bit(Faulty, &rdev->flags) && | ||
7007 | !test_bit(Blocked, &rdev->flags)) | 7092 | !test_bit(Blocked, &rdev->flags)) |
7008 | spares++; | 7093 | spares++; |
7009 | if (rdev->raid_disk < 0 | 7094 | if (rdev->raid_disk < 0 |
@@ -7026,6 +7111,45 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7026 | } | 7111 | } |
7027 | return spares; | 7112 | return spares; |
7028 | } | 7113 | } |
7114 | |||
7115 | static void reap_sync_thread(mddev_t *mddev) | ||
7116 | { | ||
7117 | mdk_rdev_t *rdev; | ||
7118 | |||
7119 | /* resync has finished, collect result */ | ||
7120 | md_unregister_thread(mddev->sync_thread); | ||
7121 | mddev->sync_thread = NULL; | ||
7122 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||
7123 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
7124 | /* success...*/ | ||
7125 | /* activate any spares */ | ||
7126 | if (mddev->pers->spare_active(mddev)) | ||
7127 | sysfs_notify(&mddev->kobj, NULL, | ||
7128 | "degraded"); | ||
7129 | } | ||
7130 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
7131 | mddev->pers->finish_reshape) | ||
7132 | mddev->pers->finish_reshape(mddev); | ||
7133 | md_update_sb(mddev, 1); | ||
7134 | |||
7135 | /* if array is no-longer degraded, then any saved_raid_disk | ||
7136 | * information must be scrapped | ||
7137 | */ | ||
7138 | if (!mddev->degraded) | ||
7139 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7140 | rdev->saved_raid_disk = -1; | ||
7141 | |||
7142 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
7143 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
7144 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
7145 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); | ||
7146 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
7147 | /* flag recovery needed just to double check */ | ||
7148 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7149 | sysfs_notify_dirent_safe(mddev->sysfs_action); | ||
7150 | md_new_event(mddev); | ||
7151 | } | ||
7152 | |||
7029 | /* | 7153 | /* |
7030 | * This routine is regularly called by all per-raid-array threads to | 7154 | * This routine is regularly called by all per-raid-array threads to |
7031 | * deal with generic issues like resync and super-block update. | 7155 | * deal with generic issues like resync and super-block update. |
@@ -7050,8 +7174,8 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7050 | */ | 7174 | */ |
7051 | void md_check_recovery(mddev_t *mddev) | 7175 | void md_check_recovery(mddev_t *mddev) |
7052 | { | 7176 | { |
7053 | mdk_rdev_t *rdev; | 7177 | if (mddev->suspended) |
7054 | 7178 | return; | |
7055 | 7179 | ||
7056 | if (mddev->bitmap) | 7180 | if (mddev->bitmap) |
7057 | bitmap_daemon_work(mddev); | 7181 | bitmap_daemon_work(mddev); |
@@ -7087,7 +7211,20 @@ void md_check_recovery(mddev_t *mddev) | |||
7087 | /* Only thing we do on a ro array is remove | 7211 | /* Only thing we do on a ro array is remove |
7088 | * failed devices. | 7212 | * failed devices. |
7089 | */ | 7213 | */ |
7090 | remove_and_add_spares(mddev); | 7214 | mdk_rdev_t *rdev; |
7215 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7216 | if (rdev->raid_disk >= 0 && | ||
7217 | !test_bit(Blocked, &rdev->flags) && | ||
7218 | test_bit(Faulty, &rdev->flags) && | ||
7219 | atomic_read(&rdev->nr_pending)==0) { | ||
7220 | if (mddev->pers->hot_remove_disk( | ||
7221 | mddev, rdev->raid_disk)==0) { | ||
7222 | char nm[20]; | ||
7223 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7224 | sysfs_remove_link(&mddev->kobj, nm); | ||
7225 | rdev->raid_disk = -1; | ||
7226 | } | ||
7227 | } | ||
7091 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 7228 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
7092 | goto unlock; | 7229 | goto unlock; |
7093 | } | 7230 | } |
@@ -7120,34 +7257,7 @@ void md_check_recovery(mddev_t *mddev) | |||
7120 | goto unlock; | 7257 | goto unlock; |
7121 | } | 7258 | } |
7122 | if (mddev->sync_thread) { | 7259 | if (mddev->sync_thread) { |
7123 | /* resync has finished, collect result */ | 7260 | reap_sync_thread(mddev); |
7124 | md_unregister_thread(mddev->sync_thread); | ||
7125 | mddev->sync_thread = NULL; | ||
7126 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||
7127 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
7128 | /* success...*/ | ||
7129 | /* activate any spares */ | ||
7130 | if (mddev->pers->spare_active(mddev)) | ||
7131 | sysfs_notify(&mddev->kobj, NULL, | ||
7132 | "degraded"); | ||
7133 | } | ||
7134 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
7135 | mddev->pers->finish_reshape) | ||
7136 | mddev->pers->finish_reshape(mddev); | ||
7137 | md_update_sb(mddev, 1); | ||
7138 | |||
7139 | /* if array is no-longer degraded, then any saved_raid_disk | ||
7140 | * information must be scrapped | ||
7141 | */ | ||
7142 | if (!mddev->degraded) | ||
7143 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7144 | rdev->saved_raid_disk = -1; | ||
7145 | |||
7146 | mddev->recovery = 0; | ||
7147 | /* flag recovery needed just to double check */ | ||
7148 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7149 | sysfs_notify_dirent_safe(mddev->sysfs_action); | ||
7150 | md_new_event(mddev); | ||
7151 | goto unlock; | 7261 | goto unlock; |
7152 | } | 7262 | } |
7153 | /* Set RUNNING before clearing NEEDED to avoid | 7263 | /* Set RUNNING before clearing NEEDED to avoid |
@@ -7205,7 +7315,11 @@ void md_check_recovery(mddev_t *mddev) | |||
7205 | " thread...\n", | 7315 | " thread...\n", |
7206 | mdname(mddev)); | 7316 | mdname(mddev)); |
7207 | /* leave the spares where they are, it shouldn't hurt */ | 7317 | /* leave the spares where they are, it shouldn't hurt */ |
7208 | mddev->recovery = 0; | 7318 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
7319 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
7320 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
7321 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); | ||
7322 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
7209 | } else | 7323 | } else |
7210 | md_wakeup_thread(mddev->sync_thread); | 7324 | md_wakeup_thread(mddev->sync_thread); |
7211 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 7325 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
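Replacing the blanket mddev->recovery = 0 with individual clear_bit() calls (here and in reap_sync_thread() above) preserves any recovery bits not explicitly listed; the likely beneficiary is MD_RECOVERY_FROZEN, which can be set deliberately from sysfs, though that reading is an inference from the flag set rather than stated in the patch. A tiny sketch of the difference:

#include <linux/bitops.h>

enum { DEMO_RUNNING, DEMO_SYNC, DEMO_RESHAPE, DEMO_FROZEN };

static void demo_reset_selected(unsigned long *state)
{
        /* '*state = 0' would also wipe DEMO_FROZEN, which another
         * path may have set on purpose; clearing named bits keeps
         * everything unlisted intact. */
        clear_bit(DEMO_RUNNING, state);
        clear_bit(DEMO_SYNC, state);
        clear_bit(DEMO_RESHAPE, state);
}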
@@ -7278,12 +7392,23 @@ static void md_geninit(void) | |||
7278 | 7392 | ||
7279 | static int __init md_init(void) | 7393 | static int __init md_init(void) |
7280 | { | 7394 | { |
7281 | if (register_blkdev(MD_MAJOR, "md")) | 7395 | int ret = -ENOMEM; |
7282 | return -1; | 7396 | |
7283 | if ((mdp_major=register_blkdev(0, "mdp"))<=0) { | 7397 | md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); |
7284 | unregister_blkdev(MD_MAJOR, "md"); | 7398 | if (!md_wq) |
7285 | return -1; | 7399 | goto err_wq; |
7286 | } | 7400 | |
7401 | md_misc_wq = alloc_workqueue("md_misc", 0, 0); | ||
7402 | if (!md_misc_wq) | ||
7403 | goto err_misc_wq; | ||
7404 | |||
7405 | if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) | ||
7406 | goto err_md; | ||
7407 | |||
7408 | if ((ret = register_blkdev(0, "mdp")) < 0) | ||
7409 | goto err_mdp; | ||
7410 | mdp_major = ret; | ||
7411 | |||
7287 | blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, | 7412 | blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, |
7288 | md_probe, NULL, NULL); | 7413 | md_probe, NULL, NULL); |
7289 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, | 7414 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, |
@@ -7294,8 +7419,16 @@ static int __init md_init(void) | |||
7294 | 7419 | ||
7295 | md_geninit(); | 7420 | md_geninit(); |
7296 | return 0; | 7421 | return 0; |
7297 | } | ||
7298 | 7422 | ||
7423 | err_mdp: | ||
7424 | unregister_blkdev(MD_MAJOR, "md"); | ||
7425 | err_md: | ||
7426 | destroy_workqueue(md_misc_wq); | ||
7427 | err_misc_wq: | ||
7428 | destroy_workqueue(md_wq); | ||
7429 | err_wq: | ||
7430 | return ret; | ||
7431 | } | ||
7299 | 7432 | ||
7300 | #ifndef MODULE | 7433 | #ifndef MODULE |
7301 | 7434 | ||
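The reworked md_init() is the standard goto-unwind ladder: each resource that is acquired gains a label in the mirror-image error path, so a failure at any step releases exactly what came before it, in reverse order, and the real errno propagates instead of -1. A self-contained sketch of the shape (demo_* names are illustrative); the md_exit() hunk that follows mirrors it by destroying both workqueues last:

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_a_wq, *demo_b_wq;
static int demo_major;

static int __init demo_init(void)
{
        int ret = -ENOMEM;

        demo_a_wq = alloc_workqueue("demo_a", WQ_MEM_RECLAIM, 0);
        if (!demo_a_wq)
                goto err_a;

        demo_b_wq = alloc_workqueue("demo_b", 0, 0);
        if (!demo_b_wq)
                goto err_b;

        ret = register_blkdev(0, "demo");   /* 0 = allocate a major */
        if (ret < 0)
                goto err_blkdev;
        demo_major = ret;
        return 0;

err_blkdev:
        destroy_workqueue(demo_b_wq);
err_b:
        destroy_workqueue(demo_a_wq);
err_a:
        return ret;
}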
@@ -7382,6 +7515,8 @@ static __exit void md_exit(void) | |||
7382 | export_array(mddev); | 7515 | export_array(mddev); |
7383 | mddev->hold_active = 0; | 7516 | mddev->hold_active = 0; |
7384 | } | 7517 | } |
7518 | destroy_workqueue(md_misc_wq); | ||
7519 | destroy_workqueue(md_wq); | ||
7385 | } | 7520 | } |
7386 | 7521 | ||
7387 | subsys_initcall(md_init); | 7522 | subsys_initcall(md_init); |