author     Dan Williams <dan.j.williams@intel.com>  2009-07-14 14:48:22 -0400
committer  Dan Williams <dan.j.williams@intel.com>  2009-08-29 22:09:26 -0400
commit     36d1c6476be51101778882897b315bd928c8c7b5 (patch)
tree       55b4ecd93ce9c22722c9c9da0dd28a2d2f7c082d /drivers/md
parent     a11034b4282515fd7d9f6fdc0a1380781da461c3 (diff)
md/raid6: move the spare page to a percpu allocation
In preparation for asynchronous handling of raid6 operations, move the
spare page to a percpu allocation so that multiple synchronous raid6
recovery operations can run simultaneously.
Make this allocation cpu-hotplug aware to maximize allocation
efficiency.
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
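The heart of the change is the access idiom handle_parity_checks6() switches to: pin the current cpu, borrow that cpu's spare page as scratch for the P/Q check, and unpin when done. A minimal sketch of the idiom (illustrative only; spare_page_example() is a hypothetical helper -- the patch open-codes this inside handle_parity_checks6()):

    /* Sketch of the per-cpu spare page idiom; not part of the patch. */
    static void spare_page_example(raid5_conf_t *conf)
    {
            unsigned long cpu;
            struct page *tmp_page;

            cpu = get_cpu();        /* disables preemption, returns this cpu's id */
            tmp_page = per_cpu_ptr(conf->percpu, cpu)->spare_page;
            /* ... scratch work with tmp_page; must not sleep here ... */
            put_cpu();              /* re-enables preemption */
    }

Because preemption is disabled between get_cpu() and put_cpu(), no other task can be scheduled onto this cpu and hand the same spare page to a second raid6 check.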
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/raid5.c  252
-rw-r--r--  drivers/md/raid5.h    9
2 files changed, 175 insertions, 86 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9411466f71de..5359236a1ec7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -48,6 +48,7 @@
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
 #include <linux/seq_file.h>
+#include <linux/cpu.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -2565,14 +2566,15 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 
 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
                                   struct stripe_head_state *s,
-                                  struct r6_state *r6s, struct page *tmp_page,
-                                  int disks)
+                                  struct r6_state *r6s, int disks)
 {
         int update_p = 0, update_q = 0;
         struct r5dev *dev;
         int pd_idx = sh->pd_idx;
         int qd_idx = sh->qd_idx;
+        unsigned long cpu;
+        struct page *tmp_page;
 
         set_bit(STRIPE_HANDLE, &sh->state);
 
@@ -2583,78 +2585,75 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
          * case we can only check one of them, possibly using the
          * other to generate missing data
          */
-
-        /* If !tmp_page, we cannot do the calculations,
-         * but as we have set STRIPE_HANDLE, we will soon be called
-         * by stripe_handle with a tmp_page - just wait until then.
-         */
-        if (tmp_page) {
-                if (s->failed == r6s->q_failed) {
-                        /* The only possible failed device holds 'Q', so it
-                         * makes sense to check P (If anything else were failed,
-                         * we would have used P to recreate it).
-                         */
-                        compute_block_1(sh, pd_idx, 1);
-                        if (!page_is_zero(sh->dev[pd_idx].page)) {
-                                compute_block_1(sh, pd_idx, 0);
-                                update_p = 1;
-                        }
-                }
-                if (!r6s->q_failed && s->failed < 2) {
-                        /* q is not failed, and we didn't use it to generate
-                         * anything, so it makes sense to check it
-                         */
-                        memcpy(page_address(tmp_page),
-                               page_address(sh->dev[qd_idx].page),
-                               STRIPE_SIZE);
-                        compute_parity6(sh, UPDATE_PARITY);
-                        if (memcmp(page_address(tmp_page),
-                                   page_address(sh->dev[qd_idx].page),
-                                   STRIPE_SIZE) != 0) {
-                                clear_bit(STRIPE_INSYNC, &sh->state);
-                                update_q = 1;
-                        }
+        cpu = get_cpu();
+        tmp_page = per_cpu_ptr(conf->percpu, cpu)->spare_page;
+        if (s->failed == r6s->q_failed) {
+                /* The only possible failed device holds 'Q', so it
+                 * makes sense to check P (If anything else were failed,
+                 * we would have used P to recreate it).
+                 */
+                compute_block_1(sh, pd_idx, 1);
+                if (!page_is_zero(sh->dev[pd_idx].page)) {
+                        compute_block_1(sh, pd_idx, 0);
+                        update_p = 1;
                 }
-                if (update_p || update_q) {
-                        conf->mddev->resync_mismatches += STRIPE_SECTORS;
-                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-                                /* don't try to repair!! */
-                                update_p = update_q = 0;
+        }
+        if (!r6s->q_failed && s->failed < 2) {
+                /* q is not failed, and we didn't use it to generate
+                 * anything, so it makes sense to check it
+                 */
+                memcpy(page_address(tmp_page),
+                       page_address(sh->dev[qd_idx].page),
+                       STRIPE_SIZE);
+                compute_parity6(sh, UPDATE_PARITY);
+                if (memcmp(page_address(tmp_page),
+                           page_address(sh->dev[qd_idx].page),
+                           STRIPE_SIZE) != 0) {
+                        clear_bit(STRIPE_INSYNC, &sh->state);
+                        update_q = 1;
                 }
+        }
+        put_cpu();
 
-                /* now write out any block on a failed drive,
-                 * or P or Q if they need it
-                 */
+        if (update_p || update_q) {
+                conf->mddev->resync_mismatches += STRIPE_SECTORS;
+                if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+                        /* don't try to repair!! */
+                        update_p = update_q = 0;
+        }
 
-                if (s->failed == 2) {
-                        dev = &sh->dev[r6s->failed_num[1]];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
-                if (s->failed >= 1) {
-                        dev = &sh->dev[r6s->failed_num[0]];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
+        /* now write out any block on a failed drive,
+         * or P or Q if they need it
+         */
 
-                if (update_p) {
-                        dev = &sh->dev[pd_idx];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
-                if (update_q) {
-                        dev = &sh->dev[qd_idx];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
-                clear_bit(STRIPE_DEGRADED, &sh->state);
+        if (s->failed == 2) {
+                dev = &sh->dev[r6s->failed_num[1]];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
+        }
+        if (s->failed >= 1) {
+                dev = &sh->dev[r6s->failed_num[0]];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
+        }
 
-                set_bit(STRIPE_INSYNC, &sh->state);
+        if (update_p) {
+                dev = &sh->dev[pd_idx];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
+        }
+        if (update_q) {
+                dev = &sh->dev[qd_idx];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
         }
+        clear_bit(STRIPE_DEGRADED, &sh->state);
+
+        set_bit(STRIPE_INSYNC, &sh->state);
 }
 
 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
@@ -3009,7 +3008,7 @@ static bool handle_stripe5(struct stripe_head *sh)
         return blocked_rdev == NULL;
 }
 
-static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe6(struct stripe_head *sh)
 {
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks;
@@ -3164,7 +3163,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
          * data is available
          */
         if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
-                handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
+                handle_parity_checks6(conf, sh, &s, &r6s, disks);
 
         if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
                 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3247,16 +3246,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 }
 
 /* returns true if the stripe was handled */
-static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe(struct stripe_head *sh)
 {
         if (sh->raid_conf->level == 6)
-                return handle_stripe6(sh, tmp_page);
+                return handle_stripe6(sh);
         else
                 return handle_stripe5(sh);
 }
 
-
-
 static void raid5_activate_delayed(raid5_conf_t *conf)
 {
         if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -4047,7 +4044,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         spin_unlock(&sh->lock);
 
         /* wait for any blocked device to be handled */
-        while(unlikely(!handle_stripe(sh, NULL)))
+        while (unlikely(!handle_stripe(sh)))
                 ;
         release_stripe(sh);
 
@@ -4104,7 +4101,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                         return handled;
                 }
 
-                handle_stripe(sh, NULL);
+                handle_stripe(sh);
                 release_stripe(sh);
                 handled++;
         }
@@ -4168,7 +4165,7 @@ static void raid5d(mddev_t *mddev)
                 spin_unlock_irq(&conf->device_lock);
 
                 handled++;
-                handle_stripe(sh, conf->spare_page);
+                handle_stripe(sh);
                 release_stripe(sh);
 
                 spin_lock_irq(&conf->device_lock);
@@ -4309,15 +4306,104 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
         return sectors * (raid_disks - conf->max_degraded);
 }
 
+static void raid5_free_percpu(raid5_conf_t *conf)
+{
+        struct raid5_percpu *percpu;
+        unsigned long cpu;
+
+        if (!conf->percpu)
+                return;
+
+        get_online_cpus();
+        for_each_possible_cpu(cpu) {
+                percpu = per_cpu_ptr(conf->percpu, cpu);
+                safe_put_page(percpu->spare_page);
+        }
+#ifdef CONFIG_HOTPLUG_CPU
+        unregister_cpu_notifier(&conf->cpu_notify);
+#endif
+        put_online_cpus();
+
+        free_percpu(conf->percpu);
+}
+
 static void free_conf(raid5_conf_t *conf)
 {
         shrink_stripes(conf);
-        safe_put_page(conf->spare_page);
+        raid5_free_percpu(conf);
         kfree(conf->disks);
         kfree(conf->stripe_hashtbl);
         kfree(conf);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
+                              void *hcpu)
+{
+        raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
+        long cpu = (long)hcpu;
+        struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
+        switch (action) {
+        case CPU_UP_PREPARE:
+        case CPU_UP_PREPARE_FROZEN:
+                if (!percpu->spare_page)
+                        percpu->spare_page = alloc_page(GFP_KERNEL);
+                if (!percpu->spare_page) {
+                        pr_err("%s: failed memory allocation for cpu%ld\n",
+                               __func__, cpu);
+                        return NOTIFY_BAD;
+                }
+                break;
+        case CPU_DEAD:
+        case CPU_DEAD_FROZEN:
+                safe_put_page(percpu->spare_page);
+                percpu->spare_page = NULL;
+                break;
+        default:
+                break;
+        }
+        return NOTIFY_OK;
+}
+#endif
+
+static int raid5_alloc_percpu(raid5_conf_t *conf)
+{
+        unsigned long cpu;
+        struct page *spare_page;
+        struct raid5_percpu *allcpus;
+        int err;
+
+        /* the only percpu data is the raid6 spare page */
+        if (conf->level != 6)
+                return 0;
+
+        allcpus = alloc_percpu(struct raid5_percpu);
+        if (!allcpus)
+                return -ENOMEM;
+        conf->percpu = allcpus;
+
+        get_online_cpus();
+        err = 0;
+        for_each_present_cpu(cpu) {
+                spare_page = alloc_page(GFP_KERNEL);
+                if (!spare_page) {
+                        err = -ENOMEM;
+                        break;
+                }
+                per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+        }
+#ifdef CONFIG_HOTPLUG_CPU
+        conf->cpu_notify.notifier_call = raid456_cpu_notify;
+        conf->cpu_notify.priority = 0;
+        if (err == 0)
+                err = register_cpu_notifier(&conf->cpu_notify);
+#endif
+        put_online_cpus();
+
+        return err;
+}
+
 static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
         raid5_conf_t *conf;
@@ -4372,11 +4458,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
         if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
                 goto abort;
 
-        if (mddev->new_level == 6) {
-                conf->spare_page = alloc_page(GFP_KERNEL);
-                if (!conf->spare_page)
-                        goto abort;
-        }
+        conf->level = mddev->new_level;
+        if (raid5_alloc_percpu(conf) != 0)
+                goto abort;
+
         spin_lock_init(&conf->device_lock);
         init_waitqueue_head(&conf->wait_for_stripe);
         init_waitqueue_head(&conf->wait_for_overlap);
@@ -4412,7 +4497,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
         }
 
         conf->chunk_size = mddev->new_chunk;
-        conf->level = mddev->new_level;
         if (conf->level == 6)
                 conf->max_degraded = 2;
         else
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 52ba99954dec..07a7a4102f05 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -383,8 +383,13 @@ struct raid5_private_data {
                                             * (fresh device added).
                                             * Cleared when a sync completes.
                                             */
-
-        struct page             *spare_page; /* Used when checking P/Q in raid6 */
+        /* per cpu variables */
+        struct raid5_percpu {
+                struct page     *spare_page; /* Used when checking P/Q in raid6 */
+        } *percpu;
+#ifdef CONFIG_HOTPLUG_CPU
+        struct notifier_block   cpu_notify;
+#endif
 
         /*
          * Free stripes pool
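Taken together, setup_conf() now records conf->level up front and calls raid5_alloc_percpu(), which gives every present cpu its own spare page and registers the hotplug notifier; raid5_free_percpu() undoes both. A condensed lifecycle sketch (illustrative only; percpu_spare_init() is a hypothetical name, and per-page error unwinding is omitted -- the real raid5_alloc_percpu() above returns -ENOMEM when any alloc_page() fails):

    /* Condensed lifecycle sketch; not part of the patch. */
    static int percpu_spare_init(raid5_conf_t *conf)
    {
            unsigned long cpu;

            conf->percpu = alloc_percpu(struct raid5_percpu);
            if (!conf->percpu)
                    return -ENOMEM;

            get_online_cpus();      /* hold off cpu hotplug while populating */
            for_each_present_cpu(cpu)
                    per_cpu_ptr(conf->percpu, cpu)->spare_page =
                            alloc_page(GFP_KERNEL);
            /* register inside the same bracket so a cpu that comes up later
             * is caught by CPU_UP_PREPARE instead of slipping through */
            conf->cpu_notify.notifier_call = raid456_cpu_notify;
            register_cpu_notifier(&conf->cpu_notify);
            put_online_cpus();

            return 0;
    }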