aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Dan Williams <dan.j.williams@intel.com> 2009-07-14 14:48:22 -0400
committer: Dan Williams <dan.j.williams@intel.com> 2009-08-29 22:09:26 -0400
commit: 36d1c6476be51101778882897b315bd928c8c7b5 (patch)
tree: 55b4ecd93ce9c22722c9c9da0dd28a2d2f7c082d
parent: a11034b4282515fd7d9f6fdc0a1380781da461c3 (diff)
md/raid6: move the spare page to a percpu allocation
In preparation for asynchronous handling of raid6 operations, move the spare page to a percpu allocation to allow multiple simultaneous synchronous raid6 recovery operations. Make this allocation cpu hotplug aware to maximize allocation efficiency.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r-- drivers/md/raid5.c 252
-rw-r--r-- drivers/md/raid5.h 9
2 files changed, 175 insertions, 86 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9411466f71de..5359236a1ec7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -48,6 +48,7 @@
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/seq_file.h> 50#include <linux/seq_file.h>
51#include <linux/cpu.h>
51#include "md.h" 52#include "md.h"
52#include "raid5.h" 53#include "raid5.h"
53#include "bitmap.h" 54#include "bitmap.h"
@@ -2565,14 +2566,15 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2565 2566
2566 2567
2567static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2568static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2568 struct stripe_head_state *s, 2569 struct stripe_head_state *s,
2569 struct r6_state *r6s, struct page *tmp_page, 2570 struct r6_state *r6s, int disks)
2570 int disks)
2571{ 2571{
2572 int update_p = 0, update_q = 0; 2572 int update_p = 0, update_q = 0;
2573 struct r5dev *dev; 2573 struct r5dev *dev;
2574 int pd_idx = sh->pd_idx; 2574 int pd_idx = sh->pd_idx;
2575 int qd_idx = sh->qd_idx; 2575 int qd_idx = sh->qd_idx;
2576 unsigned long cpu;
2577 struct page *tmp_page;
2576 2578
2577 set_bit(STRIPE_HANDLE, &sh->state); 2579 set_bit(STRIPE_HANDLE, &sh->state);
2578 2580
@@ -2583,78 +2585,75 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2583 * case we can only check one of them, possibly using the 2585 * case we can only check one of them, possibly using the
2584 * other to generate missing data 2586 * other to generate missing data
2585 */ 2587 */
2586 2588 cpu = get_cpu();
2587 /* If !tmp_page, we cannot do the calculations, 2589 tmp_page = per_cpu_ptr(conf->percpu, cpu)->spare_page;
2588 * but as we have set STRIPE_HANDLE, we will soon be called 2590 if (s->failed == r6s->q_failed) {
2589 * by stripe_handle with a tmp_page - just wait until then. 2591 /* The only possible failed device holds 'Q', so it
2590 */ 2592 * makes sense to check P (If anything else were failed,
2591 if (tmp_page) { 2593 * we would have used P to recreate it).
2592 if (s->failed == r6s->q_failed) { 2594 */
2593 /* The only possible failed device holds 'Q', so it 2595 compute_block_1(sh, pd_idx, 1);
2594 * makes sense to check P (If anything else were failed, 2596 if (!page_is_zero(sh->dev[pd_idx].page)) {
2595 * we would have used P to recreate it). 2597 compute_block_1(sh, pd_idx, 0);
2596 */ 2598 update_p = 1;
2597 compute_block_1(sh, pd_idx, 1);
2598 if (!page_is_zero(sh->dev[pd_idx].page)) {
2599 compute_block_1(sh, pd_idx, 0);
2600 update_p = 1;
2601 }
2602 }
2603 if (!r6s->q_failed && s->failed < 2) {
2604 /* q is not failed, and we didn't use it to generate
2605 * anything, so it makes sense to check it
2606 */
2607 memcpy(page_address(tmp_page),
2608 page_address(sh->dev[qd_idx].page),
2609 STRIPE_SIZE);
2610 compute_parity6(sh, UPDATE_PARITY);
2611 if (memcmp(page_address(tmp_page),
2612 page_address(sh->dev[qd_idx].page),
2613 STRIPE_SIZE) != 0) {
2614 clear_bit(STRIPE_INSYNC, &sh->state);
2615 update_q = 1;
2616 }
2617 } 2599 }
2618 if (update_p || update_q) { 2600 }
2619 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2601 if (!r6s->q_failed && s->failed < 2) {
2620 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2602 /* q is not failed, and we didn't use it to generate
2621 /* don't try to repair!! */ 2603 * anything, so it makes sense to check it
2622 update_p = update_q = 0; 2604 */
2605 memcpy(page_address(tmp_page),
2606 page_address(sh->dev[qd_idx].page),
2607 STRIPE_SIZE);
2608 compute_parity6(sh, UPDATE_PARITY);
2609 if (memcmp(page_address(tmp_page),
2610 page_address(sh->dev[qd_idx].page),
2611 STRIPE_SIZE) != 0) {
2612 clear_bit(STRIPE_INSYNC, &sh->state);
2613 update_q = 1;
2623 } 2614 }
2615 }
2616 put_cpu();
2624 2617
2625 /* now write out any block on a failed drive, 2618 if (update_p || update_q) {
2626 * or P or Q if they need it 2619 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2627 */ 2620 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2621 /* don't try to repair!! */
2622 update_p = update_q = 0;
2623 }
2628 2624
2629 if (s->failed == 2) { 2625 /* now write out any block on a failed drive,
2630 dev = &sh->dev[r6s->failed_num[1]]; 2626 * or P or Q if they need it
2631 s->locked++; 2627 */
2632 set_bit(R5_LOCKED, &dev->flags);
2633 set_bit(R5_Wantwrite, &dev->flags);
2634 }
2635 if (s->failed >= 1) {
2636 dev = &sh->dev[r6s->failed_num[0]];
2637 s->locked++;
2638 set_bit(R5_LOCKED, &dev->flags);
2639 set_bit(R5_Wantwrite, &dev->flags);
2640 }
2641 2628
2642 if (update_p) { 2629 if (s->failed == 2) {
2643 dev = &sh->dev[pd_idx]; 2630 dev = &sh->dev[r6s->failed_num[1]];
2644 s->locked++; 2631 s->locked++;
2645 set_bit(R5_LOCKED, &dev->flags); 2632 set_bit(R5_LOCKED, &dev->flags);
2646 set_bit(R5_Wantwrite, &dev->flags); 2633 set_bit(R5_Wantwrite, &dev->flags);
2647 } 2634 }
2648 if (update_q) { 2635 if (s->failed >= 1) {
2649 dev = &sh->dev[qd_idx]; 2636 dev = &sh->dev[r6s->failed_num[0]];
2650 s->locked++; 2637 s->locked++;
2651 set_bit(R5_LOCKED, &dev->flags); 2638 set_bit(R5_LOCKED, &dev->flags);
2652 set_bit(R5_Wantwrite, &dev->flags); 2639 set_bit(R5_Wantwrite, &dev->flags);
2653 } 2640 }
2654 clear_bit(STRIPE_DEGRADED, &sh->state);
2655 2641
2656 set_bit(STRIPE_INSYNC, &sh->state); 2642 if (update_p) {
2643 dev = &sh->dev[pd_idx];
2644 s->locked++;
2645 set_bit(R5_LOCKED, &dev->flags);
2646 set_bit(R5_Wantwrite, &dev->flags);
2647 }
2648 if (update_q) {
2649 dev = &sh->dev[qd_idx];
2650 s->locked++;
2651 set_bit(R5_LOCKED, &dev->flags);
2652 set_bit(R5_Wantwrite, &dev->flags);
2657 } 2653 }
2654 clear_bit(STRIPE_DEGRADED, &sh->state);
2655
2656 set_bit(STRIPE_INSYNC, &sh->state);
2658} 2657}
2659 2658
2660static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2659static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
@@ -3009,7 +3008,7 @@ static bool handle_stripe5(struct stripe_head *sh)
3009 return blocked_rdev == NULL; 3008 return blocked_rdev == NULL;
3010} 3009}
3011 3010
3012static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3011static bool handle_stripe6(struct stripe_head *sh)
3013{ 3012{
3014 raid5_conf_t *conf = sh->raid_conf; 3013 raid5_conf_t *conf = sh->raid_conf;
3015 int disks = sh->disks; 3014 int disks = sh->disks;
@@ -3164,7 +3163,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3164 * data is available 3163 * data is available
3165 */ 3164 */
3166 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 3165 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
3167 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 3166 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3168 3167
3169 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3168 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3170 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3169 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3247,16 +3246,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3247} 3246}
3248 3247
3249/* returns true if the stripe was handled */ 3248/* returns true if the stripe was handled */
3250static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3249static bool handle_stripe(struct stripe_head *sh)
3251{ 3250{
3252 if (sh->raid_conf->level == 6) 3251 if (sh->raid_conf->level == 6)
3253 return handle_stripe6(sh, tmp_page); 3252 return handle_stripe6(sh);
3254 else 3253 else
3255 return handle_stripe5(sh); 3254 return handle_stripe5(sh);
3256} 3255}
3257 3256
3258
3259
3260static void raid5_activate_delayed(raid5_conf_t *conf) 3257static void raid5_activate_delayed(raid5_conf_t *conf)
3261{ 3258{
3262 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3259 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -4047,7 +4044,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4047 spin_unlock(&sh->lock); 4044 spin_unlock(&sh->lock);
4048 4045
4049 /* wait for any blocked device to be handled */ 4046 /* wait for any blocked device to be handled */
4050 while(unlikely(!handle_stripe(sh, NULL))) 4047 while (unlikely(!handle_stripe(sh)))
4051 ; 4048 ;
4052 release_stripe(sh); 4049 release_stripe(sh);
4053 4050
@@ -4104,7 +4101,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4104 return handled; 4101 return handled;
4105 } 4102 }
4106 4103
4107 handle_stripe(sh, NULL); 4104 handle_stripe(sh);
4108 release_stripe(sh); 4105 release_stripe(sh);
4109 handled++; 4106 handled++;
4110 } 4107 }
@@ -4168,7 +4165,7 @@ static void raid5d(mddev_t *mddev)
4168 spin_unlock_irq(&conf->device_lock); 4165 spin_unlock_irq(&conf->device_lock);
4169 4166
4170 handled++; 4167 handled++;
4171 handle_stripe(sh, conf->spare_page); 4168 handle_stripe(sh);
4172 release_stripe(sh); 4169 release_stripe(sh);
4173 4170
4174 spin_lock_irq(&conf->device_lock); 4171 spin_lock_irq(&conf->device_lock);
@@ -4309,15 +4306,104 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4309 return sectors * (raid_disks - conf->max_degraded); 4306 return sectors * (raid_disks - conf->max_degraded);
4310} 4307}
4311 4308
4309static void raid5_free_percpu(raid5_conf_t *conf)
4310{
4311 struct raid5_percpu *percpu;
4312 unsigned long cpu;
4313
4314 if (!conf->percpu)
4315 return;
4316
4317 get_online_cpus();
4318 for_each_possible_cpu(cpu) {
4319 percpu = per_cpu_ptr(conf->percpu, cpu);
4320 safe_put_page(percpu->spare_page);
4321 }
4322#ifdef CONFIG_HOTPLUG_CPU
4323 unregister_cpu_notifier(&conf->cpu_notify);
4324#endif
4325 put_online_cpus();
4326
4327 free_percpu(conf->percpu);
4328}
4329
4312static void free_conf(raid5_conf_t *conf) 4330static void free_conf(raid5_conf_t *conf)
4313{ 4331{
4314 shrink_stripes(conf); 4332 shrink_stripes(conf);
4315 safe_put_page(conf->spare_page); 4333 raid5_free_percpu(conf);
4316 kfree(conf->disks); 4334 kfree(conf->disks);
4317 kfree(conf->stripe_hashtbl); 4335 kfree(conf->stripe_hashtbl);
4318 kfree(conf); 4336 kfree(conf);
4319} 4337}
4320 4338
4339#ifdef CONFIG_HOTPLUG_CPU
4340static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4341 void *hcpu)
4342{
4343 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4344 long cpu = (long)hcpu;
4345 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4346
4347 switch (action) {
4348 case CPU_UP_PREPARE:
4349 case CPU_UP_PREPARE_FROZEN:
4350 if (!percpu->spare_page)
4351 percpu->spare_page = alloc_page(GFP_KERNEL);
4352 if (!percpu->spare_page) {
4353 pr_err("%s: failed memory allocation for cpu%ld\n",
4354 __func__, cpu);
4355 return NOTIFY_BAD;
4356 }
4357 break;
4358 case CPU_DEAD:
4359 case CPU_DEAD_FROZEN:
4360 safe_put_page(percpu->spare_page);
4361 percpu->spare_page = NULL;
4362 break;
4363 default:
4364 break;
4365 }
4366 return NOTIFY_OK;
4367}
4368#endif
4369
4370static int raid5_alloc_percpu(raid5_conf_t *conf)
4371{
4372 unsigned long cpu;
4373 struct page *spare_page;
4374 struct raid5_percpu *allcpus;
4375 int err;
4376
4377 /* the only percpu data is the raid6 spare page */
4378 if (conf->level != 6)
4379 return 0;
4380
4381 allcpus = alloc_percpu(struct raid5_percpu);
4382 if (!allcpus)
4383 return -ENOMEM;
4384 conf->percpu = allcpus;
4385
4386 get_online_cpus();
4387 err = 0;
4388 for_each_present_cpu(cpu) {
4389 spare_page = alloc_page(GFP_KERNEL);
4390 if (!spare_page) {
4391 err = -ENOMEM;
4392 break;
4393 }
4394 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4395 }
4396#ifdef CONFIG_HOTPLUG_CPU
4397 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4398 conf->cpu_notify.priority = 0;
4399 if (err == 0)
4400 err = register_cpu_notifier(&conf->cpu_notify);
4401#endif
4402 put_online_cpus();
4403
4404 return err;
4405}
4406
4321static raid5_conf_t *setup_conf(mddev_t *mddev) 4407static raid5_conf_t *setup_conf(mddev_t *mddev)
4322{ 4408{
4323 raid5_conf_t *conf; 4409 raid5_conf_t *conf;
@@ -4372,11 +4458,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4372 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4458 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4373 goto abort; 4459 goto abort;
4374 4460
4375 if (mddev->new_level == 6) { 4461 conf->level = mddev->new_level;
4376 conf->spare_page = alloc_page(GFP_KERNEL); 4462 if (raid5_alloc_percpu(conf) != 0)
4377 if (!conf->spare_page) 4463 goto abort;
4378 goto abort; 4464
4379 }
4380 spin_lock_init(&conf->device_lock); 4465 spin_lock_init(&conf->device_lock);
4381 init_waitqueue_head(&conf->wait_for_stripe); 4466 init_waitqueue_head(&conf->wait_for_stripe);
4382 init_waitqueue_head(&conf->wait_for_overlap); 4467 init_waitqueue_head(&conf->wait_for_overlap);
@@ -4412,7 +4497,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4412 } 4497 }
4413 4498
4414 conf->chunk_size = mddev->new_chunk; 4499 conf->chunk_size = mddev->new_chunk;
4415 conf->level = mddev->new_level;
4416 if (conf->level == 6) 4500 if (conf->level == 6)
4417 conf->max_degraded = 2; 4501 conf->max_degraded = 2;
4418 else 4502 else
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 52ba99954dec..07a7a4102f05 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -383,8 +383,13 @@ struct raid5_private_data {
383 * (fresh device added). 383 * (fresh device added).
384 * Cleared when a sync completes. 384 * Cleared when a sync completes.
385 */ 385 */
386 386 /* per cpu variables */
387 struct page *spare_page; /* Used when checking P/Q in raid6 */ 387 struct raid5_percpu {
388 struct page *spare_page; /* Used when checking P/Q in raid6 */
389 } *percpu;
390#ifdef CONFIG_HOTPLUG_CPU
391 struct notifier_block cpu_notify;
392#endif
388 393
389 /* 394 /*
390 * Free stripes pool 395 * Free stripes pool