aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2006-03-27 04:18:07 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-27 11:45:01 -0500
commitad01c9e3752f4ba4f3d99c89b7370fa4983a25b5 (patch)
tree856868aa97332d6d15d4cad412e0ebe3576bb571
parentb55e6bfcd23cb2f7249095050c649f7aea813f9f (diff)
[PATCH] md: Allow stripes to be expanded in preparation for expanding an array
Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache data structure. This requires allocating new stripes in a new kmem_cache. If this succeeds, we copy cache pages over and release the old stripes and kmem_cache. We then allocate new pages. If that fails, we leave the stripe cache at its new size. It isn't worth the effort to shrink it back again. Unfortunately this means we need two kmem_cache names as, for a short period of time, we have two kmem_caches. So they are raid5/%s and raid5/%s-alt. Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/md/md.c2
-rw-r--r--drivers/md/raid5.c131
-rw-r--r--drivers/md/raid6main.c4
-rw-r--r--include/linux/raid/raid5.h9
4 files changed, 137 insertions, 9 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a3ecaf8ed30a..c7b7656f9aa5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2775,7 +2775,6 @@ static void autorun_array(mddev_t *mddev)
2775 */ 2775 */
2776static void autorun_devices(int part) 2776static void autorun_devices(int part)
2777{ 2777{
2778 struct list_head candidates;
2779 struct list_head *tmp; 2778 struct list_head *tmp;
2780 mdk_rdev_t *rdev0, *rdev; 2779 mdk_rdev_t *rdev0, *rdev;
2781 mddev_t *mddev; 2780 mddev_t *mddev;
@@ -2784,6 +2783,7 @@ static void autorun_devices(int part)
2784 printk(KERN_INFO "md: autorun ...\n"); 2783 printk(KERN_INFO "md: autorun ...\n");
2785 while (!list_empty(&pending_raid_disks)) { 2784 while (!list_empty(&pending_raid_disks)) {
2786 dev_t dev; 2785 dev_t dev;
2786 LIST_HEAD(candidates);
2787 rdev0 = list_entry(pending_raid_disks.next, 2787 rdev0 = list_entry(pending_raid_disks.next,
2788 mdk_rdev_t, same_set); 2788 mdk_rdev_t, same_set);
2789 2789
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 03f31379cebb..6c20b44509d8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -313,20 +313,143 @@ static int grow_stripes(raid5_conf_t *conf, int num)
313 kmem_cache_t *sc; 313 kmem_cache_t *sc;
314 int devs = conf->raid_disks; 314 int devs = conf->raid_disks;
315 315
316 sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); 316 sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
317 317 sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
318 sc = kmem_cache_create(conf->cache_name, 318 conf->active_name = 0;
319 sc = kmem_cache_create(conf->cache_name[conf->active_name],
319 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 320 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
320 0, 0, NULL, NULL); 321 0, 0, NULL, NULL);
321 if (!sc) 322 if (!sc)
322 return 1; 323 return 1;
323 conf->slab_cache = sc; 324 conf->slab_cache = sc;
325 conf->pool_size = devs;
324 while (num--) { 326 while (num--) {
325 if (!grow_one_stripe(conf)) 327 if (!grow_one_stripe(conf))
326 return 1; 328 return 1;
327 } 329 }
328 return 0; 330 return 0;
329} 331}
332static int resize_stripes(raid5_conf_t *conf, int newsize)
333{
334 /* Make all the stripes able to hold 'newsize' devices.
335 * New slots in each stripe get 'page' set to a new page.
336 *
337 * This happens in stages:
338 * 1/ create a new kmem_cache and allocate the required number of
339 * stripe_heads.
340	 * 2/ gather all the old stripe_heads and transfer the pages across
341 * to the new stripe_heads. This will have the side effect of
342 * freezing the array as once all stripe_heads have been collected,
343 * no IO will be possible. Old stripe heads are freed once their
344 * pages have been transferred over, and the old kmem_cache is
345 * freed when all stripes are done.
346	 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
347	 * we simply return a failure status - no need to clean anything up.
348 * 4/ allocate new pages for the new slots in the new stripe_heads.
349	 * If this fails, we don't bother trying to shrink the
350 * stripe_heads down again, we just leave them as they are.
351 * As each stripe_head is processed the new one is released into
352 * active service.
353 *
354 * Once step2 is started, we cannot afford to wait for a write,
355 * so we use GFP_NOIO allocations.
356 */
357 struct stripe_head *osh, *nsh;
358 LIST_HEAD(newstripes);
359 struct disk_info *ndisks;
360 int err = 0;
361 kmem_cache_t *sc;
362 int i;
363
364 if (newsize <= conf->pool_size)
365 return 0; /* never bother to shrink */
366
367 /* Step 1 */
368 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
369 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
370 0, 0, NULL, NULL);
371 if (!sc)
372 return -ENOMEM;
373
374 for (i = conf->max_nr_stripes; i; i--) {
375 nsh = kmem_cache_alloc(sc, GFP_KERNEL);
376 if (!nsh)
377 break;
378
379 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
380
381 nsh->raid_conf = conf;
382 spin_lock_init(&nsh->lock);
383
384 list_add(&nsh->lru, &newstripes);
385 }
386 if (i) {
387 /* didn't get enough, give up */
388 while (!list_empty(&newstripes)) {
389 nsh = list_entry(newstripes.next, struct stripe_head, lru);
390 list_del(&nsh->lru);
391 kmem_cache_free(sc, nsh);
392 }
393 kmem_cache_destroy(sc);
394 return -ENOMEM;
395 }
396 /* Step 2 - Must use GFP_NOIO now.
397 * OK, we have enough stripes, start collecting inactive
398 * stripes and copying them over
399 */
400 list_for_each_entry(nsh, &newstripes, lru) {
401 spin_lock_irq(&conf->device_lock);
402 wait_event_lock_irq(conf->wait_for_stripe,
403 !list_empty(&conf->inactive_list),
404 conf->device_lock,
405 unplug_slaves(conf->mddev);
406 );
407 osh = get_free_stripe(conf);
408 spin_unlock_irq(&conf->device_lock);
409 atomic_set(&nsh->count, 1);
410 for(i=0; i<conf->pool_size; i++)
411 nsh->dev[i].page = osh->dev[i].page;
412 for( ; i<newsize; i++)
413 nsh->dev[i].page = NULL;
414 kmem_cache_free(conf->slab_cache, osh);
415 }
416 kmem_cache_destroy(conf->slab_cache);
417
418 /* Step 3.
419 * At this point, we are holding all the stripes so the array
420 * is completely stalled, so now is a good time to resize
421 * conf->disks.
422 */
423 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
424 if (ndisks) {
425 for (i=0; i<conf->raid_disks; i++)
426 ndisks[i] = conf->disks[i];
427 kfree(conf->disks);
428 conf->disks = ndisks;
429 } else
430 err = -ENOMEM;
431
432 /* Step 4, return new stripes to service */
433 while(!list_empty(&newstripes)) {
434 nsh = list_entry(newstripes.next, struct stripe_head, lru);
435 list_del_init(&nsh->lru);
436 for (i=conf->raid_disks; i < newsize; i++)
437 if (nsh->dev[i].page == NULL) {
438 struct page *p = alloc_page(GFP_NOIO);
439 nsh->dev[i].page = p;
440 if (!p)
441 err = -ENOMEM;
442 }
443 release_stripe(nsh);
444 }
445	 /* critical section passed, GFP_NOIO no longer needed */
446
447 conf->slab_cache = sc;
448 conf->active_name = 1-conf->active_name;
449 conf->pool_size = newsize;
450 return err;
451}
452
330 453
331static int drop_one_stripe(raid5_conf_t *conf) 454static int drop_one_stripe(raid5_conf_t *conf)
332{ 455{
@@ -339,7 +462,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
339 return 0; 462 return 0;
340 if (atomic_read(&sh->count)) 463 if (atomic_read(&sh->count))
341 BUG(); 464 BUG();
342 shrink_buffers(sh, conf->raid_disks); 465 shrink_buffers(sh, conf->pool_size);
343 kmem_cache_free(conf->slab_cache, sh); 466 kmem_cache_free(conf->slab_cache, sh);
344 atomic_dec(&conf->active_stripes); 467 atomic_dec(&conf->active_stripes);
345 return 1; 468 return 1;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index c7632f6cc487..6df4930fddec 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num)
331 kmem_cache_t *sc; 331 kmem_cache_t *sc;
332 int devs = conf->raid_disks; 332 int devs = conf->raid_disks;
333 333
334 sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev)); 334 sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
335 335
336 sc = kmem_cache_create(conf->cache_name, 336 sc = kmem_cache_create(conf->cache_name[0],
337 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 337 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
338 0, 0, NULL, NULL); 338 0, 0, NULL, NULL);
339 if (!sc) 339 if (!sc)
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 94dbdd406f12..b7b2653af7bb 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -216,7 +216,11 @@ struct raid5_private_data {
216 struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ 216 struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
217 atomic_t preread_active_stripes; /* stripes with scheduled io */ 217 atomic_t preread_active_stripes; /* stripes with scheduled io */
218 218
219 char cache_name[20]; 219 /* unfortunately we need two cache names as we temporarily have
220 * two caches.
221 */
222 int active_name;
223 char cache_name[2][20];
220 kmem_cache_t *slab_cache; /* for allocating stripes */ 224 kmem_cache_t *slab_cache; /* for allocating stripes */
221 225
222 int seq_flush, seq_write; 226 int seq_flush, seq_write;
@@ -238,7 +242,8 @@ struct raid5_private_data {
238 wait_queue_head_t wait_for_overlap; 242 wait_queue_head_t wait_for_overlap;
239 int inactive_blocked; /* release of inactive stripes blocked, 243 int inactive_blocked; /* release of inactive stripes blocked,
240 * waiting for 25% to be free 244 * waiting for 25% to be free
241 */ 245 */
246 int pool_size; /* number of disks in stripeheads in pool */
242 spinlock_t device_lock; 247 spinlock_t device_lock;
243 struct disk_info *disks; 248 struct disk_info *disks;
244}; 249};