diff options
author | NeilBrown <neilb@cse.unsw.edu.au> | 2005-09-09 19:23:45 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-09-09 19:39:10 -0400 |
commit | 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 (patch) | |
tree | 90862c8fa9f04cf98423b3da1b2c5d1f01a7310d | |
parent | 36fa30636fb84b209210299684e1be66d9e58217 (diff) |
[PATCH] md: support write-mostly device in raid1
This allows a device in a raid1 to be marked as "write mostly". Read requests
will only be sent to such a device if there is no other option.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | drivers/md/md.c | 18 | ||||
-rw-r--r-- | drivers/md/raid1.c | 76 | ||||
-rw-r--r-- | include/linux/raid/md_k.h | 3 | ||||
-rw-r--r-- | include/linux/raid/md_p.h | 11 |
4 files changed, 82 insertions, 26 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index ae654466dc23..f1ac356e656d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
670 | 670 | ||
671 | if (mddev->level != LEVEL_MULTIPATH) { | 671 | if (mddev->level != LEVEL_MULTIPATH) { |
672 | rdev->faulty = 0; | 672 | rdev->faulty = 0; |
673 | rdev->flags = 0; | ||
673 | desc = sb->disks + rdev->desc_nr; | 674 | desc = sb->disks + rdev->desc_nr; |
674 | 675 | ||
675 | if (desc->state & (1<<MD_DISK_FAULTY)) | 676 | if (desc->state & (1<<MD_DISK_FAULTY)) |
@@ -679,6 +680,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
679 | rdev->in_sync = 1; | 680 | rdev->in_sync = 1; |
680 | rdev->raid_disk = desc->raid_disk; | 681 | rdev->raid_disk = desc->raid_disk; |
681 | } | 682 | } |
683 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) | ||
684 | set_bit(WriteMostly, &rdev->flags); | ||
682 | } else /* MULTIPATH are always insync */ | 685 | } else /* MULTIPATH are always insync */ |
683 | rdev->in_sync = 1; | 686 | rdev->in_sync = 1; |
684 | return 0; | 687 | return 0; |
@@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
777 | spare++; | 780 | spare++; |
778 | working++; | 781 | working++; |
779 | } | 782 | } |
783 | if (test_bit(WriteMostly, &rdev2->flags)) | ||
784 | d->state |= (1<<MD_DISK_WRITEMOSTLY); | ||
780 | } | 785 | } |
781 | 786 | ||
782 | /* now set the "removed" and "faulty" bits on any missing devices */ | 787 | /* now set the "removed" and "faulty" bits on any missing devices */ |
@@ -990,6 +995,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
990 | rdev->raid_disk = role; | 995 | rdev->raid_disk = role; |
991 | break; | 996 | break; |
992 | } | 997 | } |
998 | rdev->flags = 0; | ||
999 | if (sb->devflags & WriteMostly1) | ||
1000 | set_bit(WriteMostly, &rdev->flags); | ||
993 | } else /* MULTIPATH are always insync */ | 1001 | } else /* MULTIPATH are always insync */ |
994 | rdev->in_sync = 1; | 1002 | rdev->in_sync = 1; |
995 | 1003 | ||
@@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) | |||
2152 | info.state |= (1<<MD_DISK_ACTIVE); | 2160 | info.state |= (1<<MD_DISK_ACTIVE); |
2153 | info.state |= (1<<MD_DISK_SYNC); | 2161 | info.state |= (1<<MD_DISK_SYNC); |
2154 | } | 2162 | } |
2163 | if (test_bit(WriteMostly, &rdev->flags)) | ||
2164 | info.state |= (1<<MD_DISK_WRITEMOSTLY); | ||
2155 | } else { | 2165 | } else { |
2156 | info.major = info.minor = 0; | 2166 | info.major = info.minor = 0; |
2157 | info.raid_disk = -1; | 2167 | info.raid_disk = -1; |
@@ -2237,6 +2247,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
2237 | rdev->saved_raid_disk = rdev->raid_disk; | 2247 | rdev->saved_raid_disk = rdev->raid_disk; |
2238 | 2248 | ||
2239 | rdev->in_sync = 0; /* just to be sure */ | 2249 | rdev->in_sync = 0; /* just to be sure */ |
2250 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | ||
2251 | set_bit(WriteMostly, &rdev->flags); | ||
2252 | |||
2240 | rdev->raid_disk = -1; | 2253 | rdev->raid_disk = -1; |
2241 | err = bind_rdev_to_array(rdev, mddev); | 2254 | err = bind_rdev_to_array(rdev, mddev); |
2242 | if (err) | 2255 | if (err) |
@@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
2277 | else | 2290 | else |
2278 | rdev->in_sync = 0; | 2291 | rdev->in_sync = 0; |
2279 | 2292 | ||
2293 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | ||
2294 | set_bit(WriteMostly, &rdev->flags); | ||
2295 | |||
2280 | err = bind_rdev_to_array(rdev, mddev); | 2296 | err = bind_rdev_to_array(rdev, mddev); |
2281 | if (err) { | 2297 | if (err) { |
2282 | export_rdev(rdev); | 2298 | export_rdev(rdev); |
@@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
3329 | char b[BDEVNAME_SIZE]; | 3345 | char b[BDEVNAME_SIZE]; |
3330 | seq_printf(seq, " %s[%d]", | 3346 | seq_printf(seq, " %s[%d]", |
3331 | bdevname(rdev->bdev,b), rdev->desc_nr); | 3347 | bdevname(rdev->bdev,b), rdev->desc_nr); |
3348 | if (test_bit(WriteMostly, &rdev->flags)) | ||
3349 | seq_printf(seq, "(W)"); | ||
3332 | if (rdev->faulty) { | 3350 | if (rdev->faulty) { |
3333 | seq_printf(seq, "(F)"); | 3351 | seq_printf(seq, "(F)"); |
3334 | continue; | 3352 | continue; |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ba643e4bfac9..28839a8193f2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
360 | { | 360 | { |
361 | const unsigned long this_sector = r1_bio->sector; | 361 | const unsigned long this_sector = r1_bio->sector; |
362 | int new_disk = conf->last_used, disk = new_disk; | 362 | int new_disk = conf->last_used, disk = new_disk; |
363 | int wonly_disk = -1; | ||
363 | const int sectors = r1_bio->sectors; | 364 | const int sectors = r1_bio->sectors; |
364 | sector_t new_distance, current_distance; | 365 | sector_t new_distance, current_distance; |
365 | mdk_rdev_t *new_rdev, *rdev; | 366 | mdk_rdev_t *rdev; |
366 | 367 | ||
367 | rcu_read_lock(); | 368 | rcu_read_lock(); |
368 | /* | 369 | /* |
369 | * Check if it if we can balance. We can balance on the whole | 370 | * Check if we can balance. We can balance on the whole |
370 | * device if no resync is going on, or below the resync window. | 371 | * device if no resync is going on, or below the resync window. |
371 | * We take the first readable disk when above the resync window. | 372 | * We take the first readable disk when above the resync window. |
372 | */ | 373 | */ |
@@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
376 | /* Choose the first operation device, for consistancy */ | 377 | /* Choose the first operation device, for consistancy */ |
377 | new_disk = 0; | 378 | new_disk = 0; |
378 | 379 | ||
379 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | 380 | for (rdev = conf->mirrors[new_disk].rdev; |
380 | !new_rdev->in_sync) { | 381 | !rdev || !rdev->in_sync |
381 | new_disk++; | 382 | || test_bit(WriteMostly, &rdev->flags); |
382 | if (new_disk == conf->raid_disks) { | 383 | rdev = conf->mirrors[++new_disk].rdev) { |
383 | new_disk = -1; | 384 | |
385 | if (rdev && rdev->in_sync) | ||
386 | wonly_disk = new_disk; | ||
387 | |||
388 | if (new_disk == conf->raid_disks - 1) { | ||
389 | new_disk = wonly_disk; | ||
384 | break; | 390 | break; |
385 | } | 391 | } |
386 | } | 392 | } |
@@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
389 | 395 | ||
390 | 396 | ||
391 | /* make sure the disk is operational */ | 397 | /* make sure the disk is operational */ |
392 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | 398 | for (rdev = conf->mirrors[new_disk].rdev; |
393 | !new_rdev->in_sync) { | 399 | !rdev || !rdev->in_sync || |
400 | test_bit(WriteMostly, &rdev->flags); | ||
401 | rdev = conf->mirrors[new_disk].rdev) { | ||
402 | |||
403 | if (rdev && rdev->in_sync) | ||
404 | wonly_disk = new_disk; | ||
405 | |||
394 | if (new_disk <= 0) | 406 | if (new_disk <= 0) |
395 | new_disk = conf->raid_disks; | 407 | new_disk = conf->raid_disks; |
396 | new_disk--; | 408 | new_disk--; |
397 | if (new_disk == disk) { | 409 | if (new_disk == disk) { |
398 | new_disk = -1; | 410 | new_disk = wonly_disk; |
399 | goto rb_out; | 411 | break; |
400 | } | 412 | } |
401 | } | 413 | } |
414 | |||
415 | if (new_disk < 0) | ||
416 | goto rb_out; | ||
417 | |||
402 | disk = new_disk; | 418 | disk = new_disk; |
403 | /* now disk == new_disk == starting point for search */ | 419 | /* now disk == new_disk == starting point for search */ |
404 | 420 | ||
@@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
419 | disk = conf->raid_disks; | 435 | disk = conf->raid_disks; |
420 | disk--; | 436 | disk--; |
421 | 437 | ||
422 | if ((rdev=conf->mirrors[disk].rdev) == NULL || | 438 | rdev = conf->mirrors[disk].rdev; |
423 | !rdev->in_sync) | 439 | |
440 | if (!rdev || | ||
441 | !rdev->in_sync || | ||
442 | test_bit(WriteMostly, &rdev->flags)) | ||
424 | continue; | 443 | continue; |
425 | 444 | ||
426 | if (!atomic_read(&rdev->nr_pending)) { | 445 | if (!atomic_read(&rdev->nr_pending)) { |
427 | new_disk = disk; | 446 | new_disk = disk; |
428 | new_rdev = rdev; | ||
429 | break; | 447 | break; |
430 | } | 448 | } |
431 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | 449 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); |
432 | if (new_distance < current_distance) { | 450 | if (new_distance < current_distance) { |
433 | current_distance = new_distance; | 451 | current_distance = new_distance; |
434 | new_disk = disk; | 452 | new_disk = disk; |
435 | new_rdev = rdev; | ||
436 | } | 453 | } |
437 | } while (disk != conf->last_used); | 454 | } while (disk != conf->last_used); |
438 | 455 | ||
439 | rb_out: | 456 | rb_out: |
440 | 457 | ||
441 | 458 | ||
442 | if (new_disk >= 0) { | 459 | if (new_disk >= 0) { |
443 | conf->next_seq_sect = this_sector + sectors; | 460 | rdev = conf->mirrors[new_disk].rdev; |
444 | conf->last_used = new_disk; | 461 | if (!rdev) |
445 | atomic_inc(&new_rdev->nr_pending); | 462 | goto retry; |
446 | if (!new_rdev->in_sync) { | 463 | atomic_inc(&rdev->nr_pending); |
464 | if (!rdev->in_sync) { | ||
447 | /* cannot risk returning a device that failed | 465 | /* cannot risk returning a device that failed |
448 | * before we inc'ed nr_pending | 466 | * before we inc'ed nr_pending |
449 | */ | 467 | */ |
450 | atomic_dec(&new_rdev->nr_pending); | 468 | atomic_dec(&rdev->nr_pending); |
451 | goto retry; | 469 | goto retry; |
452 | } | 470 | } |
471 | conf->next_seq_sect = this_sector + sectors; | ||
472 | conf->last_used = new_disk; | ||
453 | } | 473 | } |
454 | rcu_read_unlock(); | 474 | rcu_read_unlock(); |
455 | 475 | ||
@@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1109 | sector_t max_sector, nr_sectors; | 1129 | sector_t max_sector, nr_sectors; |
1110 | int disk; | 1130 | int disk; |
1111 | int i; | 1131 | int i; |
1132 | int wonly; | ||
1112 | int write_targets = 0; | 1133 | int write_targets = 0; |
1113 | int sync_blocks; | 1134 | int sync_blocks; |
1114 | int still_degraded = 0; | 1135 | int still_degraded = 0; |
@@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1164 | */ | 1185 | */ |
1165 | disk = conf->last_used; | 1186 | disk = conf->last_used; |
1166 | /* make sure disk is operational */ | 1187 | /* make sure disk is operational */ |
1167 | 1188 | wonly = disk; | |
1168 | while (conf->mirrors[disk].rdev == NULL || | 1189 | while (conf->mirrors[disk].rdev == NULL || |
1169 | !conf->mirrors[disk].rdev->in_sync) { | 1190 | !conf->mirrors[disk].rdev->in_sync || |
1191 | test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) | ||
1192 | ) { | ||
1193 | if (conf->mirrors[disk].rdev && | ||
1194 | conf->mirrors[disk].rdev->in_sync) | ||
1195 | wonly = disk; | ||
1170 | if (disk <= 0) | 1196 | if (disk <= 0) |
1171 | disk = conf->raid_disks; | 1197 | disk = conf->raid_disks; |
1172 | disk--; | 1198 | disk--; |
1173 | if (disk == conf->last_used) | 1199 | if (disk == conf->last_used) { |
1200 | disk = wonly; | ||
1174 | break; | 1201 | break; |
1202 | } | ||
1175 | } | 1203 | } |
1176 | conf->last_used = disk; | 1204 | conf->last_used = disk; |
1177 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | 1205 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); |
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 817062bf7352..7ef78e15ce04 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h | |||
@@ -181,6 +181,9 @@ struct mdk_rdev_s | |||
181 | int faulty; /* if faulty do not issue IO requests */ | 181 | int faulty; /* if faulty do not issue IO requests */ |
182 | int in_sync; /* device is a full member of the array */ | 182 | int in_sync; /* device is a full member of the array */ |
183 | 183 | ||
184 | unsigned long flags; /* Should include faulty and in_sync here. */ | ||
185 | #define WriteMostly 4 /* Avoid reading if at all possible */ | ||
186 | |||
184 | int desc_nr; /* descriptor index in the superblock */ | 187 | int desc_nr; /* descriptor index in the superblock */ |
185 | int raid_disk; /* role of device in array */ | 188 | int raid_disk; /* role of device in array */ |
186 | int saved_raid_disk; /* role that device used to have in the | 189 | int saved_raid_disk; /* role that device used to have in the |
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index dc65cd435494..4f047f84fb1f 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
@@ -79,6 +79,11 @@ | |||
79 | #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ | 79 | #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ |
80 | #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ | 80 | #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ |
81 | 81 | ||
82 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. | ||
83 | * read requests will only be sent here in | ||
84 | * dire need | ||
85 | */ | ||
86 | |||
82 | typedef struct mdp_device_descriptor_s { | 87 | typedef struct mdp_device_descriptor_s { |
83 | __u32 number; /* 0 Device number in the entire set */ | 88 | __u32 number; /* 0 Device number in the entire set */ |
84 | __u32 major; /* 1 Device major number */ | 89 | __u32 major; /* 1 Device major number */ |
@@ -193,7 +198,7 @@ struct mdp_superblock_1 { | |||
193 | 198 | ||
194 | __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ | 199 | __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ |
195 | __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ | 200 | __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ |
196 | __u32 layout; /* only for raid5 currently */ | 201 | __u32 layout; /* only for raid5 and raid10 currently */ |
197 | __u64 size; /* used size of component devices, in 512byte sectors */ | 202 | __u64 size; /* used size of component devices, in 512byte sectors */ |
198 | 203 | ||
199 | __u32 chunksize; /* in 512byte sectors */ | 204 | __u32 chunksize; /* in 512byte sectors */ |
@@ -212,7 +217,9 @@ struct mdp_superblock_1 { | |||
212 | __u32 dev_number; /* permanent identifier of this device - not role in raid */ | 217 | __u32 dev_number; /* permanent identifier of this device - not role in raid */ |
213 | __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ | 218 | __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ |
214 | __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ | 219 | __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ |
215 | __u8 pad2[64-56]; /* set to 0 when writing */ | 220 | __u8 devflags; /* per-device flags. Only one defined...*/ |
221 | #define WriteMostly1 1 /* mask for writemostly flag in above */ | ||
222 | __u8 pad2[64-57]; /* set to 0 when writing */ | ||
216 | 223 | ||
217 | /* array state information - 64 bytes */ | 224 | /* array state information - 64 bytes */ |
218 | __u64 utime; /* 40 bits second, 24 btes microseconds */ | 225 | __u64 utime; /* 40 bits second, 24 btes microseconds */ |