diff options
-rw-r--r-- | drivers/md/raid5-cache.c | 143 | ||||
-rw-r--r-- | drivers/md/raid5.c | 45 | ||||
-rw-r--r-- | drivers/md/raid5.h | 31 |
3 files changed, 211 insertions, 8 deletions
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 33fc85015147..02a554434747 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
@@ -40,6 +40,47 @@ | |||
40 | */ | 40 | */ |
41 | #define R5L_POOL_SIZE 4 | 41 | #define R5L_POOL_SIZE 4 |
42 | 42 | ||
43 | /* | ||
44 | * r5c journal modes of the array: write-back or write-through. | ||
45 | * write-through mode behaves identically to the existing log-only | ||
46 | * implementation. | ||
47 | */ | ||
48 | enum r5c_journal_mode { | ||
49 | R5C_JOURNAL_MODE_WRITE_THROUGH = 0, | ||
50 | R5C_JOURNAL_MODE_WRITE_BACK = 1, | ||
51 | }; | ||
52 | |||
53 | /* | ||
54 | * raid5 cache state machine | ||
55 | * | ||
56 | * With the RAID cache, each stripe works in two phases: | ||
57 | * - caching phase | ||
58 | * - writing-out phase | ||
59 | * | ||
60 | * These two phases are controlled by bit STRIPE_R5C_CACHING: | ||
61 | * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase | ||
62 | * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase | ||
63 | * | ||
64 | * When there is no journal, or the journal is in write-through mode, | ||
65 | * the stripe is always in writing-out phase. | ||
66 | * | ||
67 | * For write-back journal, the stripe is sent to caching phase on write | ||
68 | * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off | ||
69 | * the write-out phase by clearing STRIPE_R5C_CACHING. | ||
70 | * | ||
71 | * Stripes in caching phase do not write the raid disks. Instead, all | ||
72 | * writes are committed from the log device. Therefore, a stripe in | ||
73 | * caching phase handles writes as: | ||
74 | * - write to log device | ||
75 | * - return IO | ||
76 | * | ||
77 | * Stripes in writing-out phase handle writes as: | ||
78 | * - calculate parity | ||
79 | * - write pending data and parity to journal | ||
80 | * - write data and parity to raid disks | ||
81 | * - return IO for pending writes | ||
82 | */ | ||
83 | |||
43 | struct r5l_log { | 84 | struct r5l_log { |
44 | struct md_rdev *rdev; | 85 | struct md_rdev *rdev; |
45 | 86 | ||
@@ -96,6 +137,9 @@ struct r5l_log { | |||
96 | spinlock_t no_space_stripes_lock; | 137 | spinlock_t no_space_stripes_lock; |
97 | 138 | ||
98 | bool need_cache_flush; | 139 | bool need_cache_flush; |
140 | |||
141 | /* for r5c_cache */ | ||
142 | enum r5c_journal_mode r5c_journal_mode; | ||
99 | }; | 143 | }; |
100 | 144 | ||
101 | /* | 145 | /* |
@@ -133,6 +177,12 @@ enum r5l_io_unit_state { | |||
133 | IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ | 177 | IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ |
134 | }; | 178 | }; |
135 | 179 | ||
180 | bool r5c_is_writeback(struct r5l_log *log) | ||
181 | { | ||
182 | return (log != NULL && | ||
183 | log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); | ||
184 | } | ||
185 | |||
136 | static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) | 186 | static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) |
137 | { | 187 | { |
138 | start += inc; | 188 | start += inc; |
@@ -168,12 +218,51 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io, | |||
168 | io->state = state; | 218 | io->state = state; |
169 | } | 219 | } |
170 | 220 | ||
221 | /* | ||
222 | * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. | ||
223 | * This function should only be called in write-back mode. | ||
224 | */ | ||
225 | static void r5c_make_stripe_write_out(struct stripe_head *sh) | ||
226 | { | ||
227 | struct r5conf *conf = sh->raid_conf; | ||
228 | struct r5l_log *log = conf->log; | ||
229 | |||
230 | BUG_ON(!r5c_is_writeback(log)); | ||
231 | |||
232 | WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
233 | clear_bit(STRIPE_R5C_CACHING, &sh->state); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Setting proper flags after writing (or flushing) data and/or parity to the | ||
238 | * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). | ||
239 | */ | ||
240 | static void r5c_finish_cache_stripe(struct stripe_head *sh) | ||
241 | { | ||
242 | struct r5l_log *log = sh->raid_conf->log; | ||
243 | |||
244 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { | ||
245 | BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
246 | /* | ||
247 | * Set R5_InJournal for parity dev[pd_idx]. This means | ||
248 | * all data AND parity are in the journal. For RAID 6, it is | ||
249 | * NOT necessary to set the flag for dev[qd_idx], as the | ||
250 | * two parities are written out together. | ||
251 | */ | ||
252 | set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
253 | } else | ||
254 | BUG(); /* write-back logic in next patch */ | ||
255 | } | ||
256 | |||
171 | static void r5l_io_run_stripes(struct r5l_io_unit *io) | 257 | static void r5l_io_run_stripes(struct r5l_io_unit *io) |
172 | { | 258 | { |
173 | struct stripe_head *sh, *next; | 259 | struct stripe_head *sh, *next; |
174 | 260 | ||
175 | list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { | 261 | list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { |
176 | list_del_init(&sh->log_list); | 262 | list_del_init(&sh->log_list); |
263 | |||
264 | r5c_finish_cache_stripe(sh); | ||
265 | |||
177 | set_bit(STRIPE_HANDLE, &sh->state); | 266 | set_bit(STRIPE_HANDLE, &sh->state); |
178 | raid5_release_stripe(sh); | 267 | raid5_release_stripe(sh); |
179 | } | 268 | } |
@@ -412,18 +501,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, | |||
412 | r5l_append_payload_page(log, sh->dev[i].page); | 501 | r5l_append_payload_page(log, sh->dev[i].page); |
413 | } | 502 | } |
414 | 503 | ||
415 | if (sh->qd_idx >= 0) { | 504 | if (parity_pages == 2) { |
416 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, | 505 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, |
417 | sh->sector, sh->dev[sh->pd_idx].log_checksum, | 506 | sh->sector, sh->dev[sh->pd_idx].log_checksum, |
418 | sh->dev[sh->qd_idx].log_checksum, true); | 507 | sh->dev[sh->qd_idx].log_checksum, true); |
419 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); | 508 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); |
420 | r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); | 509 | r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); |
421 | } else { | 510 | } else if (parity_pages == 1) { |
422 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, | 511 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, |
423 | sh->sector, sh->dev[sh->pd_idx].log_checksum, | 512 | sh->sector, sh->dev[sh->pd_idx].log_checksum, |
424 | 0, false); | 513 | 0, false); |
425 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); | 514 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); |
426 | } | 515 | } else /* Just writing data, not parity, in caching phase */ |
516 | BUG_ON(parity_pages != 0); | ||
427 | 517 | ||
428 | list_add_tail(&sh->log_list, &io->stripe_list); | 518 | list_add_tail(&sh->log_list, &io->stripe_list); |
429 | atomic_inc(&io->pending_stripe); | 519 | atomic_inc(&io->pending_stripe); |
@@ -455,6 +545,8 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | |||
455 | return -EAGAIN; | 545 | return -EAGAIN; |
456 | } | 546 | } |
457 | 547 | ||
548 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
549 | |||
458 | for (i = 0; i < sh->disks; i++) { | 550 | for (i = 0; i < sh->disks; i++) { |
459 | void *addr; | 551 | void *addr; |
460 | 552 | ||
@@ -1112,6 +1204,49 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) | |||
1112 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1204 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1113 | } | 1205 | } |
1114 | 1206 | ||
1207 | /* | ||
1208 | * Try to handle write operation in caching phase. This function should only | ||
1209 | * be called in write-back mode. | ||
1210 | * | ||
1211 | * If all outstanding writes can be handled in caching phase, returns 0 | ||
1212 | * If writes require write-out phase, call r5c_make_stripe_write_out() | ||
1213 | * and returns -EAGAIN | ||
1214 | */ | ||
1215 | int r5c_try_caching_write(struct r5conf *conf, | ||
1216 | struct stripe_head *sh, | ||
1217 | struct stripe_head_state *s, | ||
1218 | int disks) | ||
1219 | { | ||
1220 | struct r5l_log *log = conf->log; | ||
1221 | |||
1222 | BUG_ON(!r5c_is_writeback(log)); | ||
1223 | |||
1224 | /* more write-back logic in next patches */ | ||
1225 | r5c_make_stripe_write_out(sh); | ||
1226 | return -EAGAIN; | ||
1227 | } | ||
1228 | |||
1229 | /* | ||
1230 | * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the | ||
1231 | * stripe is committed to RAID disks. | ||
1232 | */ | ||
1233 | void r5c_finish_stripe_write_out(struct r5conf *conf, | ||
1234 | struct stripe_head *sh, | ||
1235 | struct stripe_head_state *s) | ||
1236 | { | ||
1237 | if (!conf->log || | ||
1238 | !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) | ||
1239 | return; | ||
1240 | |||
1241 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
1242 | clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
1243 | |||
1244 | if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
1245 | return; | ||
1246 | BUG(); /* write-back logic in following patches */ | ||
1247 | } | ||
1248 | |||
1249 | |||
1115 | static int r5l_load_log(struct r5l_log *log) | 1250 | static int r5l_load_log(struct r5l_log *log) |
1116 | { | 1251 | { |
1117 | struct md_rdev *rdev = log->rdev; | 1252 | struct md_rdev *rdev = log->rdev; |
@@ -1249,6 +1384,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
1249 | INIT_LIST_HEAD(&log->no_space_stripes); | 1384 | INIT_LIST_HEAD(&log->no_space_stripes); |
1250 | spin_lock_init(&log->no_space_stripes_lock); | 1385 | spin_lock_init(&log->no_space_stripes_lock); |
1251 | 1386 | ||
1387 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | ||
1388 | |||
1252 | if (r5l_load_log(log)) | 1389 | if (r5l_load_log(log)) |
1253 | goto error; | 1390 | goto error; |
1254 | 1391 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 34895f3218d9..7c98eb06d1b2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
4107 | if (rdev && !test_bit(Faulty, &rdev->flags)) | 4107 | if (rdev && !test_bit(Faulty, &rdev->flags)) |
4108 | do_recovery = 1; | 4108 | do_recovery = 1; |
4109 | } | 4109 | } |
4110 | |||
4111 | if (test_bit(R5_InJournal, &dev->flags)) | ||
4112 | s->injournal++; | ||
4110 | } | 4113 | } |
4111 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | 4114 | if (test_bit(STRIPE_SYNCING, &sh->state)) { |
4112 | /* If there is a failed device being replaced, | 4115 | /* If there is a failed device being replaced, |
@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh) | |||
4386 | || s.expanding) | 4389 | || s.expanding) |
4387 | handle_stripe_fill(sh, &s, disks); | 4390 | handle_stripe_fill(sh, &s, disks); |
4388 | 4391 | ||
4389 | /* Now to consider new write requests and what else, if anything | 4392 | /* |
4390 | * should be read. We do not handle new writes when: | 4393 | * When the stripe finishes full journal write cycle (write to journal |
4394 | * and raid disk), this is the clean up procedure so it is ready for | ||
4395 | * next operation. | ||
4396 | */ | ||
4397 | r5c_finish_stripe_write_out(conf, sh, &s); | ||
4398 | |||
4399 | /* | ||
4400 | * Now to consider new write requests, cache write back and what else, | ||
4401 | * if anything should be read. We do not handle new writes when: | ||
4391 | * 1/ A 'write' operation (copy+xor) is already in flight. | 4402 | * 1/ A 'write' operation (copy+xor) is already in flight. |
4392 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 4403 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
4393 | * block. | 4404 | * block. |
4405 | * 3/ A r5c cache log write is in flight. | ||
4394 | */ | 4406 | */ |
4395 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | 4407 | |
4396 | handle_stripe_dirtying(conf, sh, &s, disks); | 4408 | if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { |
4409 | if (!r5c_is_writeback(conf->log)) { | ||
4410 | if (s.to_write) | ||
4411 | handle_stripe_dirtying(conf, sh, &s, disks); | ||
4412 | } else { /* write back cache */ | ||
4413 | int ret = 0; | ||
4414 | |||
4415 | /* First, try to handle writes in caching phase */ | ||
4416 | if (s.to_write) | ||
4417 | ret = r5c_try_caching_write(conf, sh, &s, | ||
4418 | disks); | ||
4419 | /* | ||
4420 | * If caching phase failed: ret == -EAGAIN | ||
4421 | * OR | ||
4422 | * stripe under reclaim: !caching && injournal | ||
4423 | * | ||
4424 | * fall back to handle_stripe_dirtying() | ||
4425 | */ | ||
4426 | if (ret == -EAGAIN || | ||
4427 | /* stripe under reclaim: !caching && injournal */ | ||
4428 | (!test_bit(STRIPE_R5C_CACHING, &sh->state) && | ||
4429 | s.injournal > 0)) | ||
4430 | handle_stripe_dirtying(conf, sh, &s, disks); | ||
4431 | } | ||
4432 | } | ||
4397 | 4433 | ||
4398 | /* maybe we need to check and possibly fix the parity for this stripe | 4434 | /* maybe we need to check and possibly fix the parity for this stripe |
4399 | * Any reads will already have been scheduled, so we just see if enough | 4435 | * Any reads will already have been scheduled, so we just see if enough |
@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
5110 | * data on failed drives. | 5146 | * data on failed drives. |
5111 | */ | 5147 | */ |
5112 | if (rw == READ && mddev->degraded == 0 && | 5148 | if (rw == READ && mddev->degraded == 0 && |
5149 | !r5c_is_writeback(conf->log) && | ||
5113 | mddev->reshape_position == MaxSector) { | 5150 | mddev->reshape_position == MaxSector) { |
5114 | bi = chunk_aligned_read(mddev, bi); | 5151 | bi = chunk_aligned_read(mddev, bi); |
5115 | if (!bi) | 5152 | if (!bi) |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ffc13c4d7e63..c9590a8e1425 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -264,6 +264,7 @@ struct stripe_head_state { | |||
264 | int syncing, expanding, expanded, replacing; | 264 | int syncing, expanding, expanded, replacing; |
265 | int locked, uptodate, to_read, to_write, failed, written; | 265 | int locked, uptodate, to_read, to_write, failed, written; |
266 | int to_fill, compute, req_compute, non_overwrite; | 266 | int to_fill, compute, req_compute, non_overwrite; |
267 | int injournal; | ||
267 | int failed_num[2]; | 268 | int failed_num[2]; |
268 | int p_failed, q_failed; | 269 | int p_failed, q_failed; |
269 | int dec_preread_active; | 270 | int dec_preread_active; |
@@ -313,6 +314,11 @@ enum r5dev_flags { | |||
313 | */ | 314 | */ |
314 | R5_Discard, /* Discard the stripe */ | 315 | R5_Discard, /* Discard the stripe */ |
315 | R5_SkipCopy, /* Don't copy data from bio to stripe cache */ | 316 | R5_SkipCopy, /* Don't copy data from bio to stripe cache */ |
317 | R5_InJournal, /* data being written is in the journal device. | ||
318 | * if R5_InJournal is set for parity pd_idx, all the | ||
319 | * data and parity being written are in the journal | ||
320 | * device | ||
321 | */ | ||
316 | }; | 322 | }; |
317 | 323 | ||
318 | /* | 324 | /* |
@@ -345,7 +351,23 @@ enum { | |||
345 | STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add | 351 | STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add |
346 | * to batch yet. | 352 | * to batch yet. |
347 | */ | 353 | */ |
348 | STRIPE_LOG_TRAPPED, /* trapped into log */ | 354 | STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) |
355 | * this bit is used in two scenarios: | ||
356 | * | ||
357 | * 1. write-out phase | ||
358 | * set in first entry of r5l_write_stripe | ||
359 | * clear in second entry of r5l_write_stripe | ||
360 | * used to bypass logic in handle_stripe | ||
361 | * | ||
362 | * 2. caching phase | ||
363 | * set in r5c_try_caching_write() | ||
364 | * clear when journal write is done | ||
365 | * used to initiate r5c_cache_data() | ||
366 | * also used to bypass logic in handle_stripe | ||
367 | */ | ||
368 | STRIPE_R5C_CACHING, /* the stripe is in caching phase | ||
369 | * see more details in raid5-cache.c | ||
370 | */ | ||
349 | }; | 371 | }; |
350 | 372 | ||
351 | #define STRIPE_EXPAND_SYNC_FLAGS \ | 373 | #define STRIPE_EXPAND_SYNC_FLAGS \ |
@@ -710,4 +732,11 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh); | |||
710 | extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); | 732 | extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); |
711 | extern void r5l_quiesce(struct r5l_log *log, int state); | 733 | extern void r5l_quiesce(struct r5l_log *log, int state); |
712 | extern bool r5l_log_disk_error(struct r5conf *conf); | 734 | extern bool r5l_log_disk_error(struct r5conf *conf); |
735 | extern bool r5c_is_writeback(struct r5l_log *log); | ||
736 | extern int | ||
737 | r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, | ||
738 | struct stripe_head_state *s, int disks); | ||
739 | extern void | ||
740 | r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, | ||
741 | struct stripe_head_state *s); | ||
713 | #endif | 742 | #endif |