aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/md/raid5-cache.c143
-rw-r--r--drivers/md/raid5.c45
-rw-r--r--drivers/md/raid5.h31
3 files changed, 211 insertions, 8 deletions
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 33fc85015147..02a554434747 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -40,6 +40,47 @@
40 */ 40 */
41#define R5L_POOL_SIZE 4 41#define R5L_POOL_SIZE 4
42 42
43/*
44 * r5c journal modes of the array: write-back or write-through.
45 * write-through mode behaves identically to the existing log-only
46 * implementation.
47 */
48enum r5c_journal_mode {
49 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
50 R5C_JOURNAL_MODE_WRITE_BACK = 1,
51};
52
53/*
54 * raid5 cache state machine
55 *
56 * With the RAID cache, each stripe works in two phases:
57 * - caching phase
58 * - writing-out phase
59 *
60 * These two phases are controlled by bit STRIPE_R5C_CACHING:
61 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
62 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
63 *
64 * When there is no journal, or the journal is in write-through mode,
65 * the stripe is always in writing-out phase.
66 *
67 * For write-back journal, the stripe is sent to caching phase on write
68 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
69 * the write-out phase by clearing STRIPE_R5C_CACHING.
70 *
71 * Stripes in caching phase do not write the raid disks. Instead, all
72 * writes are committed to the log device. Therefore, a stripe in
73 * caching phase handles writes as:
74 * - write to log device
75 * - return IO
76 *
77 * Stripes in writing-out phase handle writes as:
78 * - calculate parity
79 * - write pending data and parity to journal
80 * - write data and parity to raid disks
81 * - return IO for pending writes
82 */
83
43struct r5l_log { 84struct r5l_log {
44 struct md_rdev *rdev; 85 struct md_rdev *rdev;
45 86
@@ -96,6 +137,9 @@ struct r5l_log {
96 spinlock_t no_space_stripes_lock; 137 spinlock_t no_space_stripes_lock;
97 138
98 bool need_cache_flush; 139 bool need_cache_flush;
140
141 /* for r5c_cache */
142 enum r5c_journal_mode r5c_journal_mode;
99}; 143};
100 144
101/* 145/*
@@ -133,6 +177,12 @@ enum r5l_io_unit_state {
133 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 177 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
134}; 178};
135 179
180bool r5c_is_writeback(struct r5l_log *log)
181{
182 return (log != NULL &&
183 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
184}
185
136static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 186static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
137{ 187{
138 start += inc; 188 start += inc;
@@ -168,12 +218,51 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
168 io->state = state; 218 io->state = state;
169} 219}
170 220
221/*
222 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
223 * This function should only be called in write-back mode.
224 */
225static void r5c_make_stripe_write_out(struct stripe_head *sh)
226{
227 struct r5conf *conf = sh->raid_conf;
228 struct r5l_log *log = conf->log;
229
230 BUG_ON(!r5c_is_writeback(log));
231
232 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
233 clear_bit(STRIPE_R5C_CACHING, &sh->state);
234}
235
236/*
237 * Set proper flags after writing (or flushing) data and/or parity to the
238 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
239 */
240static void r5c_finish_cache_stripe(struct stripe_head *sh)
241{
242 struct r5l_log *log = sh->raid_conf->log;
243
244 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
245 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
246 /*
247 * Set R5_InJournal for parity dev[pd_idx]. This means
248 * all data AND parity in the journal. For RAID 6, it is
249 * NOT necessary to set the flag for dev[qd_idx], as the
250 * two parities are written out together.
251 */
252 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
253 } else
254 BUG(); /* write-back logic in next patch */
255}
256
171static void r5l_io_run_stripes(struct r5l_io_unit *io) 257static void r5l_io_run_stripes(struct r5l_io_unit *io)
172{ 258{
173 struct stripe_head *sh, *next; 259 struct stripe_head *sh, *next;
174 260
175 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 261 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
176 list_del_init(&sh->log_list); 262 list_del_init(&sh->log_list);
263
264 r5c_finish_cache_stripe(sh);
265
177 set_bit(STRIPE_HANDLE, &sh->state); 266 set_bit(STRIPE_HANDLE, &sh->state);
178 raid5_release_stripe(sh); 267 raid5_release_stripe(sh);
179 } 268 }
@@ -412,18 +501,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
412 r5l_append_payload_page(log, sh->dev[i].page); 501 r5l_append_payload_page(log, sh->dev[i].page);
413 } 502 }
414 503
415 if (sh->qd_idx >= 0) { 504 if (parity_pages == 2) {
416 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 505 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
417 sh->sector, sh->dev[sh->pd_idx].log_checksum, 506 sh->sector, sh->dev[sh->pd_idx].log_checksum,
418 sh->dev[sh->qd_idx].log_checksum, true); 507 sh->dev[sh->qd_idx].log_checksum, true);
419 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 508 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
420 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 509 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
421 } else { 510 } else if (parity_pages == 1) {
422 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 511 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
423 sh->sector, sh->dev[sh->pd_idx].log_checksum, 512 sh->sector, sh->dev[sh->pd_idx].log_checksum,
424 0, false); 513 0, false);
425 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 514 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
426 } 515 } else /* Just writing data, not parity, in caching phase */
516 BUG_ON(parity_pages != 0);
427 517
428 list_add_tail(&sh->log_list, &io->stripe_list); 518 list_add_tail(&sh->log_list, &io->stripe_list);
429 atomic_inc(&io->pending_stripe); 519 atomic_inc(&io->pending_stripe);
@@ -455,6 +545,8 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
455 return -EAGAIN; 545 return -EAGAIN;
456 } 546 }
457 547
548 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
549
458 for (i = 0; i < sh->disks; i++) { 550 for (i = 0; i < sh->disks; i++) {
459 void *addr; 551 void *addr;
460 552
@@ -1112,6 +1204,49 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
1112 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1204 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1113} 1205}
1114 1206
1207/*
1208 * Try to handle write operations in caching phase. This function should only
1209 * be called in write-back mode.
1210 *
1211 * If all outstanding writes can be handled in caching phase, returns 0.
1212 * If writes require write-out phase, calls r5c_make_stripe_write_out()
1213 * and returns -EAGAIN.
1214 */
1215int r5c_try_caching_write(struct r5conf *conf,
1216 struct stripe_head *sh,
1217 struct stripe_head_state *s,
1218 int disks)
1219{
1220 struct r5l_log *log = conf->log;
1221
1222 BUG_ON(!r5c_is_writeback(log));
1223
1224 /* more write-back logic in next patches */
1225 r5c_make_stripe_write_out(sh);
1226 return -EAGAIN;
1227}
1228
1229/*
1230 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
1231 * stripe is committed to RAID disks.
1232 */
1233void r5c_finish_stripe_write_out(struct r5conf *conf,
1234 struct stripe_head *sh,
1235 struct stripe_head_state *s)
1236{
1237 if (!conf->log ||
1238 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
1239 return;
1240
1241 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1242 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
1243
1244 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1245 return;
1246 BUG(); /* write-back logic in following patches */
1247}
1248
1249
1115static int r5l_load_log(struct r5l_log *log) 1250static int r5l_load_log(struct r5l_log *log)
1116{ 1251{
1117 struct md_rdev *rdev = log->rdev; 1252 struct md_rdev *rdev = log->rdev;
@@ -1249,6 +1384,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1249 INIT_LIST_HEAD(&log->no_space_stripes); 1384 INIT_LIST_HEAD(&log->no_space_stripes);
1250 spin_lock_init(&log->no_space_stripes_lock); 1385 spin_lock_init(&log->no_space_stripes_lock);
1251 1386
1387 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1388
1252 if (r5l_load_log(log)) 1389 if (r5l_load_log(log))
1253 goto error; 1390 goto error;
1254 1391
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 34895f3218d9..7c98eb06d1b2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4107 if (rdev && !test_bit(Faulty, &rdev->flags)) 4107 if (rdev && !test_bit(Faulty, &rdev->flags))
4108 do_recovery = 1; 4108 do_recovery = 1;
4109 } 4109 }
4110
4111 if (test_bit(R5_InJournal, &dev->flags))
4112 s->injournal++;
4110 } 4113 }
4111 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4114 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4112 /* If there is a failed device being replaced, 4115 /* If there is a failed device being replaced,
@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh)
4386 || s.expanding) 4389 || s.expanding)
4387 handle_stripe_fill(sh, &s, disks); 4390 handle_stripe_fill(sh, &s, disks);
4388 4391
4389 /* Now to consider new write requests and what else, if anything 4392 /*
4390 * should be read. We do not handle new writes when: 4393 * When the stripe finishes full journal write cycle (write to journal
4394 * and raid disk), this is the clean up procedure so it is ready for
4395 * next operation.
4396 */
4397 r5c_finish_stripe_write_out(conf, sh, &s);
4398
4399 /*
4400 * Now to consider new write requests, cache write back and what else,
4401 * if anything should be read. We do not handle new writes when:
4391 * 1/ A 'write' operation (copy+xor) is already in flight. 4402 * 1/ A 'write' operation (copy+xor) is already in flight.
4392 * 2/ A 'check' operation is in flight, as it may clobber the parity 4403 * 2/ A 'check' operation is in flight, as it may clobber the parity
4393 * block. 4404 * block.
4405 * 3/ A r5c cache log write is in flight.
4394 */ 4406 */
4395 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 4407
4396 handle_stripe_dirtying(conf, sh, &s, disks); 4408 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4409 if (!r5c_is_writeback(conf->log)) {
4410 if (s.to_write)
4411 handle_stripe_dirtying(conf, sh, &s, disks);
4412 } else { /* write back cache */
4413 int ret = 0;
4414
4415 /* First, try to handle writes in caching phase */
4416 if (s.to_write)
4417 ret = r5c_try_caching_write(conf, sh, &s,
4418 disks);
4419 /*
4420 * If caching phase failed: ret == -EAGAIN
4421 * OR
4422 * stripe under reclaim: !caching && injournal
4423 *
4424 * fall back to handle_stripe_dirtying()
4425 */
4426 if (ret == -EAGAIN ||
4427 /* stripe under reclaim: !caching && injournal */
4428 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4429 s.injournal > 0))
4430 handle_stripe_dirtying(conf, sh, &s, disks);
4431 }
4432 }
4397 4433
4398 /* maybe we need to check and possibly fix the parity for this stripe 4434 /* maybe we need to check and possibly fix the parity for this stripe
4399 * Any reads will already have been scheduled, so we just see if enough 4435 * Any reads will already have been scheduled, so we just see if enough
@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5110 * data on failed drives. 5146 * data on failed drives.
5111 */ 5147 */
5112 if (rw == READ && mddev->degraded == 0 && 5148 if (rw == READ && mddev->degraded == 0 &&
5149 !r5c_is_writeback(conf->log) &&
5113 mddev->reshape_position == MaxSector) { 5150 mddev->reshape_position == MaxSector) {
5114 bi = chunk_aligned_read(mddev, bi); 5151 bi = chunk_aligned_read(mddev, bi);
5115 if (!bi) 5152 if (!bi)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ffc13c4d7e63..c9590a8e1425 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -264,6 +264,7 @@ struct stripe_head_state {
264 int syncing, expanding, expanded, replacing; 264 int syncing, expanding, expanded, replacing;
265 int locked, uptodate, to_read, to_write, failed, written; 265 int locked, uptodate, to_read, to_write, failed, written;
266 int to_fill, compute, req_compute, non_overwrite; 266 int to_fill, compute, req_compute, non_overwrite;
267 int injournal;
267 int failed_num[2]; 268 int failed_num[2];
268 int p_failed, q_failed; 269 int p_failed, q_failed;
269 int dec_preread_active; 270 int dec_preread_active;
@@ -313,6 +314,11 @@ enum r5dev_flags {
313 */ 314 */
314 R5_Discard, /* Discard the stripe */ 315 R5_Discard, /* Discard the stripe */
315 R5_SkipCopy, /* Don't copy data from bio to stripe cache */ 316 R5_SkipCopy, /* Don't copy data from bio to stripe cache */
317 R5_InJournal, /* data being written is in the journal device.
318 * if R5_InJournal is set for parity pd_idx, all the
319 * data and parity being written are in the journal
320 * device
321 */
316}; 322};
317 323
318/* 324/*
@@ -345,7 +351,23 @@ enum {
345 STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add 351 STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
346 * to batch yet. 352 * to batch yet.
347 */ 353 */
348 STRIPE_LOG_TRAPPED, /* trapped into log */ 354 STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
355 * this bit is used in two scenarios:
356 *
357 * 1. write-out phase
358 * set in first entry of r5l_write_stripe
359 * clear in second entry of r5l_write_stripe
360 * used to bypass logic in handle_stripe
361 *
362 * 2. caching phase
363 * set in r5c_try_caching_write()
364 * clear when journal write is done
365 * used to initiate r5c_cache_data()
366 * also used to bypass logic in handle_stripe
367 */
368 STRIPE_R5C_CACHING, /* the stripe is in caching phase
369 * see more detail in the raid5-cache.c
370 */
349}; 371};
350 372
351#define STRIPE_EXPAND_SYNC_FLAGS \ 373#define STRIPE_EXPAND_SYNC_FLAGS \
@@ -710,4 +732,11 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
710extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); 732extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
711extern void r5l_quiesce(struct r5l_log *log, int state); 733extern void r5l_quiesce(struct r5l_log *log, int state);
712extern bool r5l_log_disk_error(struct r5conf *conf); 734extern bool r5l_log_disk_error(struct r5conf *conf);
735extern bool r5c_is_writeback(struct r5l_log *log);
736extern int
737r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
738 struct stripe_head_state *s, int disks);
739extern void
740r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
741 struct stripe_head_state *s);
713#endif 742#endif