author    Vishal Verma <vishal.l.verma@intel.com>    2017-12-18 11:28:39 -0500
committer Dan Williams <dan.j.williams@intel.com>    2017-12-21 17:59:27 -0500
commit    24e3a7fb60a9187e5df90e5fa655ffc94b9c4f77
tree      eac112b892e6a56fc4217ba20b5b30af1b4ea504
parent    13b7954c0b8dd2d6382b4ddb5053f09e389d5c6e
libnvdimm, btt: Fix an incompatibility in the log layout
Due to a spec misinterpretation, the Linux implementation of the BTT log area used a different padding scheme from other implementations, such as UEFI and NVML. This fixes the padding scheme, and defaults to it for new BTT layouts. When probing an existing BTT, we attempt to detect the padding scheme in use; if we detect the older/incompatible scheme, we continue using it.

Reported-by: Juston Li <juston.li@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Fixes: 5212e11fde4d ("nd_btt: atomic sector updates")
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r--  drivers/nvdimm/btt.c | 201
-rw-r--r--  drivers/nvdimm/btt.h |  45
2 files changed, 211 insertions(+), 35 deletions(-)
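For illustration only (this sketch is not part of the patch below): the incompatibility is purely about where the valid 16B entries sit inside each lane's 64B log region. A minimal standalone C sketch, with made-up helper names, showing that the pre-4.15 offset math is equivalent to treating each lane as a 64B group with valid entries at slots (0, 2), while the corrected scheme uses slots (0, 1):

/*
 * Illustrative sketch only -- not part of the patch. Helper names are
 * made up for the example; constants mirror the sizes described in btt.h.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_LOG_ENT_SIZE	16u	/* 16B of lba/old/new/seq fields */
#define SKETCH_LOG_GRP_SIZE	64u	/* four 16B slots per lane */

/* Pre-4.15 math: 32B on-media entries (16B fields + 16B pad), packed. */
static uint64_t old_scheme_off(uint32_t lane, uint32_t sub)
{
	return ((2ull * lane) + sub) * 32u;
}

/* Fixed math: lane selects a 64B group, log_index[] selects the slot. */
static uint64_t group_scheme_off(uint32_t lane, uint32_t sub,
		const int log_index[2])
{
	return ((uint64_t)lane * SKETCH_LOG_GRP_SIZE) +
		((uint64_t)log_index[sub] * SKETCH_LOG_ENT_SIZE);
}

int main(void)
{
	const int compat_idx[2] = { 0, 2 };	/* detected old layout */
	const int fixed_idx[2] = { 0, 1 };	/* default for new layouts */
	uint32_t lane = 1, sub = 1;

	/* 96 bytes: the old math and the (0, 2) index pair agree ... */
	printf("old:    %llu\n", (unsigned long long)old_scheme_off(lane, sub));
	printf("compat: %llu\n",
		(unsigned long long)group_scheme_off(lane, sub, compat_idx));
	/* ... while the fixed (0, 1) pair lands the entry at 80 bytes. */
	printf("fixed:  %llu\n",
		(unsigned long long)group_scheme_off(lane, sub, fixed_idx));
	return 0;
}

Detecting log_index = (0, 2) on an existing BTT therefore reproduces the old offsets exactly, while newly created layouts default to (0, 1).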
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index e949e3302af4..c586bcdb5190 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -211,12 +211,12 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 	return ret;
 }
 
-static int btt_log_read_pair(struct arena_info *arena, u32 lane,
-			struct log_entry *ent)
+static int btt_log_group_read(struct arena_info *arena, u32 lane,
+			struct log_group *log)
 {
 	return arena_read_bytes(arena,
-			arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
-			2 * LOG_ENT_SIZE, 0);
+			arena->logoff + (lane * LOG_GRP_SIZE), log,
+			LOG_GRP_SIZE, 0);
 }
 
 static struct dentry *debugfs_root;
@@ -256,6 +256,8 @@ static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
 	debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
 	debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
 	debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
+	debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
+	debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
 }
 
 static void btt_debugfs_init(struct btt *btt)
@@ -274,6 +276,11 @@ static void btt_debugfs_init(struct btt *btt)
 	}
 }
 
+static u32 log_seq(struct log_group *log, int log_idx)
+{
+	return le32_to_cpu(log->ent[log_idx].seq);
+}
+
 /*
  * This function accepts two log entries, and uses the
  * sequence number to find the 'older' entry.
@@ -283,8 +290,10 @@ static void btt_debugfs_init(struct btt *btt)
  *
  * TODO The logic feels a bit kludge-y. make it better..
  */
-static int btt_log_get_old(struct log_entry *ent)
+static int btt_log_get_old(struct arena_info *a, struct log_group *log)
 {
+	int idx0 = a->log_index[0];
+	int idx1 = a->log_index[1];
 	int old;
 
 	/*
@@ -292,23 +301,23 @@ static int btt_log_get_old(struct log_entry *ent)
 	 * the next time, the following logic works out to put this
 	 * (next) entry into [1]
 	 */
-	if (ent[0].seq == 0) {
-		ent[0].seq = cpu_to_le32(1);
+	if (log_seq(log, idx0) == 0) {
+		log->ent[idx0].seq = cpu_to_le32(1);
 		return 0;
 	}
 
-	if (ent[0].seq == ent[1].seq)
+	if (log_seq(log, idx0) == log_seq(log, idx1))
 		return -EINVAL;
-	if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5)
+	if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
 		return -EINVAL;
 
-	if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) {
-		if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1)
+	if (log_seq(log, idx0) < log_seq(log, idx1)) {
+		if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
 			old = 0;
 		else
 			old = 1;
 	} else {
-		if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1)
+		if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
 			old = 1;
 		else
 			old = 0;
@@ -328,17 +337,18 @@ static int btt_log_read(struct arena_info *arena, u32 lane,
 {
 	int ret;
 	int old_ent, ret_ent;
-	struct log_entry log[2];
+	struct log_group log;
 
-	ret = btt_log_read_pair(arena, lane, log);
+	ret = btt_log_group_read(arena, lane, &log);
 	if (ret)
 		return -EIO;
 
-	old_ent = btt_log_get_old(log);
+	old_ent = btt_log_get_old(arena, &log);
 	if (old_ent < 0 || old_ent > 1) {
 		dev_err(to_dev(arena),
 			"log corruption (%d): lane %d seq [%d, %d]\n",
-			old_ent, lane, log[0].seq, log[1].seq);
+			old_ent, lane, log.ent[arena->log_index[0]].seq,
+			log.ent[arena->log_index[1]].seq);
 		/* TODO set error state? */
 		return -EIO;
 	}
@@ -346,7 +356,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane,
 	ret_ent = (old_flag ? old_ent : (1 - old_ent));
 
 	if (ent != NULL)
-		memcpy(ent, &log[ret_ent], LOG_ENT_SIZE);
+		memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);
 
 	return ret_ent;
 }
@@ -360,17 +370,13 @@ static int __btt_log_write(struct arena_info *arena, u32 lane,
 		u32 sub, struct log_entry *ent, unsigned long flags)
 {
 	int ret;
-	/*
-	 * Ignore the padding in log_entry for calculating log_half.
-	 * The entry is 'committed' when we write the sequence number,
-	 * and we want to ensure that that is the last thing written.
-	 * We don't bother writing the padding as that would be extra
-	 * media wear and write amplification
-	 */
-	unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2;
-	u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE);
+	u32 group_slot = arena->log_index[sub];
+	unsigned int log_half = LOG_ENT_SIZE / 2;
 	void *src = ent;
+	u64 ns_off;
 
+	ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
+		(group_slot * LOG_ENT_SIZE);
 	/* split the 16B write into atomic, durable halves */
 	ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
 	if (ret)
@@ -453,7 +459,7 @@ static int btt_log_init(struct arena_info *arena)
 {
 	size_t logsize = arena->info2off - arena->logoff;
 	size_t chunk_size = SZ_4K, offset = 0;
-	struct log_entry log;
+	struct log_entry ent;
 	void *zerobuf;
 	int ret;
 	u32 i;
@@ -485,11 +491,11 @@ static int btt_log_init(struct arena_info *arena)
 	}
 
 	for (i = 0; i < arena->nfree; i++) {
-		log.lba = cpu_to_le32(i);
-		log.old_map = cpu_to_le32(arena->external_nlba + i);
-		log.new_map = cpu_to_le32(arena->external_nlba + i);
-		log.seq = cpu_to_le32(LOG_SEQ_INIT);
-		ret = __btt_log_write(arena, i, 0, &log, 0);
+		ent.lba = cpu_to_le32(i);
+		ent.old_map = cpu_to_le32(arena->external_nlba + i);
+		ent.new_map = cpu_to_le32(arena->external_nlba + i);
+		ent.seq = cpu_to_le32(LOG_SEQ_INIT);
+		ret = __btt_log_write(arena, i, 0, &ent, 0);
 		if (ret)
 			goto free;
 	}
@@ -594,6 +600,123 @@ static int btt_freelist_init(struct arena_info *arena)
 	return 0;
 }
 
+static bool ent_is_padding(struct log_entry *ent)
+{
+	return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
+		&& (ent->seq == 0);
+}
+
+/*
+ * Detecting valid log indices: We read a log group (see the comments in btt.h
+ * for a description of a 'log_group' and its 'slots'), and iterate over its
+ * four slots. We expect that a padding slot will be all-zeroes, and use this
+ * to detect a padding slot vs. an actual entry.
+ *
+ * If a log_group is in the initial state, i.e. hasn't been used since the
+ * creation of this BTT layout, it will have three of the four slots with
+ * zeroes. We skip over these log_groups for the detection of log_index. If
+ * all log_groups are in the initial state (i.e. the BTT has never been
+ * written to), it is safe to assume the 'new format' of log entries in slots
+ * (0, 1).
+ */
+static int log_set_indices(struct arena_info *arena)
+{
+	bool idx_set = false, initial_state = true;
+	int ret, log_index[2] = {-1, -1};
+	u32 i, j, next_idx = 0;
+	struct log_group log;
+	u32 pad_count = 0;
+
+	for (i = 0; i < arena->nfree; i++) {
+		ret = btt_log_group_read(arena, i, &log);
+		if (ret < 0)
+			return ret;
+
+		for (j = 0; j < 4; j++) {
+			if (!idx_set) {
+				if (ent_is_padding(&log.ent[j])) {
+					pad_count++;
+					continue;
+				} else {
+					/* Skip if index has been recorded */
+					if ((next_idx == 1) &&
+						(j == log_index[0]))
+						continue;
+					/* valid entry, record index */
+					log_index[next_idx] = j;
+					next_idx++;
+				}
+				if (next_idx == 2) {
+					/* two valid entries found */
+					idx_set = true;
+				} else if (next_idx > 2) {
+					/* too many valid indices */
+					return -ENXIO;
+				}
+			} else {
+				/*
+				 * once the indices have been set, just verify
+				 * that all subsequent log groups are either in
+				 * their initial state or follow the same
+				 * indices.
+				 */
+				if (j == log_index[0]) {
+					/* entry must be 'valid' */
+					if (ent_is_padding(&log.ent[j]))
+						return -ENXIO;
+				} else if (j == log_index[1]) {
+					;
+					/*
+					 * log_index[1] can be padding if the
+					 * lane never got used and it is still
+					 * in the initial state (three 'padding'
+					 * entries)
+					 */
+				} else {
+					/* entry must be invalid (padding) */
+					if (!ent_is_padding(&log.ent[j]))
+						return -ENXIO;
+				}
+			}
+		}
+		/*
+		 * If any of the log_groups have more than one valid,
+		 * non-padding entry, then the we are no longer in the
+		 * initial_state
+		 */
+		if (pad_count < 3)
+			initial_state = false;
+		pad_count = 0;
+	}
+
+	if (!initial_state && !idx_set)
+		return -ENXIO;
+
+	/*
+	 * If all the entries in the log were in the initial state,
+	 * assume new padding scheme
+	 */
+	if (initial_state)
+		log_index[1] = 1;
+
+	/*
+	 * Only allow the known permutations of log/padding indices,
+	 * i.e. (0, 1), and (0, 2)
+	 */
+	if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
+		; /* known index possibilities */
+	else {
+		dev_err(to_dev(arena), "Found an unknown padding scheme\n");
+		return -ENXIO;
+	}
+
+	arena->log_index[0] = log_index[0];
+	arena->log_index[1] = log_index[1];
+	dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]);
+	dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]);
+	return 0;
+}
+
 static int btt_rtt_init(struct arena_info *arena)
 {
 	arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
@@ -650,8 +773,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	available -= 2 * BTT_PG_SIZE;
 
 	/* The log takes a fixed amount of space based on nfree */
-	logsize = roundup(2 * arena->nfree * sizeof(struct log_entry),
-			BTT_PG_SIZE);
+	logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
 	available -= logsize;
 
 	/* Calculate optimal split between map and data area */
@@ -668,6 +790,10 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	arena->mapoff = arena->dataoff + datasize;
 	arena->logoff = arena->mapoff + mapsize;
 	arena->info2off = arena->logoff + logsize;
+
+	/* Default log indices are (0,1) */
+	arena->log_index[0] = 0;
+	arena->log_index[1] = 1;
 	return arena;
 }
 
@@ -758,6 +884,13 @@ static int discover_arenas(struct btt *btt)
 		arena->external_lba_start = cur_nlba;
 		parse_arena_meta(arena, super, cur_off);
 
+		ret = log_set_indices(arena);
+		if (ret) {
+			dev_err(to_dev(arena),
+				"Unable to deduce log/padding indices\n");
+			goto out;
+		}
+
 		mutex_init(&arena->err_lock);
 		ret = btt_freelist_init(arena);
 		if (ret)
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 884fbbbdd18a..db3cb6d4d0d4 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -27,6 +27,7 @@
 #define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
 #define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
 #define MAP_ENT_NORMAL 0xC0000000
+#define LOG_GRP_SIZE sizeof(struct log_group)
 #define LOG_ENT_SIZE sizeof(struct log_entry)
 #define ARENA_MIN_SIZE (1UL << 24)	/* 16 MB */
 #define ARENA_MAX_SIZE (1ULL << 39)	/* 512 GB */
@@ -50,12 +51,52 @@ enum btt_init_state {
 	INIT_READY
 };
 
+/*
+ * A log group represents one log 'lane', and consists of four log entries.
+ * Two of the four entries are valid entries, and the remaining two are
+ * padding. Due to an old bug in the padding location, we need to perform a
+ * test to determine the padding scheme being used, and use that scheme
+ * thereafter.
+ *
+ * In kernels prior to 4.15, 'log group' would have actual log entries at
+ * indices (0, 2) and padding at indices (1, 3), where as the correct/updated
+ * format has log entries at indices (0, 1) and padding at indices (2, 3).
+ *
+ * Old (pre 4.15) format:
+ * +-----------------+-----------------+
+ * |      ent[0]     |      ent[1]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq |       pad       |
+ * +-----------------------------------+
+ * |      ent[2]     |      ent[3]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq |       pad       |
+ * +-----------------+-----------------+
+ *
+ * New format:
+ * +-----------------+-----------------+
+ * |      ent[0]     |      ent[1]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq | lba/old/new/seq |
+ * +-----------------------------------+
+ * |      ent[2]     |      ent[3]     |
+ * |       16B       |       16B       |
+ * |       pad       |       pad       |
+ * +-----------------+-----------------+
+ *
+ * We detect during start-up which format is in use, and set
+ * arena->log_index[(0, 1)] with the detected format.
+ */
+
 struct log_entry {
 	__le32 lba;
 	__le32 old_map;
 	__le32 new_map;
 	__le32 seq;
-	__le64 padding[2];
+};
+
+struct log_group {
+	struct log_entry ent[4];
 };
 
 struct btt_sb {
@@ -126,6 +167,7 @@ struct aligned_lock {
  * @debugfs_dir:	Debugfs dentry
  * @flags:		Arena flags - may signify error states.
  * @err_lock:	Mutex for synchronizing error clearing.
+ * @log_index:	Indices of the valid log entries in a log_group
  *
  * arena_info is a per-arena handle. Once an arena is narrowed down for an
  * IO, this struct is passed around for the duration of the IO.
@@ -158,6 +200,7 @@ struct arena_info {
 	/* Arena flags */
 	u32 flags;
 	struct mutex err_lock;
+	int log_index[2];
 };
 
 /**