aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorJoe Thornber <ejt@redhat.com>2014-03-07 09:57:19 -0500
committerMike Snitzer <snitzer@redhat.com>2014-03-07 12:02:47 -0500
commitcebc2de44d3bce53e46476e774126c298ca2c8a9 (patch)
tree81050974e59be27436092b16cd47262cb5a9f93f /drivers/md
parentf6d16d32797f665100b1507f6a77c4cd0acb30c5 (diff)
dm space map metadata: fix refcount decrement below 0 which caused corruption
This has been a relatively long-standing issue that wasn't nailed down until Teng-Feng Yang's meticulous bug report to dm-devel on 3/7/2014, see: http://www.redhat.com/archives/dm-devel/2014-March/msg00021.html From that report: "When decreasing the reference count of a metadata block with its reference count equals 3, we will call dm_btree_remove() to remove this enrty from the B+tree which keeps the reference count info in metadata device. The B+tree will try to rebalance the entry of the child nodes in each node it traversed, and the rebalance process contains the following steps. (1) Finding the corresponding children in current node (shadow_current(s)) (2) Shadow the children block (issue BOP_INC) (3) redistribute keys among children, and free children if necessary (issue BOP_DEC) Since the update of a metadata block's reference count could be recursive, we will stash these reference count update operations in smm->uncommitted and then process them in a FILO fashion. The problem is that step(3) could free the children which is created in step(2), so the BOP_DEC issued in step(3) will be carried out before the BOP_INC issued in step(2) since these BOPs will be processed in FILO fashion. Once the BOP_DEC from step(3) tries to decrease the reference count of newly shadow block, it will report failure for its reference equals 0 before decreasing. It looks like we can solve this issue by processing these BOPs in a FIFO fashion instead of FILO." Commit 5b564d80 ("dm space map: disallow decrementing a reference count below zero") changed the code to report an error for this temporary refcount decrement below zero. So what was previously a harmless invalid refcount became a hard failure due to the new error path: device-mapper: space map common: unable to decrement a reference count below 0 device-mapper: thin: 253:6: dm_thin_insert_block() failed: error = -22 device-mapper: thin: 253:6: switching pool to read-only mode This bug is in dm persistent-data code that is common to the DM thin and cache targets. So any users of those targets should apply this fix. Fix this by applying recursive space map operations in FIFO order rather than FILO. Resolves: https://bugzilla.kernel.org/show_bug.cgi?id=68801 Reported-by: Apollon Oikonomopoulos <apoikos@debian.org> Reported-by: edwillam1007@gmail.com Reported-by: Teng-Feng Yang <shinrairis@gmail.com> Signed-off-by: Joe Thornber <ejt@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org # 3.13+
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c113
1 files changed, 92 insertions, 21 deletions
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e9bdd462f4f5..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
91 dm_block_t block; 91 dm_block_t block;
92}; 92};
93 93
94struct bop_ring_buffer {
95 unsigned begin;
96 unsigned end;
97 struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
98};
99
100static void brb_init(struct bop_ring_buffer *brb)
101{
102 brb->begin = 0;
103 brb->end = 0;
104}
105
106static bool brb_empty(struct bop_ring_buffer *brb)
107{
108 return brb->begin == brb->end;
109}
110
111static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
112{
113 unsigned r = old + 1;
114 return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
115}
116
117static int brb_push(struct bop_ring_buffer *brb,
118 enum block_op_type type, dm_block_t b)
119{
120 struct block_op *bop;
121 unsigned next = brb_next(brb, brb->end);
122
123 /*
124 * We don't allow the last bop to be filled, this way we can
125 * differentiate between full and empty.
126 */
127 if (next == brb->begin)
128 return -ENOMEM;
129
130 bop = brb->bops + brb->end;
131 bop->type = type;
132 bop->block = b;
133
134 brb->end = next;
135
136 return 0;
137}
138
139static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
140{
141 struct block_op *bop;
142
143 if (brb_empty(brb))
144 return -ENODATA;
145
146 bop = brb->bops + brb->begin;
147 result->type = bop->type;
148 result->block = bop->block;
149
150 brb->begin = brb_next(brb, brb->begin);
151
152 return 0;
153}
154
155/*----------------------------------------------------------------*/
156
94struct sm_metadata { 157struct sm_metadata {
95 struct dm_space_map sm; 158 struct dm_space_map sm;
96 159
@@ -101,25 +164,20 @@ struct sm_metadata {
101 164
102 unsigned recursion_count; 165 unsigned recursion_count;
103 unsigned allocated_this_transaction; 166 unsigned allocated_this_transaction;
104 unsigned nr_uncommitted; 167 struct bop_ring_buffer uncommitted;
105 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
106 168
107 struct threshold threshold; 169 struct threshold threshold;
108}; 170};
109 171
110static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) 172static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
111{ 173{
112 struct block_op *op; 174 int r = brb_push(&smm->uncommitted, type, b);
113 175
114 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { 176 if (r) {
115 DMERR("too many recursive allocations"); 177 DMERR("too many recursive allocations");
116 return -ENOMEM; 178 return -ENOMEM;
117 } 179 }
118 180
119 op = smm->uncommitted + smm->nr_uncommitted++;
120 op->type = type;
121 op->block = b;
122
123 return 0; 181 return 0;
124} 182}
125 183
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
158 return -ENOMEM; 216 return -ENOMEM;
159 } 217 }
160 218
161 if (smm->recursion_count == 1 && smm->nr_uncommitted) { 219 if (smm->recursion_count == 1) {
162 while (smm->nr_uncommitted && !r) { 220 while (!brb_empty(&smm->uncommitted)) {
163 smm->nr_uncommitted--; 221 struct block_op bop;
164 r = commit_bop(smm, smm->uncommitted + 222
165 smm->nr_uncommitted); 223 r = brb_pop(&smm->uncommitted, &bop);
224 if (r) {
225 DMERR("bug in bop ring buffer");
226 break;
227 }
228
229 r = commit_bop(smm, &bop);
166 if (r) 230 if (r)
167 break; 231 break;
168 } 232 }
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
217static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, 281static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
218 uint32_t *result) 282 uint32_t *result)
219{ 283{
220 int r, i; 284 int r;
285 unsigned i;
221 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 286 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
222 unsigned adjustment = 0; 287 unsigned adjustment = 0;
223 288
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
225 * We may have some uncommitted adjustments to add. This list 290 * We may have some uncommitted adjustments to add. This list
226 * should always be really short. 291 * should always be really short.
227 */ 292 */
228 for (i = 0; i < smm->nr_uncommitted; i++) { 293 for (i = smm->uncommitted.begin;
229 struct block_op *op = smm->uncommitted + i; 294 i != smm->uncommitted.end;
295 i = brb_next(&smm->uncommitted, i)) {
296 struct block_op *op = smm->uncommitted.bops + i;
230 297
231 if (op->block != b) 298 if (op->block != b)
232 continue; 299 continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
254static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, 321static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
255 dm_block_t b, int *result) 322 dm_block_t b, int *result)
256{ 323{
257 int r, i, adjustment = 0; 324 int r, adjustment = 0;
325 unsigned i;
258 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 326 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
259 uint32_t rc; 327 uint32_t rc;
260 328
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
262 * We may have some uncommitted adjustments to add. This list 330 * We may have some uncommitted adjustments to add. This list
263 * should always be really short. 331 * should always be really short.
264 */ 332 */
265 for (i = 0; i < smm->nr_uncommitted; i++) { 333 for (i = smm->uncommitted.begin;
266 struct block_op *op = smm->uncommitted + i; 334 i != smm->uncommitted.end;
335 i = brb_next(&smm->uncommitted, i)) {
336
337 struct block_op *op = smm->uncommitted.bops + i;
267 338
268 if (op->block != b) 339 if (op->block != b)
269 continue; 340 continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
671 smm->begin = superblock + 1; 742 smm->begin = superblock + 1;
672 smm->recursion_count = 0; 743 smm->recursion_count = 0;
673 smm->allocated_this_transaction = 0; 744 smm->allocated_this_transaction = 0;
674 smm->nr_uncommitted = 0; 745 brb_init(&smm->uncommitted);
675 threshold_init(&smm->threshold); 746 threshold_init(&smm->threshold);
676 747
677 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 748 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -715,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
715 smm->begin = 0; 786 smm->begin = 0;
716 smm->recursion_count = 0; 787 smm->recursion_count = 0;
717 smm->allocated_this_transaction = 0; 788 smm->allocated_this_transaction = 0;
718 smm->nr_uncommitted = 0; 789 brb_init(&smm->uncommitted);
719 threshold_init(&smm->threshold); 790 threshold_init(&smm->threshold);
720 791
721 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); 792 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));