author		Kent Overstreet <kmo@daterainc.com>	2013-09-10 22:07:00 -0400
committer	Kent Overstreet <kmo@daterainc.com>	2013-11-11 00:56:37 -0500
commit		a1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6 (patch)
tree		287cf5c3daa6ae4ec2ffc55b33b6df52c617f7f7 /drivers/md/bcache/btree.c
parent		8835c1234dd9a838993a2d5cb7572f57992ebbee (diff)
bcache: Incremental gc
Big garbage collection rewrite; garbage collection now uses the same
mechanisms used elsewhere for inserting/updating btree node pointers,
instead of rewriting interior btree nodes in place.
This makes the code significantly cleaner and less fragile, and it means
garbage collection can now be incremental: it no longer has to hold a
write lock on the root of the btree for the entire duration of garbage
collection.
This reduces the latency hit from garbage collection, which means we can
gc more frequently (and do a better job of reclaiming from the cache),
and we can coalesce across more btree nodes (improving our space
efficiency).
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
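
Editor's note: in outline, the patch turns GC from a single traversal that pins the btree root into a restartable loop. Progress is checkpointed in the cache set's gc_done key, the recursion bails out with -EAGAIN or -EINTR whenever it should yield (or has invalidated its own iterator), and the new do/while loop in bch_btree_gc() simply reruns the pass until one completes. Below is a minimal userspace C sketch of that restartable-pass pattern; the names (gc_pass, GC_YIELD, budget) are hypothetical stand-ins, not the bcache code, which appears in the diff that follows.

#include <stdio.h>

/* Stand-in for struct cache_set: gc_done models the bkey c->gc_done that
 * the patch seeds its btree iterator with (hypothetical simplification). */
struct cache_set {
	unsigned gc_done;		/* first key not yet processed */
};

#define GC_YIELD 1			/* models returning -EAGAIN/-EINTR */

/*
 * One GC pass: resume from the checkpoint, process at most `budget` keys
 * (standing in for need_resched() firing), then yield instead of holding
 * the tree locked for the whole pass.
 */
static int gc_pass(struct cache_set *c, unsigned nkeys, unsigned budget)
{
	unsigned start = c->gc_done, k;

	for (k = start; k < nkeys; k++) {
		/* ... mark key k, maybe coalesce adjacent nodes ... */
		c->gc_done = k + 1;	/* record progress before yielding */

		if (c->gc_done - start >= budget && c->gc_done < nkeys)
			return GC_YIELD;
	}

	c->gc_done = 0;			/* pass complete; reset checkpoint */
	return 0;
}

int main(void)
{
	struct cache_set c = { 0 };
	int ret;

	/* Mirrors the new do { ... } while (ret) loop in bch_btree_gc(). */
	do {
		ret = gc_pass(&c, 10, 3);
		printf("pass returned %d, gc_done = %u\n", ret, c.gc_done);
	} while (ret);

	return 0;
}

The cost of yielding is that the in-flight btree iterator is no longer valid, which is why several paths in the patch return -EINTR right after pushing their keys into the parent (the "Invalidated our iterator" comments below).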
Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r--	drivers/md/bcache/btree.c	388
1 file changed, 225 insertions, 163 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index a3f8ca4ee6e0..7d283d217438 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1176,12 +1176,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 
 #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
 
-static int btree_gc_mark_node(struct btree *b, unsigned *keys,
-			      struct gc_stat *gc)
+static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 {
 	uint8_t stale = 0;
-	unsigned last_dev = -1;
-	struct bcache_device *d = NULL;
+	unsigned keys = 0, good_keys = 0;
 	struct bkey *k;
 	struct btree_iter iter;
 	struct bset_tree *t;
@@ -1189,27 +1187,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 	gc->nodes++;
 
 	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
-		if (last_dev != KEY_INODE(k)) {
-			last_dev = KEY_INODE(k);
-
-			d = KEY_INODE(k) < b->c->nr_uuids
-				? b->c->devices[last_dev]
-				: NULL;
-		}
-
 		stale = max(stale, btree_mark_key(b, k));
+		keys++;
 
 		if (bch_ptr_bad(b, k))
 			continue;
 
-		*keys += bkey_u64s(k);
-
 		gc->key_bytes += bkey_u64s(k);
 		gc->nkeys++;
+		good_keys++;
 
 		gc->data += KEY_SIZE(k);
-		if (KEY_DIRTY(k))
-			gc->dirty += KEY_SIZE(k);
 	}
 
 	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1218,79 +1206,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 			bkey_cmp(&b->key, &t->end) < 0,
 			b, "found short btree key in gc");
 
-	return stale;
-}
-
-static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
-{
-	/*
-	 * We block priorities from being written for the duration of garbage
-	 * collection, so we can't sleep in btree_alloc() ->
-	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
-	 * our closure.
-	 */
-	struct btree *n = btree_node_alloc_replacement(b);
-
-	if (!IS_ERR_OR_NULL(n)) {
-		swap(b, n);
+	if (b->c->gc_always_rewrite)
+		return true;
 
-		memcpy(k->ptr, b->key.ptr,
-		       sizeof(uint64_t) * KEY_PTRS(&b->key));
+	if (stale > 10)
+		return true;
 
-		btree_node_free(n);
-		up_write(&n->lock);
-	}
+	if ((keys - good_keys) * 2 > keys)
+		return true;
 
-	return b;
+	return false;
 }
 
-/*
- * Leaving this at 2 until we've got incremental garbage collection done; it
- * could be higher (and has been tested with 4) except that garbage collection
- * could take much longer, adversely affecting latency.
- */
-#define GC_MERGE_NODES	2U
+#define GC_MERGE_NODES	4U
 
 struct gc_merge_info {
 	struct btree	*b;
-	struct bkey	*k;
 	unsigned	keys;
 };
 
-static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
-			      struct gc_merge_info *r)
+static int bch_btree_insert_node(struct btree *, struct btree_op *,
+				 struct keylist *, atomic_t *, struct bkey *);
+
+static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
+			     struct keylist *keylist, struct gc_stat *gc,
+			     struct gc_merge_info *r)
 {
-	unsigned nodes = 0, keys = 0, blocks;
-	int i;
+	unsigned i, nodes = 0, keys = 0, blocks;
+	struct btree *new_nodes[GC_MERGE_NODES];
 	struct closure cl;
+	struct bkey *k;
 
+	memset(new_nodes, 0, sizeof(new_nodes));
 	closure_init_stack(&cl);
 
-	while (nodes < GC_MERGE_NODES && r[nodes].b)
+	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
 		keys += r[nodes++].keys;
 
 	blocks = btree_default_blocks(b->c) * 2 / 3;
 
 	if (nodes < 2 ||
 	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
-		return;
-
-	for (i = nodes - 1; i >= 0; --i) {
-		if (r[i].b->written)
-			r[i].b = btree_gc_alloc(r[i].b, r[i].k);
+		return 0;
 
-		if (r[i].b->written)
-			return;
+	for (i = 0; i < nodes; i++) {
+		new_nodes[i] = btree_node_alloc_replacement(r[i].b);
+		if (IS_ERR_OR_NULL(new_nodes[i]))
+			goto out_nocoalesce;
 	}
 
 	for (i = nodes - 1; i > 0; --i) {
-		struct bset *n1 = r[i].b->sets->data;
-		struct bset *n2 = r[i - 1].b->sets->data;
+		struct bset *n1 = new_nodes[i]->sets->data;
+		struct bset *n2 = new_nodes[i - 1]->sets->data;
 		struct bkey *k, *last = NULL;
 
 		keys = 0;
 
-		if (i == 1) {
+		if (i > 1) {
+			for (k = n2->start;
+			     k < end(n2);
+			     k = bkey_next(k)) {
+				if (__set_blocks(n1, n1->keys + keys +
+						 bkey_u64s(k), b->c) > blocks)
+					break;
+
+				last = k;
+				keys += bkey_u64s(k);
+			}
+		} else {
 			/*
 			 * Last node we're not getting rid of - we're getting
 			 * rid of the node at r[0]. Have to try and fit all of
@@ -1299,37 +1282,27 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
 			 * length keys (shouldn't be possible in practice,
 			 * though)
 			 */
-			if (__set_blocks(n1, n1->keys + r->keys,
-					 b->c) > btree_blocks(r[i].b))
-				return;
+			if (__set_blocks(n1, n1->keys + n2->keys,
+					 b->c) > btree_blocks(new_nodes[i]))
+				goto out_nocoalesce;
 
 			keys = n2->keys;
+			/* Take the key of the node we're getting rid of */
 			last = &r->b->key;
-		} else
-			for (k = n2->start;
-			     k < end(n2);
-			     k = bkey_next(k)) {
-				if (__set_blocks(n1, n1->keys + keys +
-						 bkey_u64s(k), b->c) > blocks)
-					break;
-
-				last = k;
-				keys += bkey_u64s(k);
-			}
+		}
 
 		BUG_ON(__set_blocks(n1, n1->keys + keys,
-				    b->c) > btree_blocks(r[i].b));
+				    b->c) > btree_blocks(new_nodes[i]));
 
-		if (last) {
-			bkey_copy_key(&r[i].b->key, last);
-			bkey_copy_key(r[i].k, last);
-		}
+		if (last)
+			bkey_copy_key(&new_nodes[i]->key, last);
 
 		memcpy(end(n1),
 		       n2->start,
 		       (void *) node(n2, keys) - (void *) n2->start);
 
 		n1->keys += keys;
+		r[i].keys = n1->keys;
 
 		memmove(n2->start,
 		       node(n2, keys),
@@ -1337,93 +1310,175 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
 
 		n2->keys -= keys;
 
-		r[i].keys = n1->keys;
-		r[i - 1].keys = n2->keys;
+		if (bch_keylist_realloc(keylist,
+					KEY_PTRS(&new_nodes[i]->key), b->c))
+			goto out_nocoalesce;
+
+		bch_btree_node_write(new_nodes[i], &cl);
+		bch_keylist_add(keylist, &new_nodes[i]->key);
 	}
 
-	btree_node_free(r->b);
-	up_write(&r->b->lock);
+	for (i = 0; i < nodes; i++) {
+		if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
+			goto out_nocoalesce;
 
-	trace_bcache_btree_gc_coalesce(nodes);
+		make_btree_freeing_key(r[i].b, keylist->top);
+		bch_keylist_push(keylist);
+	}
+
+	/* We emptied out this node */
+	BUG_ON(new_nodes[0]->sets->data->keys);
+	btree_node_free(new_nodes[0]);
+	rw_unlock(true, new_nodes[0]);
 
+	closure_sync(&cl);
+
+	for (i = 0; i < nodes; i++) {
+		btree_node_free(r[i].b);
+		rw_unlock(true, r[i].b);
+
+		r[i].b = new_nodes[i];
+	}
+
+	bch_btree_insert_node(b, op, keylist, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(keylist));
+
+	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
+	r[nodes - 1].b = ERR_PTR(-EINTR);
+
+	trace_bcache_btree_gc_coalesce(nodes);
 	gc->nodes--;
-	nodes--;
 
-	memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
-	memset(&r[nodes], 0, sizeof(struct gc_merge_info));
+	/* Invalidated our iterator */
+	return -EINTR;
+
+out_nocoalesce:
+	closure_sync(&cl);
+
+	while ((k = bch_keylist_pop(keylist)))
+		if (!bkey_cmp(k, &ZERO_KEY))
+			atomic_dec(&b->c->prio_blocked);
+
+	for (i = 0; i < nodes; i++)
+		if (!IS_ERR_OR_NULL(new_nodes[i])) {
+			btree_node_free(new_nodes[i]);
+			rw_unlock(true, new_nodes[i]);
+		}
+	return 0;
 }
 
-static int btree_gc_recurse(struct btree *b, struct btree_op *op,
-			    struct closure *writes, struct gc_stat *gc)
+static unsigned btree_gc_count_keys(struct btree *b)
 {
-	void write(struct btree *r)
-	{
-		if (!r->written || btree_node_dirty(r))
-			bch_btree_node_write(r, writes);
+	struct bkey *k;
+	struct btree_iter iter;
+	unsigned ret = 0;
 
-		up_write(&r->lock);
-	}
+	for_each_key_filter(b, k, &iter, bch_ptr_bad)
+		ret += bkey_u64s(k);
 
-	int ret = 0, stale;
+	return ret;
+}
+
+static int btree_gc_recurse(struct btree *b, struct btree_op *op,
+			    struct closure *writes, struct gc_stat *gc)
+{
 	unsigned i;
+	int ret = 0;
+	bool should_rewrite;
+	struct btree *n;
+	struct bkey *k;
+	struct keylist keys;
+	struct btree_iter iter;
 	struct gc_merge_info r[GC_MERGE_NODES];
+	struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
 
-	memset(r, 0, sizeof(r));
+	bch_keylist_init(&keys);
+	bch_btree_iter_init(b, &iter, &b->c->gc_done);
 
-	while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
-		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true);
+	for (i = 0; i < GC_MERGE_NODES; i++)
+		r[i].b = ERR_PTR(-EINTR);
 
-		if (IS_ERR(r->b)) {
-			ret = PTR_ERR(r->b);
-			break;
+	while (1) {
+		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+		if (k) {
+			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+			if (IS_ERR(r->b)) {
+				ret = PTR_ERR(r->b);
+				break;
+			}
+
+			r->keys = btree_gc_count_keys(r->b);
+
+			ret = btree_gc_coalesce(b, op, &keys, gc, r);
+			if (ret)
+				break;
 		}
 
-		r->keys = 0;
-		stale = btree_gc_mark_node(r->b, &r->keys, gc);
+		if (!last->b)
+			break;
 
-		if (!b->written &&
-		    (r->b->level || stale > 10 ||
-		     b->c->gc_always_rewrite))
-			r->b = btree_gc_alloc(r->b, r->k);
+		if (!IS_ERR(last->b)) {
+			should_rewrite = btree_gc_mark_node(last->b, gc);
+			if (should_rewrite) {
+				n = btree_node_alloc_replacement(last->b);
 
-		if (r->b->level)
-			ret = btree_gc_recurse(r->b, op, writes, gc);
+				if (!IS_ERR_OR_NULL(n)) {
+					bch_btree_node_write_sync(n);
+					bch_keylist_add(&keys, &n->key);
 
-		if (ret) {
-			write(r->b);
-			break;
-		}
+					make_btree_freeing_key(last->b,
+							       keys.top);
+					bch_keylist_push(&keys);
+
+					btree_node_free(last->b);
 
-		bkey_copy_key(&b->c->gc_done, r->k);
+					bch_btree_insert_node(b, op, &keys,
+							      NULL, NULL);
+					BUG_ON(!bch_keylist_empty(&keys));
 
-		if (!b->written)
-			btree_gc_coalesce(b, gc, r);
+					rw_unlock(true, last->b);
+					last->b = n;
+
+					/* Invalidated our iterator */
+					ret = -EINTR;
+					break;
+				}
+			}
 
-		if (r[GC_MERGE_NODES - 1].b)
-			write(r[GC_MERGE_NODES - 1].b);
+			if (last->b->level) {
+				ret = btree_gc_recurse(last->b, op, writes, gc);
+				if (ret)
+					break;
+			}
 
-		memmove(&r[1], &r[0],
-			sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
+			bkey_copy_key(&b->c->gc_done, &last->b->key);
+
+			/*
+			 * Must flush leaf nodes before gc ends, since replace
+			 * operations aren't journalled
+			 */
+			if (btree_node_dirty(last->b))
+				bch_btree_node_write(last->b, writes);
+			rw_unlock(true, last->b);
+		}
+
+		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
+		r->b = NULL;
 
-		/* When we've got incremental GC working, we'll want to do
-		 * if (should_resched())
-		 *	return -EAGAIN;
-		 */
-		cond_resched();
-#if 0
 		if (need_resched()) {
 			ret = -EAGAIN;
 			break;
 		}
-#endif
 	}
 
-	for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
-		write(r[i].b);
+	for (i = 0; i < GC_MERGE_NODES; i++)
+		if (!IS_ERR_OR_NULL(r[i].b)) {
+			if (btree_node_dirty(r[i].b))
+				bch_btree_node_write(r[i].b, writes);
+			rw_unlock(true, r[i].b);
+		}
 
-	/* Might have freed some children, must remove their keys */
-	if (!b->written)
-		bch_btree_sort(b);
+	bch_keylist_free(&keys);
 
 	return ret;
 }
@@ -1432,27 +1487,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 			     struct closure *writes, struct gc_stat *gc)
 {
 	struct btree *n = NULL;
-	unsigned keys = 0;
-	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
+	int ret = 0;
+	bool should_rewrite;
 
-	if (b->level || stale > 10)
+	should_rewrite = btree_gc_mark_node(b, gc);
+	if (should_rewrite) {
 		n = btree_node_alloc_replacement(b);
 
-	if (!IS_ERR_OR_NULL(n))
-		swap(b, n);
-
-	if (b->level)
-		ret = btree_gc_recurse(b, op, writes, gc);
+		if (!IS_ERR_OR_NULL(n)) {
+			bch_btree_node_write_sync(n);
+			bch_btree_set_root(n);
+			btree_node_free(b);
+			rw_unlock(true, n);
 
-	if (!b->written || btree_node_dirty(b))
-		bch_btree_node_write_sync(b);
+			return -EINTR;
+		}
+	}
 
-	if (!IS_ERR_OR_NULL(n)) {
-		bch_btree_set_root(b);
-		btree_node_free(n);
-		rw_unlock(true, b);
+	if (b->level) {
+		ret = btree_gc_recurse(b, op, writes, gc);
+		if (ret)
+			return ret;
 	}
 
+	bkey_copy_key(&b->c->gc_done, &b->key);
+
 	return ret;
 }
 
@@ -1550,29 +1609,20 @@ static void bch_btree_gc(struct cache_set *c)
 
 	btree_gc_start(c);
 
-	atomic_inc(&c->prio_blocked);
-
-	ret = btree_root(gc_root, c, &op, &writes, &stats);
-	closure_sync(&writes);
-
-	if (ret) {
-		pr_warn("gc failed!");
-		return;
-	}
+	do {
+		ret = btree_root(gc_root, c, &op, &writes, &stats);
+		closure_sync(&writes);
 
-	/* Possibly wait for new UUIDs or whatever to hit disk */
-	bch_journal_meta(c, &writes);
-	closure_sync(&writes);
+		if (ret && ret != -EAGAIN)
+			pr_warn("gc failed!");
+	} while (ret);
 
 	available = bch_btree_gc_finish(c);
-
-	atomic_dec(&c->prio_blocked);
 	wake_up_allocators(c);
 
 	bch_time_stats_update(&c->btree_gc_time, start_time);
 
 	stats.key_bytes *= sizeof(uint64_t);
-	stats.dirty <<= 9;
 	stats.data <<= 9;
 	stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
@@ -1585,14 +1635,28 @@ static void bch_btree_gc(struct cache_set *c)
 static int bch_gc_thread(void *arg)
 {
 	struct cache_set *c = arg;
+	struct cache *ca;
+	unsigned i;
 
 	while (1) {
+again:
 		bch_btree_gc(c);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (kthread_should_stop())
 			break;
 
+		mutex_lock(&c->bucket_lock);
+
+		for_each_cache(ca, c, i)
+			if (ca->invalidate_needs_gc) {
+				mutex_unlock(&c->bucket_lock);
+				set_current_state(TASK_RUNNING);
+				goto again;
+			}
+
+		mutex_unlock(&c->bucket_lock);
+
 		try_to_freeze();
 		schedule();
 	}
@@ -2083,8 +2147,6 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 
 	bch_keylist_init(&split_keys);
 
-	BUG_ON(b->level);
-
 	do {
 		BUG_ON(b->level && replace_key);
 
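
Editor's note: one detail worth calling out in btree_gc_coalesce() above is the merge gate: nodes are coalesced only when all of their live keys would fit into one fewer node at a two-thirds fill target (blocks = btree_default_blocks(b->c) * 2 / 3, checked against blocks * (nodes - 1)). A hedged, self-contained model of that check, with made-up block counts (bcache derives the real ones from bset headers via __set_blocks()):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative model of the coalesce gate in btree_gc_coalesce(): merge
 * `nodes` btree nodes only if their combined live keys fit in (nodes - 1)
 * nodes at a 2/3 fill target.  Block counts here are invented for the
 * example.
 */
static bool should_coalesce(unsigned nodes, unsigned key_blocks,
			    unsigned node_blocks)
{
	unsigned blocks = node_blocks * 2 / 3;	/* 2/3 fill target */

	return nodes >= 2 && key_blocks <= blocks * (nodes - 1);
}

int main(void)
{
	/* 4 nodes of 64 blocks, 100 blocks of live keys:
	 * 100 <= (64 * 2 / 3) * 3 = 126, so GC would try to coalesce. */
	printf("%d\n", should_coalesce(4, 100, 64));

	/* 140 blocks of live keys would not fit: 140 > 126. */
	printf("%d\n", should_coalesce(4, 140, 64));
	return 0;
}

The two-thirds target presumably leaves headroom so freshly coalesced nodes don't split again on the next insert.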