aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBenjamin Marzinski <bmarzins@redhat.com>2015-12-01 09:46:55 -0500
committerBob Peterson <rpeterso@redhat.com>2015-12-14 13:19:37 -0500
commit471f3db2786bc32011d6693413eb93b0c3da2579 (patch)
treea3af645fd5dbb014bffa50a715422b954e83564f
parent340174722929d80a107120400bab527cfc7e47f1 (diff)
gfs2: change gfs2 readdir cookie
gfs2 currently returns 31 bits of filename hash as a cookie that readdir uses for an offset into the directory. When there are a large number of directory entries, the likelihood of a collision goes up way too quickly. GFS2 will now return cookies that are guaranteed unique for a while, and then fail back to using 30 bits of filename hash. Specifically, the directory leaf blocks are divided up into chunks based on the minimum size of a gfs2 directory entry (48 bytes). Each entry's cookie is based off the chunk where it starts, in the linked list of leaf blocks that it hashes to (there are 131072 hash buckets). Directory entries will have unique names until they take reach chunk 8192. Assuming the largest filenames possible, and the least efficient spacing possible, this new method will still be able to return unique names when the previous method has statistically more than a 99% chance of a collision. The non-unique names it fails back to are guaranteed to not collide with the unique names. unique cookies will be in this format: - 1 bit "0" to make sure the the returned cookie is positive - 17 bits for the hash table index - 1 bit for the mode "0" - 13 bits for the offset non-unique cookies will be in this format: - 1 bit "0" to make sure the the returned cookie is positive - 17 bits for the hash table index - 1 bit for the mode "1" - 13 more bits of the name hash Another benefit of location based cookies, is that once a directory's exhash table is fully extended (so that multiple hash table indexs do not use the same leaf blocks), gfs2 can skip sorting the directory entries until it reaches the non-unique ones, and then it only needs to sort these. This provides a significant speed up for directory reads of very large directories. The only issue is that for these cookies to continue to point to the correct entry as files are added and removed from the directory, gfs2 must keep the entries at the same offset in the leaf block when they are split (see my previous patch). This means that until all the nodes in a cluster are running with code that will split the directory leaf blocks this way, none of the nodes can use the new cookie code. To deal with this, gfs2 now has the mount option loccookie, which, if set, will make it return these new location based cookies. This option must not be set until all nodes in the cluster are at least running this version of the kernel code, and you have guaranteed that there are no outstanding cookies required by other software, such as NFS. gfs2 uses some of the extra space at the end of the gfs2_dirent structure to store the calculated readdir cookies. This keeps us from needing to allocate a seperate array to hold these values. gfs2 recomputes the cookie stored in de_cookie for every readdir call. The time it takes to do so is small, and if gfs2 expected this value to be saved on disk, the new code wouldn't work correctly on filesystems created with an earlier version of gfs2. One issue with adding de_cookie to the union in the gfs2_dirent structure is that it caused the union to align itself to a 4 byte boundary, instead of its previous 2 byte boundary. This changed the offset of de_rahead. To solve that, I pulled de_rahead out of the union, since it does not need to be there. Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
-rw-r--r--fs/gfs2/dir.c91
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/ops_fstype.c3
-rw-r--r--fs/gfs2/super.c12
-rw-r--r--include/uapi/linux/gfs2_ondisk.h9
5 files changed, 95 insertions, 23 deletions
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 4ee008c6d64b..6a92592304fb 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
82 82
83#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 83#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
84#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 84#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
85#define GFS2_HASH_INDEX_MASK 0xffffc000
86#define GFS2_USE_HASH_FLAG 0x2000
85 87
86struct qstr gfs2_qdot __read_mostly; 88struct qstr gfs2_qdot __read_mostly;
87struct qstr gfs2_qdotdot __read_mostly; 89struct qstr gfs2_qdotdot __read_mostly;
@@ -1223,10 +1225,10 @@ static int compare_dents(const void *a, const void *b)
1223 int ret = 0; 1225 int ret = 0;
1224 1226
1225 dent_a = *(const struct gfs2_dirent **)a; 1227 dent_a = *(const struct gfs2_dirent **)a;
1226 hash_a = be32_to_cpu(dent_a->de_hash); 1228 hash_a = dent_a->de_cookie;
1227 1229
1228 dent_b = *(const struct gfs2_dirent **)b; 1230 dent_b = *(const struct gfs2_dirent **)b;
1229 hash_b = be32_to_cpu(dent_b->de_hash); 1231 hash_b = dent_b->de_cookie;
1230 1232
1231 if (hash_a > hash_b) 1233 if (hash_a > hash_b)
1232 ret = 1; 1234 ret = 1;
@@ -1264,19 +1266,20 @@ static int compare_dents(const void *a, const void *b)
1264 */ 1266 */
1265 1267
1266static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, 1268static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
1267 const struct gfs2_dirent **darr, u32 entries, 1269 struct gfs2_dirent **darr, u32 entries,
1268 int *copied) 1270 u32 sort_start, int *copied)
1269{ 1271{
1270 const struct gfs2_dirent *dent, *dent_next; 1272 const struct gfs2_dirent *dent, *dent_next;
1271 u64 off, off_next; 1273 u64 off, off_next;
1272 unsigned int x, y; 1274 unsigned int x, y;
1273 int run = 0; 1275 int run = 0;
1274 1276
1275 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); 1277 if (sort_start < entries)
1278 sort(&darr[sort_start], entries - sort_start,
1279 sizeof(struct gfs2_dirent *), compare_dents, NULL);
1276 1280
1277 dent_next = darr[0]; 1281 dent_next = darr[0];
1278 off_next = be32_to_cpu(dent_next->de_hash); 1282 off_next = dent_next->de_cookie;
1279 off_next = gfs2_disk_hash2offset(off_next);
1280 1283
1281 for (x = 0, y = 1; x < entries; x++, y++) { 1284 for (x = 0, y = 1; x < entries; x++, y++) {
1282 dent = dent_next; 1285 dent = dent_next;
@@ -1284,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
1284 1287
1285 if (y < entries) { 1288 if (y < entries) {
1286 dent_next = darr[y]; 1289 dent_next = darr[y];
1287 off_next = be32_to_cpu(dent_next->de_hash); 1290 off_next = dent_next->de_cookie;
1288 off_next = gfs2_disk_hash2offset(off_next);
1289 1291
1290 if (off < ctx->pos) 1292 if (off < ctx->pos)
1291 continue; 1293 continue;
@@ -1332,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
1332 return ptr; 1334 return ptr;
1333} 1335}
1334 1336
1337
1338static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
1339 unsigned leaf_nr, struct gfs2_dirent **darr,
1340 unsigned entries)
1341{
1342 int sort_id = -1;
1343 int i;
1344
1345 for (i = 0; i < entries; i++) {
1346 unsigned offset;
1347
1348 darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash);
1349 darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie);
1350
1351 if (!sdp->sd_args.ar_loccookie)
1352 continue;
1353 offset = (char *)(darr[i]) -
1354 (bh->b_data + gfs2_dirent_offset(bh->b_data));
1355 offset /= GFS2_MIN_DIRENT_SIZE;
1356 offset += leaf_nr * sdp->sd_max_dents_per_leaf;
1357 if (offset >= GFS2_USE_HASH_FLAG ||
1358 leaf_nr >= GFS2_USE_HASH_FLAG) {
1359 darr[i]->de_cookie |= GFS2_USE_HASH_FLAG;
1360 if (sort_id < 0)
1361 sort_id = i;
1362 continue;
1363 }
1364 darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK;
1365 darr[i]->de_cookie |= offset;
1366 }
1367 return sort_id;
1368}
1369
1370
1335static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, 1371static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1336 int *copied, unsigned *depth, 1372 int *copied, unsigned *depth,
1337 u64 leaf_no) 1373 u64 leaf_no)
@@ -1341,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1341 struct buffer_head *bh; 1377 struct buffer_head *bh;
1342 struct gfs2_leaf *lf; 1378 struct gfs2_leaf *lf;
1343 unsigned entries = 0, entries2 = 0; 1379 unsigned entries = 0, entries2 = 0;
1344 unsigned leaves = 0; 1380 unsigned leaves = 0, leaf = 0, offset, sort_offset;
1345 const struct gfs2_dirent **darr, *dent; 1381 struct gfs2_dirent **darr, *dent;
1346 struct dirent_gather g; 1382 struct dirent_gather g;
1347 struct buffer_head **larr; 1383 struct buffer_head **larr;
1348 int leaf = 0; 1384 int error, i, need_sort = 0, sort_id;
1349 int error, i;
1350 u64 lfn = leaf_no; 1385 u64 lfn = leaf_no;
1351 1386
1352 do { 1387 do {
@@ -1362,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1362 brelse(bh); 1397 brelse(bh);
1363 } while(lfn); 1398 } while(lfn);
1364 1399
1400 if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
1401 need_sort = 1;
1402 sort_offset = 0;
1403 }
1404
1365 if (!entries) 1405 if (!entries)
1366 return 0; 1406 return 0;
1367 1407
@@ -1375,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1375 larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); 1415 larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
1376 if (!larr) 1416 if (!larr)
1377 goto out; 1417 goto out;
1378 darr = (const struct gfs2_dirent **)(larr + leaves); 1418 darr = (struct gfs2_dirent **)(larr + leaves);
1379 g.pdent = darr; 1419 g.pdent = (const struct gfs2_dirent **)darr;
1380 g.offset = 0; 1420 g.offset = 0;
1381 lfn = leaf_no; 1421 lfn = leaf_no;
1382 1422
@@ -1387,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1387 lf = (struct gfs2_leaf *)bh->b_data; 1427 lf = (struct gfs2_leaf *)bh->b_data;
1388 lfn = be64_to_cpu(lf->lf_next); 1428 lfn = be64_to_cpu(lf->lf_next);
1389 if (lf->lf_entries) { 1429 if (lf->lf_entries) {
1430 offset = g.offset;
1390 entries2 += be16_to_cpu(lf->lf_entries); 1431 entries2 += be16_to_cpu(lf->lf_entries);
1391 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, 1432 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1392 gfs2_dirent_gather, NULL, &g); 1433 gfs2_dirent_gather, NULL, &g);
@@ -1404,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1404 goto out_free; 1445 goto out_free;
1405 } 1446 }
1406 error = 0; 1447 error = 0;
1448 sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
1449 be16_to_cpu(lf->lf_entries));
1450 if (!need_sort && sort_id >= 0) {
1451 need_sort = 1;
1452 sort_offset = offset + sort_id;
1453 }
1407 larr[leaf++] = bh; 1454 larr[leaf++] = bh;
1408 } else { 1455 } else {
1456 larr[leaf++] = NULL;
1409 brelse(bh); 1457 brelse(bh);
1410 } 1458 }
1411 } while(lfn); 1459 } while(lfn);
1412 1460
1413 BUG_ON(entries2 != entries); 1461 BUG_ON(entries2 != entries);
1414 error = do_filldir_main(ip, ctx, darr, entries, copied); 1462 error = do_filldir_main(ip, ctx, darr, entries, need_sort ?
1463 sort_offset : entries, copied);
1415out_free: 1464out_free:
1416 for(i = 0; i < leaf; i++) 1465 for(i = 0; i < leaf; i++)
1417 brelse(larr[i]); 1466 if (larr[i])
1467 brelse(larr[i]);
1418 kvfree(larr); 1468 kvfree(larr);
1419out: 1469out:
1420 return error; 1470 return error;
@@ -1520,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
1520 struct gfs2_inode *dip = GFS2_I(inode); 1570 struct gfs2_inode *dip = GFS2_I(inode);
1521 struct gfs2_sbd *sdp = GFS2_SB(inode); 1571 struct gfs2_sbd *sdp = GFS2_SB(inode);
1522 struct dirent_gather g; 1572 struct dirent_gather g;
1523 const struct gfs2_dirent **darr, *dent; 1573 struct gfs2_dirent **darr, *dent;
1524 struct buffer_head *dibh; 1574 struct buffer_head *dibh;
1525 int copied = 0; 1575 int copied = 0;
1526 int error; 1576 int error;
@@ -1544,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
1544 /* 96 is max number of dirents which can be stuffed into an inode */ 1594 /* 96 is max number of dirents which can be stuffed into an inode */
1545 darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); 1595 darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
1546 if (darr) { 1596 if (darr) {
1547 g.pdent = darr; 1597 g.pdent = (const struct gfs2_dirent **)darr;
1548 g.offset = 0; 1598 g.offset = 0;
1549 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, 1599 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1550 gfs2_dirent_gather, NULL, &g); 1600 gfs2_dirent_gather, NULL, &g);
@@ -1561,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
1561 error = -EIO; 1611 error = -EIO;
1562 goto out; 1612 goto out;
1563 } 1613 }
1614 gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
1564 error = do_filldir_main(dip, ctx, darr, 1615 error = do_filldir_main(dip, ctx, darr,
1565 dip->i_entries, &copied); 1616 dip->i_entries, 0, &copied);
1566out: 1617out:
1567 kfree(darr); 1618 kfree(darr);
1568 } 1619 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 921304e1d785..845fb09cc606 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -562,6 +562,8 @@ struct gfs2_args {
562 unsigned int ar_errors:2; /* errors=withdraw | panic */ 562 unsigned int ar_errors:2; /* errors=withdraw | panic */
563 unsigned int ar_nobarrier:1; /* do not send barriers */ 563 unsigned int ar_nobarrier:1; /* do not send barriers */
564 unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ 564 unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */
565 unsigned int ar_loccookie:1; /* use location based readdir
566 cookies */
565 int ar_commit; /* Commit interval */ 567 int ar_commit; /* Commit interval */
566 int ar_statfs_quantum; /* The fast statfs interval */ 568 int ar_statfs_quantum; /* The fast statfs interval */
567 int ar_quota_quantum; /* The quota interval */ 569 int ar_quota_quantum; /* The quota interval */
@@ -689,6 +691,7 @@ struct gfs2_sbd {
689 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; 691 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
690 u32 sd_max_jheight; /* Max height of journaled file's meta tree */ 692 u32 sd_max_jheight; /* Max height of journaled file's meta tree */
691 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; 693 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
694 u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
692 695
693 struct gfs2_args sd_args; /* Mount arguments */ 696 struct gfs2_args sd_args; /* Mount arguments */
694 struct gfs2_tune sd_tune; /* Filesystem tuning structure */ 697 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1f9de173c4a0..7aacdf2bafd1 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
352 sdp->sd_jheightsize[x] = ~0; 352 sdp->sd_jheightsize[x] = ~0;
353 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); 353 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
354 354
355 sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
356 sizeof(struct gfs2_leaf)) /
357 GFS2_MIN_DIRENT_SIZE;
355 return 0; 358 return 0;
356} 359}
357 360
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 03fa155f703e..0f3d64606e93 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
83 Opt_nobarrier, 83 Opt_nobarrier,
84 Opt_rgrplvb, 84 Opt_rgrplvb,
85 Opt_norgrplvb, 85 Opt_norgrplvb,
86 Opt_loccookie,
87 Opt_noloccookie,
86 Opt_error, 88 Opt_error,
87}; 89};
88 90
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
122 {Opt_nobarrier, "nobarrier"}, 124 {Opt_nobarrier, "nobarrier"},
123 {Opt_rgrplvb, "rgrplvb"}, 125 {Opt_rgrplvb, "rgrplvb"},
124 {Opt_norgrplvb, "norgrplvb"}, 126 {Opt_norgrplvb, "norgrplvb"},
127 {Opt_loccookie, "loccookie"},
128 {Opt_noloccookie, "noloccookie"},
125 {Opt_error, NULL} 129 {Opt_error, NULL}
126}; 130};
127 131
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
278 case Opt_norgrplvb: 282 case Opt_norgrplvb:
279 args->ar_rgrplvb = 0; 283 args->ar_rgrplvb = 0;
280 break; 284 break;
285 case Opt_loccookie:
286 args->ar_loccookie = 1;
287 break;
288 case Opt_noloccookie:
289 args->ar_loccookie = 0;
290 break;
281 case Opt_error: 291 case Opt_error:
282 default: 292 default:
283 pr_warn("invalid mount option: %s\n", o); 293 pr_warn("invalid mount option: %s\n", o);
@@ -1418,6 +1428,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1418 seq_puts(s, ",demote_interface_used"); 1428 seq_puts(s, ",demote_interface_used");
1419 if (args->ar_rgrplvb) 1429 if (args->ar_rgrplvb)
1420 seq_puts(s, ",rgrplvb"); 1430 seq_puts(s, ",rgrplvb");
1431 if (args->ar_loccookie)
1432 seq_puts(s, ",loccookie");
1421 return 0; 1433 return 0;
1422} 1434}
1423 1435
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 1a763eaae0bb..7c4be7711c81 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -297,6 +297,8 @@ struct gfs2_dinode {
297 297
298#define GFS2_FNAMESIZE 255 298#define GFS2_FNAMESIZE 255
299#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7) 299#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
300#define GFS2_MIN_DIRENT_SIZE (GFS2_DIRENT_SIZE(1))
301
300 302
301struct gfs2_dirent { 303struct gfs2_dirent {
302 struct gfs2_inum de_inum; 304 struct gfs2_inum de_inum;
@@ -304,11 +306,12 @@ struct gfs2_dirent {
304 __be16 de_rec_len; 306 __be16 de_rec_len;
305 __be16 de_name_len; 307 __be16 de_name_len;
306 __be16 de_type; 308 __be16 de_type;
309 __be16 de_rahead;
307 union { 310 union {
308 __u8 __pad[14]; 311 __u8 __pad[12];
309 struct { 312 struct {
310 __be16 de_rahead; 313 __u32 de_cookie; /* ondisk value not used */
311 __u8 pad2[12]; 314 __u8 pad3[8];
312 }; 315 };
313 }; 316 };
314}; 317};