aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorMark Fasheh <mfasheh@suse.com>2008-11-12 19:27:44 -0500
committerMark Fasheh <mfasheh@suse.com>2009-04-03 14:39:15 -0400
commit9b7895efac906d66d19856194e1ba61f37e231a4 (patch)
tree1ee6d2630cf3617251638170dcaceef41ddda8ec /fs/ocfs2
parent4a12ca3a00a244e1fd1e673d151ea38b71e11d55 (diff)
ocfs2: Add a name indexed b-tree to directory inodes
This patch makes use of Ocfs2's flexible btree code to add an additional tree to directory inodes. The new tree stores an array of small, fixed-length records in each leaf block. Each record stores a hash value, and pointer to a block in the traditional (unindexed) directory tree where a dirent with the given name hash resides. Lookup exclusively uses this tree to find dirents, thus providing us with constant time name lookups. Some of the hashing code was copied from ext3. Unfortunately, it has lots of unfixed checkpatch errors. I left that as-is so that tracking changes would be easier. Signed-off-by: Mark Fasheh <mfasheh@suse.com> Acked-by: Joel Becker <joel.becker@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/alloc.c57
-rw-r--r--fs/ocfs2/alloc.h3
-rw-r--r--fs/ocfs2/dir.c1908
-rw-r--r--fs/ocfs2/dir.h21
-rw-r--r--fs/ocfs2/inode.c12
-rw-r--r--fs/ocfs2/journal.c30
-rw-r--r--fs/ocfs2/journal.h51
-rw-r--r--fs/ocfs2/namei.c45
-rw-r--r--fs/ocfs2/ocfs2.h16
-rw-r--r--fs/ocfs2/ocfs2_fs.h102
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/ocfs2/xattr.c8
-rw-r--r--fs/ocfs2/xattr.h2
13 files changed, 2157 insertions, 104 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295}; 295};
296 296
297static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 u64 blkno)
299{
300 struct ocfs2_dx_root_block *dx_root = et->et_object;
301
302 dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303}
304
305static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306{
307 struct ocfs2_dx_root_block *dx_root = et->et_object;
308
309 return le64_to_cpu(dx_root->dr_last_eb_blk);
310}
311
312static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 struct ocfs2_extent_tree *et,
314 u32 clusters)
315{
316 struct ocfs2_dx_root_block *dx_root = et->et_object;
317
318 le32_add_cpu(&dx_root->dr_clusters, clusters);
319}
320
321static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 struct ocfs2_extent_tree *et)
323{
324 struct ocfs2_dx_root_block *dx_root = et->et_object;
325
326 BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327
328 return 0;
329}
330
331static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332{
333 struct ocfs2_dx_root_block *dx_root = et->et_object;
334
335 et->et_root_el = &dx_root->dr_list;
336}
337
338static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
340 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
341 .eo_update_clusters = ocfs2_dx_root_update_clusters,
342 .eo_sanity_check = ocfs2_dx_root_sanity_check,
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344};
345
297static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
298 struct inode *inode, 347 struct inode *inode,
299 struct buffer_head *bh, 348 struct buffer_head *bh,
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 &ocfs2_xattr_value_et_ops); 388 &ocfs2_xattr_value_et_ops);
340} 389}
341 390
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode,
393 struct buffer_head *bh)
394{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops);
397}
398
342static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
343 u64 new_last_eb_blk) 400 u64 new_last_eb_blk)
344{ 401{
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 76 struct inode *inode,
77 struct ocfs2_xattr_value_buf *vb); 77 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode,
80 struct buffer_head *bh);
78 81
79/* 82/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be 83 * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 76ffb5c10b3e..0b8c88b47a4e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h>
44 45
45#define MLOG_MASK_PREFIX ML_NAMEI 46#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
71 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 73 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
72}; 74};
73 75
74static int ocfs2_extend_dir(struct ocfs2_super *osb,
75 struct inode *dir,
76 struct buffer_head *parent_fe_bh,
77 unsigned int blocks_wanted,
78 struct buffer_head **new_de_bh);
79static int ocfs2_do_extend_dir(struct super_block *sb, 76static int ocfs2_do_extend_dir(struct super_block *sb,
80 handle_t *handle, 77 handle_t *handle,
81 struct inode *dir, 78 struct inode *dir,
@@ -155,6 +152,105 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
155void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) 152void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
156{ 153{
157 brelse(res->dl_leaf_bh); 154 brelse(res->dl_leaf_bh);
155 brelse(res->dl_dx_leaf_bh);
156}
157
158static int ocfs2_dir_indexed(struct inode *inode)
159{
160 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
161 return 1;
162 return 0;
163}
164
165/*
166 * Hashing code adapted from ext3
167 */
168#define DELTA 0x9E3779B9
169
170static void TEA_transform(__u32 buf[4], __u32 const in[])
171{
172 __u32 sum = 0;
173 __u32 b0 = buf[0], b1 = buf[1];
174 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
175 int n = 16;
176
177 do {
178 sum += DELTA;
179 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
180 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
181 } while (--n);
182
183 buf[0] += b0;
184 buf[1] += b1;
185}
186
187static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
188{
189 __u32 pad, val;
190 int i;
191
192 pad = (__u32)len | ((__u32)len << 8);
193 pad |= pad << 16;
194
195 val = pad;
196 if (len > num*4)
197 len = num * 4;
198 for (i = 0; i < len; i++) {
199 if ((i % 4) == 0)
200 val = pad;
201 val = msg[i] + (val << 8);
202 if ((i % 4) == 3) {
203 *buf++ = val;
204 val = pad;
205 num--;
206 }
207 }
208 if (--num >= 0)
209 *buf++ = val;
210 while (--num >= 0)
211 *buf++ = pad;
212}
213
214static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
215 struct ocfs2_dx_hinfo *hinfo)
216{
217 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
218 const char *p;
219 __u32 in[8], buf[4];
220
221 /*
222 * XXX: Is this really necessary, if the index is never looked
223 * at by readdir? Is a hash value of '0' a bad idea?
224 */
225 if ((len == 1 && !strncmp(".", name, 1)) ||
226 (len == 2 && !strncmp("..", name, 2))) {
227 buf[0] = buf[1] = 0;
228 goto out;
229 }
230
231#ifdef OCFS2_DEBUG_DX_DIRS
232 /*
233 * This makes it very easy to debug indexing problems. We
234 * should never allow this to be selected without hand editing
235 * this file though.
236 */
237 buf[0] = buf[1] = len;
238 goto out;
239#endif
240
241 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
242
243 p = name;
244 while (len > 0) {
245 str2hashbuf(p, len, in, 4);
246 TEA_transform(buf, in);
247 len -= 16;
248 p += 16;
249 }
250
251out:
252 hinfo->major_hash = buf[0];
253 hinfo->minor_hash = buf[1];
158} 254}
159 255
160/* 256/*
@@ -317,6 +413,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
317} 413}
318 414
319/* 415/*
416 * Validate a directory trailer.
417 *
418 * We check the trailer here rather than in ocfs2_validate_dir_block()
419 * because that function doesn't have the inode to test.
420 */
421static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
422{
423 int rc = 0;
424 struct ocfs2_dir_block_trailer *trailer;
425
426 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
427 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
428 rc = -EINVAL;
429 ocfs2_error(dir->i_sb,
430 "Invalid dirblock #%llu: "
431 "signature = %.*s\n",
432 (unsigned long long)bh->b_blocknr, 7,
433 trailer->db_signature);
434 goto out;
435 }
436 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
437 rc = -EINVAL;
438 ocfs2_error(dir->i_sb,
439 "Directory block #%llu has an invalid "
440 "db_blkno of %llu",
441 (unsigned long long)bh->b_blocknr,
442 (unsigned long long)le64_to_cpu(trailer->db_blkno));
443 goto out;
444 }
445 if (le64_to_cpu(trailer->db_parent_dinode) !=
446 OCFS2_I(dir)->ip_blkno) {
447 rc = -EINVAL;
448 ocfs2_error(dir->i_sb,
449 "Directory block #%llu on dinode "
450 "#%llu has an invalid parent_dinode "
451 "of %llu",
452 (unsigned long long)bh->b_blocknr,
453 (unsigned long long)OCFS2_I(dir)->ip_blkno,
454 (unsigned long long)le64_to_cpu(trailer->db_blkno));
455 goto out;
456 }
457out:
458 return rc;
459}
460
461/*
320 * This function forces all errors to -EIO for consistency with its 462 * This function forces all errors to -EIO for consistency with its
321 * predecessor, ocfs2_bread(). We haven't audited what returning the 463 * predecessor, ocfs2_bread(). We haven't audited what returning the
322 * real error codes would do to callers. We log the real codes with 464 * real error codes would do to callers. We log the real codes with
@@ -327,7 +469,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
327{ 469{
328 int rc = 0; 470 int rc = 0;
329 struct buffer_head *tmp = *bh; 471 struct buffer_head *tmp = *bh;
330 struct ocfs2_dir_block_trailer *trailer;
331 472
332 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 473 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
333 ocfs2_validate_dir_block); 474 ocfs2_validate_dir_block);
@@ -336,42 +477,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
336 goto out; 477 goto out;
337 } 478 }
338 479
339 /*
340 * We check the trailer here rather than in
341 * ocfs2_validate_dir_block() because that function doesn't have
342 * the inode to test.
343 */
344 if (!(flags & OCFS2_BH_READAHEAD) && 480 if (!(flags & OCFS2_BH_READAHEAD) &&
345 ocfs2_dir_has_trailer(inode)) { 481 ocfs2_dir_has_trailer(inode)) {
346 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); 482 rc = ocfs2_check_dir_trailer(inode, tmp);
347 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 483 if (rc) {
348 rc = -EINVAL; 484 if (!*bh)
349 ocfs2_error(inode->i_sb, 485 brelse(tmp);
350 "Invalid dirblock #%llu: " 486 mlog_errno(rc);
351 "signature = %.*s\n",
352 (unsigned long long)tmp->b_blocknr, 7,
353 trailer->db_signature);
354 goto out;
355 }
356 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
357 rc = -EINVAL;
358 ocfs2_error(inode->i_sb,
359 "Directory block #%llu has an invalid "
360 "db_blkno of %llu",
361 (unsigned long long)tmp->b_blocknr,
362 (unsigned long long)le64_to_cpu(trailer->db_blkno));
363 goto out;
364 }
365 if (le64_to_cpu(trailer->db_parent_dinode) !=
366 OCFS2_I(inode)->ip_blkno) {
367 rc = -EINVAL;
368 ocfs2_error(inode->i_sb,
369 "Directory block #%llu on dinode "
370 "#%llu has an invalid parent_dinode "
371 "of %llu",
372 (unsigned long long)tmp->b_blocknr,
373 (unsigned long long)OCFS2_I(inode)->ip_blkno,
374 (unsigned long long)le64_to_cpu(trailer->db_blkno));
375 goto out; 487 goto out;
376 } 488 }
377 } 489 }
@@ -384,6 +496,141 @@ out:
384 return rc ? -EIO : 0; 496 return rc ? -EIO : 0;
385} 497}
386 498
499/*
500 * Read the block at 'phys' which belongs to this directory
501 * inode. This function does no virtual->physical block translation -
502 * what's passed in is assumed to be a valid directory block.
503 */
504static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
505 struct buffer_head **bh)
506{
507 int ret;
508 struct buffer_head *tmp = *bh;
509
510 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
511 if (ret) {
512 mlog_errno(ret);
513 goto out;
514 }
515
516 if (ocfs2_supports_dir_trailer(dir)) {
517 ret = ocfs2_check_dir_trailer(dir, tmp);
518 if (ret) {
519 if (!*bh)
520 brelse(tmp);
521 mlog_errno(ret);
522 goto out;
523 }
524 }
525
526 if (!ret && !*bh)
527 *bh = tmp;
528out:
529 return ret;
530}
531
532static int ocfs2_validate_dx_root(struct super_block *sb,
533 struct buffer_head *bh)
534{
535 int ret;
536 struct ocfs2_dx_root_block *dx_root;
537
538 BUG_ON(!buffer_uptodate(bh));
539
540 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
541
542 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
543 if (ret) {
544 mlog(ML_ERROR,
545 "Checksum failed for dir index root block %llu\n",
546 (unsigned long long)bh->b_blocknr);
547 return ret;
548 }
549
550 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
551 ocfs2_error(sb,
552 "Dir Index Root # %llu has bad signature %.*s",
553 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
554 7, dx_root->dr_signature);
555 return -EINVAL;
556 }
557
558 return 0;
559}
560
561static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
562 struct buffer_head **dx_root_bh)
563{
564 int ret;
565 u64 blkno = le64_to_cpu(di->i_dx_root);
566 struct buffer_head *tmp = *dx_root_bh;
567
568 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
569
570 /* If ocfs2_read_block() got us a new bh, pass it up. */
571 if (!ret && !*dx_root_bh)
572 *dx_root_bh = tmp;
573
574 return ret;
575}
576
577static int ocfs2_validate_dx_leaf(struct super_block *sb,
578 struct buffer_head *bh)
579{
580 int ret;
581 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
582
583 BUG_ON(!buffer_uptodate(bh));
584
585 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
586 if (ret) {
587 mlog(ML_ERROR,
588 "Checksum failed for dir index leaf block %llu\n",
589 (unsigned long long)bh->b_blocknr);
590 return ret;
591 }
592
593 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
594 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
595 7, dx_leaf->dl_signature);
596 return -EROFS;
597 }
598
599 return 0;
600}
601
602static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
603 struct buffer_head **dx_leaf_bh)
604{
605 int ret;
606 struct buffer_head *tmp = *dx_leaf_bh;
607
608 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
609
610 /* If ocfs2_read_block() got us a new bh, pass it up. */
611 if (!ret && !*dx_leaf_bh)
612 *dx_leaf_bh = tmp;
613
614 return ret;
615}
616
617/*
618 * Read a series of dx_leaf blocks. This expects all buffer_head
619 * pointers to be NULL on function entry.
620 */
621static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
622 struct buffer_head **dx_leaf_bhs)
623{
624 int ret;
625
626 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
627 ocfs2_validate_dx_leaf);
628 if (ret)
629 mlog_errno(ret);
630
631 return ret;
632}
633
387static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 634static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
388 struct inode *dir, 635 struct inode *dir,
389 struct ocfs2_dir_entry **res_dir) 636 struct ocfs2_dir_entry **res_dir)
@@ -485,6 +732,270 @@ cleanup_and_exit:
485 return ret; 732 return ret;
486} 733}
487 734
735static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
736 struct ocfs2_extent_list *el,
737 u32 major_hash,
738 u32 *ret_cpos,
739 u64 *ret_phys_blkno,
740 unsigned int *ret_clen)
741{
742 int ret = 0, i, found;
743 struct buffer_head *eb_bh = NULL;
744 struct ocfs2_extent_block *eb;
745 struct ocfs2_extent_rec *rec = NULL;
746
747 if (el->l_tree_depth) {
748 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
749 if (ret) {
750 mlog_errno(ret);
751 goto out;
752 }
753
754 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
755 el = &eb->h_list;
756
757 if (el->l_tree_depth) {
758 ocfs2_error(inode->i_sb,
759 "Inode %lu has non zero tree depth in "
760 "btree tree block %llu\n", inode->i_ino,
761 (unsigned long long)eb_bh->b_blocknr);
762 ret = -EROFS;
763 goto out;
764 }
765 }
766
767 found = 0;
768 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
769 rec = &el->l_recs[i];
770
771 if (le32_to_cpu(rec->e_cpos) <= major_hash) {
772 found = 1;
773 break;
774 }
775 }
776
777 if (!found) {
778 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
779 "record (%u, %u, 0) in btree", inode->i_ino,
780 le32_to_cpu(rec->e_cpos),
781 ocfs2_rec_clusters(el, rec));
782 ret = -EROFS;
783 goto out;
784 }
785
786 if (ret_phys_blkno)
787 *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
788 if (ret_cpos)
789 *ret_cpos = le32_to_cpu(rec->e_cpos);
790 if (ret_clen)
791 *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
792
793out:
794 brelse(eb_bh);
795 return ret;
796}
797
798/*
799 * Returns the block index, from the start of the cluster which this
800 * hash belongs too.
801 */
802static unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
803 struct ocfs2_dx_hinfo *hinfo)
804{
805 u32 minor_hash = hinfo->minor_hash;
806 return minor_hash & osb->osb_dx_mask;
807}
808
809static int ocfs2_dx_dir_lookup(struct inode *inode,
810 struct ocfs2_extent_list *el,
811 struct ocfs2_dx_hinfo *hinfo,
812 u32 *ret_cpos,
813 u64 *ret_phys_blkno)
814{
815 int ret = 0;
816 unsigned int cend, uninitialized_var(clen);
817 u32 uninitialized_var(cpos);
818 u64 uninitialized_var(blkno);
819 u32 name_hash = hinfo->major_hash;
820
821 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
822 &clen);
823 if (ret) {
824 mlog_errno(ret);
825 goto out;
826 }
827
828 cend = cpos + clen;
829 if (name_hash >= cend) {
830 /* We want the last cluster */
831 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
832 cpos += clen - 1;
833 } else {
834 blkno += ocfs2_clusters_to_blocks(inode->i_sb,
835 name_hash - cpos);
836 cpos = name_hash;
837 }
838
839 /*
840 * We now have the cluster which should hold our entry. To
841 * find the exact block from the start of the cluster to
842 * search, we take the lower bits of the hash.
843 */
844 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
845
846 if (ret_phys_blkno)
847 *ret_phys_blkno = blkno;
848 if (ret_cpos)
849 *ret_cpos = cpos;
850
851out:
852
853 return ret;
854}
855
856static int ocfs2_dx_dir_search(const char *name, int namelen,
857 struct inode *dir,
858 struct ocfs2_extent_list *dr_el,
859 struct ocfs2_dir_lookup_result *res)
860{
861 int ret, i, found;
862 u64 uninitialized_var(phys);
863 struct buffer_head *dx_leaf_bh = NULL;
864 struct ocfs2_dx_leaf *dx_leaf;
865 struct ocfs2_dx_entry *dx_entry = NULL;
866 struct buffer_head *dir_ent_bh = NULL;
867 struct ocfs2_dir_entry *dir_ent = NULL;
868 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
869
870 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
871
872 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
873 if (ret) {
874 mlog_errno(ret);
875 goto out;
876 }
877
878 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
879 "returns: %llu\n",
880 (unsigned long long)OCFS2_I(dir)->ip_blkno,
881 namelen, name, hinfo->major_hash, hinfo->minor_hash,
882 (unsigned long long)phys);
883
884 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
885 if (ret) {
886 mlog_errno(ret);
887 goto out;
888 }
889
890 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
891
892 mlog(0, "leaf info: num_used: %d, count: %d\n",
893 le16_to_cpu(dx_leaf->dl_list.de_num_used),
894 le16_to_cpu(dx_leaf->dl_list.de_count));
895
896 /*
897 * Empty leaf is legal, so no need to check for that.
898 */
899 found = 0;
900 for (i = 0; i < le16_to_cpu(dx_leaf->dl_list.de_num_used); i++) {
901 dx_entry = &dx_leaf->dl_list.de_entries[i];
902
903 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
904 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
905 continue;
906
907 /*
908 * Search unindexed leaf block now. We're not
909 * guaranteed to find anything.
910 */
911 ret = ocfs2_read_dir_block_direct(dir,
912 le64_to_cpu(dx_entry->dx_dirent_blk),
913 &dir_ent_bh);
914 if (ret) {
915 mlog_errno(ret);
916 goto out;
917 }
918
919 /*
920 * XXX: We should check the unindexed block here,
921 * before using it.
922 */
923
924 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
925 0, dir_ent_bh->b_data,
926 dir->i_sb->s_blocksize, &dir_ent);
927 if (found == 1)
928 break;
929
930 if (found == -1) {
931 /* This means we found a bad directory entry. */
932 ret = -EIO;
933 mlog_errno(ret);
934 goto out;
935 }
936
937 brelse(dir_ent_bh);
938 dir_ent_bh = NULL;
939 }
940
941 if (found <= 0) {
942 ret = -ENOENT;
943 goto out;
944 }
945
946 res->dl_leaf_bh = dir_ent_bh;
947 res->dl_entry = dir_ent;
948 res->dl_dx_leaf_bh = dx_leaf_bh;
949 res->dl_dx_entry = dx_entry;
950
951 ret = 0;
952out:
953 if (ret) {
954 brelse(dx_leaf_bh);
955 brelse(dir_ent_bh);
956 }
957 return ret;
958}
959
960static int ocfs2_find_entry_dx(const char *name, int namelen,
961 struct inode *dir,
962 struct ocfs2_dir_lookup_result *lookup)
963{
964 int ret;
965 struct buffer_head *di_bh = NULL;
966 struct ocfs2_dinode *di;
967 struct buffer_head *dx_root_bh = NULL;
968 struct ocfs2_dx_root_block *dx_root;
969
970 ret = ocfs2_read_inode_block(dir, &di_bh);
971 if (ret) {
972 mlog_errno(ret);
973 goto out;
974 }
975
976 di = (struct ocfs2_dinode *)di_bh->b_data;
977
978 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
979 if (ret) {
980 mlog_errno(ret);
981 goto out;
982 }
983 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
984
985 ret = ocfs2_dx_dir_search(name, namelen, dir, &dx_root->dr_list,
986 lookup);
987 if (ret) {
988 if (ret != -ENOENT)
989 mlog_errno(ret);
990 goto out;
991 }
992
993out:
994 brelse(di_bh);
995 brelse(dx_root_bh);
996 return ret;
997}
998
488/* 999/*
489 * Try to find an entry of the provided name within 'dir'. 1000 * Try to find an entry of the provided name within 'dir'.
490 * 1001 *
@@ -493,10 +1004,11 @@ cleanup_and_exit:
493 * other directory manipulation functions. 1004 * other directory manipulation functions.
494 * 1005 *
495 * Caller can NOT assume anything about the contents of the 1006 * Caller can NOT assume anything about the contents of the
496 * buffer_heads - they are passed back only so that it can be passed into 1007 * buffer_heads - they are passed back only so that it can be passed
497 * any one of the manipulation functions (add entry, delete entry, 1008 * into any one of the manipulation functions (add entry, delete
498 * etc). As an example, bh in the extent directory case is a data 1009 * entry, etc). As an example, bh in the extent directory case is a
499 * block, in the inline-data case it actually points to an inode. 1010 * data block, in the inline-data case it actually points to an inode,
1011 * in the indexed directory case, multiple buffers are involved.
500 */ 1012 */
501int ocfs2_find_entry(const char *name, int namelen, 1013int ocfs2_find_entry(const char *name, int namelen,
502 struct inode *dir, struct ocfs2_dir_lookup_result *lookup) 1014 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
@@ -504,6 +1016,14 @@ int ocfs2_find_entry(const char *name, int namelen,
504 struct buffer_head *bh; 1016 struct buffer_head *bh;
505 struct ocfs2_dir_entry *res_dir = NULL; 1017 struct ocfs2_dir_entry *res_dir = NULL;
506 1018
1019 if (ocfs2_dir_indexed(dir))
1020 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1021
1022 /*
1023 * The unindexed dir code only uses part of the lookup
1024 * structure, so there's no reason to push it down further
1025 * than this.
1026 */
507 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1027 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
508 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); 1028 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
509 else 1029 else
@@ -553,6 +1073,10 @@ out:
553 return ret; 1073 return ret;
554} 1074}
555 1075
1076/*
1077 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1078 * previous entry
1079 */
556static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, 1080static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
557 struct ocfs2_dir_entry *de_del, 1081 struct ocfs2_dir_entry *de_del,
558 struct buffer_head *bh, char *first_de, 1082 struct buffer_head *bh, char *first_de,
@@ -602,6 +1126,79 @@ bail:
602 return status; 1126 return status;
603} 1127}
604 1128
1129static void ocfs2_dx_leaf_remove_entry(struct ocfs2_dx_leaf *dx_leaf, int index)
1130{
1131 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
1132 int num_used = le16_to_cpu(dl_list->de_num_used);
1133
1134 if (num_used == 1 || index == (num_used - 1))
1135 goto clear;
1136
1137 memmove(&dl_list->de_entries[index], &dl_list->de_entries[index + 1],
1138 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1139clear:
1140 num_used--;
1141 memset(&dl_list->de_entries[num_used], 0,
1142 sizeof(struct ocfs2_dx_entry));
1143 dl_list->de_num_used = cpu_to_le16(num_used);
1144}
1145
1146static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1147 struct ocfs2_dir_lookup_result *lookup)
1148{
1149 int ret, index;
1150 struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
1151 struct ocfs2_dx_leaf *dx_leaf;
1152 struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
1153
1154 dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
1155 /* Neither of these are a disk corruption - that should have
1156 * been caught by lookup, before we got here. */
1157 BUG_ON(le16_to_cpu(dx_leaf->dl_list.de_count) <= 0);
1158 BUG_ON(le16_to_cpu(dx_leaf->dl_list.de_num_used) <= 0);
1159
1160 index = (char *)dx_entry - (char *)dx_leaf->dl_list.de_entries;
1161 index /= sizeof(*dx_entry);
1162
1163 if (index >= le16_to_cpu(dx_leaf->dl_list.de_num_used)) {
1164 mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
1165 (unsigned long long)OCFS2_I(dir)->ip_blkno, index, dx_leaf,
1166 dx_entry);
1167 return -EIO;
1168 }
1169
1170 mlog(0, "Dir %llu: delete entry at index: %d\n",
1171 (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
1172
1173 /*
1174 * Add the index leaf into the journal before removing the
1175 * unindexed entry. If we get an error return from
1176 * __ocfs2_delete_entry(), then it hasn't removed the entry
1177 * yet. Likewise, successful return means we *must* remove the
1178 * indexed entry.
1179 */
1180 ret = ocfs2_journal_access_dl(handle, dir, lookup->dl_dx_leaf_bh,
1181 OCFS2_JOURNAL_ACCESS_WRITE);
1182 if (ret) {
1183 mlog_errno(ret);
1184 goto out;
1185 }
1186
1187 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1188 leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
1189 if (ret) {
1190 mlog_errno(ret);
1191 goto out;
1192 }
1193
1194 ocfs2_dx_leaf_remove_entry(dx_leaf, index);
1195
1196 ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
1197
1198out:
1199 return ret;
1200}
1201
605static inline int ocfs2_delete_entry_id(handle_t *handle, 1202static inline int ocfs2_delete_entry_id(handle_t *handle,
606 struct inode *dir, 1203 struct inode *dir,
607 struct ocfs2_dir_entry *de_del, 1204 struct ocfs2_dir_entry *de_del,
@@ -639,13 +1236,16 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
639} 1236}
640 1237
641/* 1238/*
642 * ocfs2_delete_entry deletes a directory entry by merging it with the 1239 * Delete a directory entry. Hide the details of directory
643 * previous entry 1240 * implementation from the caller.
644 */ 1241 */
645int ocfs2_delete_entry(handle_t *handle, 1242int ocfs2_delete_entry(handle_t *handle,
646 struct inode *dir, 1243 struct inode *dir,
647 struct ocfs2_dir_lookup_result *res) 1244 struct ocfs2_dir_lookup_result *res)
648{ 1245{
1246 if (ocfs2_dir_indexed(dir))
1247 return ocfs2_delete_entry_dx(handle, dir, res);
1248
649 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1249 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
650 return ocfs2_delete_entry_id(handle, dir, res->dl_entry, 1250 return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
651 res->dl_leaf_bh); 1251 res->dl_leaf_bh);
@@ -679,6 +1279,58 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
679 return 0; 1279 return 0;
680} 1280}
681 1281
1282static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1283 struct ocfs2_dx_entry *dx_new_entry)
1284{
1285 int i;
1286
1287 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1288 dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1289
1290 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1291}
1292
1293static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1294 struct ocfs2_dx_hinfo *hinfo,
1295 u64 dirent_blk,
1296 struct buffer_head *dx_leaf_bh)
1297{
1298 int ret, i;
1299 struct ocfs2_dx_entry *dx_entry;
1300 struct ocfs2_dx_leaf *dx_leaf;
1301
1302 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
1303 OCFS2_JOURNAL_ACCESS_WRITE);
1304 if (ret) {
1305 mlog_errno(ret);
1306 goto out;
1307 }
1308
1309 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
1310 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1311 dx_entry = &dx_leaf->dl_list.de_entries[i];
1312
1313 memset(dx_entry, 0, sizeof(*dx_entry));
1314 dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
1315 dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
1316 dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
1317
1318 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1319
1320 ocfs2_journal_dirty(handle, dx_leaf_bh);
1321
1322out:
1323 return ret;
1324}
1325
1326static int ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1327 struct ocfs2_dir_lookup_result *lookup)
1328{
1329 return __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
1330 lookup->dl_leaf_bh->b_blocknr,
1331 lookup->dl_dx_leaf_bh);
1332}
1333
682/* we don't always have a dentry for what we want to add, so people 1334/* we don't always have a dentry for what we want to add, so people
683 * like orphan dir can call this instead. 1335 * like orphan dir can call this instead.
684 * 1336 *
@@ -754,10 +1406,21 @@ int __ocfs2_add_entry(handle_t *handle,
754 status = ocfs2_journal_access_di(handle, dir, 1406 status = ocfs2_journal_access_di(handle, dir,
755 insert_bh, 1407 insert_bh,
756 OCFS2_JOURNAL_ACCESS_WRITE); 1408 OCFS2_JOURNAL_ACCESS_WRITE);
757 else 1409 else {
758 status = ocfs2_journal_access_db(handle, dir, 1410 status = ocfs2_journal_access_db(handle, dir,
759 insert_bh, 1411 insert_bh,
760 OCFS2_JOURNAL_ACCESS_WRITE); 1412 OCFS2_JOURNAL_ACCESS_WRITE);
1413 if (ocfs2_dir_indexed(dir)) {
1414 status = ocfs2_dx_dir_leaf_insert(dir,
1415 handle,
1416 lookup);
1417 if (status) {
1418 mlog_errno(status);
1419 goto bail;
1420 }
1421 }
1422 }
1423
761 /* By now the buffer is marked for journaling */ 1424 /* By now the buffer is marked for journaling */
762 offset += le16_to_cpu(de->rec_len); 1425 offset += le16_to_cpu(de->rec_len);
763 if (le64_to_cpu(de->inode)) { 1426 if (le64_to_cpu(de->inode)) {
@@ -887,6 +1550,10 @@ out:
887 return 0; 1550 return 0;
888} 1551}
889 1552
1553/*
1554 * NOTE: This function can be called against unindexed directories,
1555 * and indexed ones.
1556 */
890static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1557static int ocfs2_dir_foreach_blk_el(struct inode *inode,
891 u64 *f_version, 1558 u64 *f_version,
892 loff_t *f_pos, void *priv, 1559 loff_t *f_pos, void *priv,
@@ -1184,6 +1851,8 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1184 * routine to check that the specified directory is empty (for rmdir) 1851 * routine to check that the specified directory is empty (for rmdir)
1185 * 1852 *
1186 * Returns 1 if dir is empty, zero otherwise. 1853 * Returns 1 if dir is empty, zero otherwise.
1854 *
1855 * XXX: This is a performance problem
1187 */ 1856 */
1188int ocfs2_empty_dir(struct inode *inode) 1857int ocfs2_empty_dir(struct inode *inode)
1189{ 1858{
@@ -1285,7 +1954,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1285 struct inode *parent, 1954 struct inode *parent,
1286 struct inode *inode, 1955 struct inode *inode,
1287 struct buffer_head *fe_bh, 1956 struct buffer_head *fe_bh,
1288 struct ocfs2_alloc_context *data_ac) 1957 struct ocfs2_alloc_context *data_ac,
1958 struct buffer_head **ret_new_bh)
1289{ 1959{
1290 int status; 1960 int status;
1291 unsigned int size = osb->sb->s_blocksize; 1961 unsigned int size = osb->sb->s_blocksize;
@@ -1334,6 +2004,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1334 } 2004 }
1335 2005
1336 status = 0; 2006 status = 0;
2007 if (ret_new_bh) {
2008 *ret_new_bh = new_bh;
2009 new_bh = NULL;
2010 }
1337bail: 2011bail:
1338 brelse(new_bh); 2012 brelse(new_bh);
1339 2013
@@ -1341,20 +2015,382 @@ bail:
1341 return status; 2015 return status;
1342} 2016}
1343 2017
2018static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2019 handle_t *handle, struct inode *dir,
2020 struct buffer_head *di_bh,
2021 struct ocfs2_alloc_context *meta_ac,
2022 struct buffer_head **ret_dx_root_bh)
2023{
2024 int ret;
2025 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2026 u16 dr_suballoc_bit;
2027 u64 dr_blkno;
2028 unsigned int num_bits;
2029 struct buffer_head *dx_root_bh = NULL;
2030 struct ocfs2_dx_root_block *dx_root;
2031
2032 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
2033 &num_bits, &dr_blkno);
2034 if (ret) {
2035 mlog_errno(ret);
2036 goto out;
2037 }
2038
2039 mlog(0, "Dir %llu, attach new index block: %llu\n",
2040 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2041 (unsigned long long)dr_blkno);
2042
2043 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2044 if (dx_root_bh == NULL) {
2045 ret = -EIO;
2046 goto out;
2047 }
2048 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
2049
2050 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
2051 OCFS2_JOURNAL_ACCESS_CREATE);
2052 if (ret < 0) {
2053 mlog_errno(ret);
2054 goto out;
2055 }
2056
2057 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2058 memset(dx_root, 0, osb->sb->s_blocksize);
2059 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2060 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
2061 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2062 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2063 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2064 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2065 dx_root->dr_list.l_count =
2066 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2067
2068 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2069 if (ret)
2070 mlog_errno(ret);
2071
2072 ret = ocfs2_journal_access_di(handle, dir, di_bh,
2073 OCFS2_JOURNAL_ACCESS_CREATE);
2074 if (ret) {
2075 mlog_errno(ret);
2076 goto out;
2077 }
2078
2079 di->i_dx_root = cpu_to_le64(dr_blkno);
2080
2081 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2082 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2083
2084 ret = ocfs2_journal_dirty(handle, di_bh);
2085 if (ret)
2086 mlog_errno(ret);
2087
2088 *ret_dx_root_bh = dx_root_bh;
2089 dx_root_bh = NULL;
2090
2091out:
2092 brelse(dx_root_bh);
2093 return ret;
2094}
2095
2096static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2097 handle_t *handle, struct inode *dir,
2098 struct buffer_head **dx_leaves,
2099 int num_dx_leaves, u64 start_blk)
2100{
2101 int ret, i;
2102 struct ocfs2_dx_leaf *dx_leaf;
2103 struct buffer_head *bh;
2104
2105 for (i = 0; i < num_dx_leaves; i++) {
2106 bh = sb_getblk(osb->sb, start_blk + i);
2107 if (bh == NULL) {
2108 ret = -EIO;
2109 goto out;
2110 }
2111 dx_leaves[i] = bh;
2112
2113 ocfs2_set_new_buffer_uptodate(dir, bh);
2114
2115 ret = ocfs2_journal_access_dl(handle, dir, bh,
2116 OCFS2_JOURNAL_ACCESS_CREATE);
2117 if (ret < 0) {
2118 mlog_errno(ret);
2119 goto out;
2120 }
2121
2122 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2123
2124 memset(dx_leaf, 0, osb->sb->s_blocksize);
2125 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2126 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2127 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2128 dx_leaf->dl_list.de_count =
2129 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2130
2131 mlog(0,
2132 "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
2133 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2134 (unsigned long long)bh->b_blocknr,
2135 le16_to_cpu(dx_leaf->dl_list.de_count));
2136
2137 ocfs2_journal_dirty(handle, bh);
2138 }
2139
2140 ret = 0;
2141out:
2142 return ret;
2143}
2144
2145/*
2146 * Allocates and formats a new cluster for use in an indexed dir
2147 * leaf. This version will not do the extent insert, so that it can be
2148 * used by operations which need careful ordering.
2149 */
2150static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2151 u32 cpos, handle_t *handle,
2152 struct ocfs2_alloc_context *data_ac,
2153 struct buffer_head **dx_leaves,
2154 int num_dx_leaves, u64 *ret_phys_blkno)
2155{
2156 int ret;
2157 u32 phys, num;
2158 u64 phys_blkno;
2159 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2160
2161 /*
2162 * XXX: For create, this should claim cluster for the index
2163 * *before* the unindexed insert so that we have a better
2164 * chance of contiguousness as the directory grows in number
2165 * of entries.
2166 */
2167 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
2168 if (ret) {
2169 mlog_errno(ret);
2170 goto out;
2171 }
2172
2173 /*
2174 * Format the new cluster first. That way, we're inserting
2175 * valid data.
2176 */
2177 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2178 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2179 num_dx_leaves, phys_blkno);
2180 if (ret) {
2181 mlog_errno(ret);
2182 goto out;
2183 }
2184
2185 *ret_phys_blkno = phys_blkno;
2186out:
2187 return ret;
2188}
2189
2190static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2191 struct ocfs2_extent_tree *et,
2192 u32 cpos, handle_t *handle,
2193 struct ocfs2_alloc_context *data_ac,
2194 struct ocfs2_alloc_context *meta_ac,
2195 struct buffer_head **dx_leaves,
2196 int num_dx_leaves)
2197{
2198 int ret;
2199 u64 phys_blkno;
2200 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2201
2202 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2203 num_dx_leaves, &phys_blkno);
2204 if (ret) {
2205 mlog_errno(ret);
2206 goto out;
2207 }
2208
2209 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
2210 meta_ac);
2211 if (ret)
2212 mlog_errno(ret);
2213out:
2214 return ret;
2215}
2216
2217static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2218 int *ret_num_leaves)
2219{
2220 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2221 struct buffer_head **dx_leaves;
2222
2223 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2224 GFP_NOFS);
2225 if (dx_leaves && ret_num_leaves)
2226 *ret_num_leaves = num_dx_leaves;
2227
2228 return dx_leaves;
2229}
2230
2231static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2232 handle_t *handle,
2233 struct inode *parent,
2234 struct inode *inode,
2235 struct buffer_head *di_bh,
2236 struct ocfs2_alloc_context *data_ac,
2237 struct ocfs2_alloc_context *meta_ac)
2238{
2239 int ret, num_dx_leaves, i;
2240 struct buffer_head *leaf_bh = NULL;
2241 struct buffer_head *dx_root_bh = NULL;
2242 struct buffer_head **dx_leaves = NULL;
2243 struct ocfs2_extent_tree et;
2244 struct ocfs2_dx_hinfo hinfo;
2245 u64 insert_blkno;
2246
2247 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
2248 if (!dx_leaves) {
2249 ret = -ENOMEM;
2250 mlog_errno(ret);
2251 goto out;
2252 }
2253
2254 /*
2255 * Our strategy is to create the directory as though it were
2256 * unindexed, then add the index block. This works with very
2257 * little complication since the state of a new directory is a
2258 * very well known quantity.
2259 *
2260 * Essentially, we have two dirents ("." and ".."), in the 1st
2261 * block which need indexing.
2262 */
2263
2264 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2265 data_ac, &leaf_bh);
2266 if (ret) {
2267 mlog_errno(ret);
2268 goto out;
2269 }
2270
2271 /*
2272 * Allocate and format the index leaf first, before attaching
2273 * the index root. That way we're sure that the main bitmap
2274 * won't -enospc on us with a half-created dir index.
2275 *
2276 * The meta data allocation for our index block will not
2277 * -enospc on us unless there is a disk corruption.
2278 */
2279
2280 ret = __ocfs2_dx_dir_new_cluster(inode, 0, handle, data_ac, dx_leaves,
2281 num_dx_leaves, &insert_blkno);
2282 if (ret) {
2283 mlog_errno(ret);
2284 goto out;
2285 }
2286
2287 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2288 i = ocfs2_dx_dir_hash_idx(osb, &hinfo);
2289 ret = __ocfs2_dx_dir_leaf_insert(inode, handle, &hinfo,
2290 leaf_bh->b_blocknr, dx_leaves[i]);
2291 if (ret) {
2292 mlog_errno(ret);
2293 goto out;
2294 }
2295
2296 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2297 i = ocfs2_dx_dir_hash_idx(osb, &hinfo);
2298 ret = __ocfs2_dx_dir_leaf_insert(inode, handle, &hinfo,
2299 leaf_bh->b_blocknr, dx_leaves[i]);
2300 if (ret) {
2301 mlog_errno(ret);
2302 goto out;
2303 }
2304
2305 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, meta_ac,
2306 &dx_root_bh);
2307 if (ret) {
2308 mlog_errno(ret);
2309 goto out;
2310 }
2311
2312 /* This should never fail considering we start with an empty
2313 * dx_root. */
2314 ocfs2_init_dx_root_extent_tree(&et, inode, dx_root_bh);
2315 ret = ocfs2_insert_extent(osb, handle, inode, &et, 0,
2316 insert_blkno, 1, 0, NULL);
2317 if (ret)
2318 mlog_errno(ret);
2319
2320out:
2321 if (dx_leaves) {
2322 for (i = 0; i < num_dx_leaves; i++)
2323 brelse(dx_leaves[i]);
2324 kfree(dx_leaves);
2325 }
2326 brelse(dx_root_bh);
2327 brelse(leaf_bh);
2328 return ret;
2329}
2330
1344int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2331int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1345 handle_t *handle, 2332 handle_t *handle,
1346 struct inode *parent, 2333 struct inode *parent,
1347 struct inode *inode, 2334 struct inode *inode,
1348 struct buffer_head *fe_bh, 2335 struct buffer_head *fe_bh,
1349 struct ocfs2_alloc_context *data_ac) 2336 struct ocfs2_alloc_context *data_ac,
2337 struct ocfs2_alloc_context *meta_ac)
2338
1350{ 2339{
1351 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2340 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1352 2341
1353 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2342 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1354 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2343 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1355 2344
2345 if (ocfs2_supports_indexed_dirs(osb))
2346 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2347 data_ac, meta_ac);
2348
1356 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2349 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1357 data_ac); 2350 data_ac, NULL);
2351}
2352
2353static int ocfs2_dx_dir_index_block(struct inode *dir,
2354 handle_t *handle,
2355 struct buffer_head **dx_leaves,
2356 int num_dx_leaves,
2357 struct buffer_head *dirent_bh)
2358{
2359 int ret, namelen, i;
2360 char *de_buf, *limit;
2361 struct ocfs2_dir_entry *de;
2362 struct buffer_head *dx_leaf_bh;
2363 struct ocfs2_dx_hinfo hinfo;
2364 u64 dirent_blk = dirent_bh->b_blocknr;
2365
2366 de_buf = dirent_bh->b_data;
2367 limit = de_buf + dir->i_sb->s_blocksize;
2368
2369 while (de_buf < limit) {
2370 de = (struct ocfs2_dir_entry *)de_buf;
2371
2372 namelen = de->name_len;
2373 if (!namelen || !de->inode)
2374 goto inc;
2375
2376 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2377
2378 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2379 dx_leaf_bh = dx_leaves[i];
2380
2381 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2382 dirent_blk, dx_leaf_bh);
2383 if (ret) {
2384 mlog_errno(ret);
2385 goto out;
2386 }
2387
2388inc:
2389 de_buf += le16_to_cpu(de->rec_len);
2390 }
2391
2392out:
2393 return ret;
1358} 2394}
1359 2395
1360/* 2396/*
@@ -1401,29 +2437,57 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1401 */ 2437 */
1402static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2438static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1403 unsigned int blocks_wanted, 2439 unsigned int blocks_wanted,
2440 struct ocfs2_dir_lookup_result *lookup,
1404 struct buffer_head **first_block_bh) 2441 struct buffer_head **first_block_bh)
1405{ 2442{
1406 u32 alloc, bit_off, len; 2443 u32 alloc, dx_alloc, bit_off, len;
1407 struct super_block *sb = dir->i_sb; 2444 struct super_block *sb = dir->i_sb;
1408 int ret, credits = ocfs2_inline_to_extents_credits(sb); 2445 int ret, i, num_dx_leaves = 0,
1409 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 2446 credits = ocfs2_inline_to_extents_credits(sb);
2447 u64 dx_insert_blkno, blkno,
2448 bytes = blocks_wanted << sb->s_blocksize_bits;
1410 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2449 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1411 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2450 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1412 struct ocfs2_alloc_context *data_ac; 2451 struct ocfs2_alloc_context *data_ac;
2452 struct ocfs2_alloc_context *meta_ac = NULL;
1413 struct buffer_head *dirdata_bh = NULL; 2453 struct buffer_head *dirdata_bh = NULL;
2454 struct buffer_head *dx_root_bh = NULL;
2455 struct buffer_head **dx_leaves = NULL;
1414 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2456 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1415 handle_t *handle; 2457 handle_t *handle;
1416 struct ocfs2_extent_tree et; 2458 struct ocfs2_extent_tree et;
1417 int did_quota = 0; 2459 struct ocfs2_extent_tree dx_et;
2460 int did_quota = 0, bytes_allocated = 0;
1418 2461
1419 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2462 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1420 2463
1421 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2464 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2465 dx_alloc = 0;
2466
2467 if (ocfs2_supports_indexed_dirs(osb)) {
2468 /* Add one more cluster for an index leaf */
2469 dx_alloc++;
2470 credits += ocfs2_add_dir_index_credits(sb);
2471
2472 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, &num_dx_leaves);
2473 if (!dx_leaves) {
2474 ret = -ENOMEM;
2475 mlog_errno(ret);
2476 goto out;
2477 }
2478
2479 /* This gets us the dx_root */
2480 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485 }
1422 2486
1423 /* 2487 /*
1424 * We should never need more than 2 clusters for this - 2488 * We should never need more than 2 clusters for the unindexed
1425 * maximum dirent size is far less than one block. In fact, 2489 * tree - maximum dirent size is far less than one block. In
1426 * the only time we'd need more than one cluster is if 2490 * fact, the only time we'd need more than one cluster is if
1427 * blocksize == clustersize and the dirent won't fit in the 2491 * blocksize == clustersize and the dirent won't fit in the
1428 * extra space that the expansion to a single block gives. As 2492 * extra space that the expansion to a single block gives. As
1429 * of today, that only happens on 4k/4k file systems. 2493 * of today, that only happens on 4k/4k file systems.
@@ -1440,7 +2504,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1440 2504
1441 /* 2505 /*
1442 * Prepare for worst case allocation scenario of two separate 2506 * Prepare for worst case allocation scenario of two separate
1443 * extents. 2507 * extents in the unindexed tree.
1444 */ 2508 */
1445 if (alloc == 2) 2509 if (alloc == 2)
1446 credits += OCFS2_SUBALLOC_ALLOC; 2510 credits += OCFS2_SUBALLOC_ALLOC;
@@ -1453,11 +2517,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1453 } 2517 }
1454 2518
1455 if (vfs_dq_alloc_space_nodirty(dir, 2519 if (vfs_dq_alloc_space_nodirty(dir,
1456 ocfs2_clusters_to_bytes(osb->sb, alloc))) { 2520 ocfs2_clusters_to_bytes(osb->sb,
2521 alloc + dx_alloc))) {
1457 ret = -EDQUOT; 2522 ret = -EDQUOT;
1458 goto out_commit; 2523 goto out_commit;
1459 } 2524 }
1460 did_quota = 1; 2525 did_quota = 1;
2526
2527 if (ocfs2_supports_indexed_dirs(osb)) {
2528 /*
2529 * Allocate our index cluster first, to maximize the
2530 * possibility that unindexed leaves grow
2531 * contiguously.
2532 */
2533 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2534 dx_leaves, num_dx_leaves,
2535 &dx_insert_blkno);
2536 if (ret) {
2537 mlog_errno(ret);
2538 goto out_commit;
2539 }
2540 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2541 }
2542
1461 /* 2543 /*
1462 * Try to claim as many clusters as the bitmap can give though 2544 * Try to claim as many clusters as the bitmap can give though
1463 * if we only get one now, that's enough to continue. The rest 2545 * if we only get one now, that's enough to continue. The rest
@@ -1468,6 +2550,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1468 mlog_errno(ret); 2550 mlog_errno(ret);
1469 goto out_commit; 2551 goto out_commit;
1470 } 2552 }
2553 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1471 2554
1472 /* 2555 /*
1473 * Operations are carefully ordered so that we set up the new 2556 * Operations are carefully ordered so that we set up the new
@@ -1504,6 +2587,15 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1504 goto out_commit; 2587 goto out_commit;
1505 } 2588 }
1506 2589
2590 if (ocfs2_supports_indexed_dirs(osb)) {
2591 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
2592 num_dx_leaves, dirdata_bh);
2593 if (ret) {
2594 mlog_errno(ret);
2595 goto out_commit;
2596 }
2597 }
2598
1507 /* 2599 /*
1508 * Set extent, i_size, etc on the directory. After this, the 2600 * Set extent, i_size, etc on the directory. After this, the
1509 * inode should contain the same exact dirents as before and 2601 * inode should contain the same exact dirents as before and
@@ -1556,6 +2648,21 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1556 goto out_commit; 2648 goto out_commit;
1557 } 2649 }
1558 2650
2651 if (ocfs2_supports_indexed_dirs(osb)) {
2652 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
2653 meta_ac, &dx_root_bh);
2654 if (ret) {
2655 mlog_errno(ret);
2656 goto out_commit;
2657 }
2658
2659 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
2660 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
2661 dx_insert_blkno, 1, 0, NULL);
2662 if (ret)
2663 mlog_errno(ret);
2664 }
2665
1559 /* 2666 /*
1560 * We asked for two clusters, but only got one in the 1st 2667 * We asked for two clusters, but only got one in the 1st
1561 * pass. Claim the 2nd cluster as a separate extent. 2668 * pass. Claim the 2nd cluster as a separate extent.
@@ -1575,15 +2682,28 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1575 mlog_errno(ret); 2682 mlog_errno(ret);
1576 goto out_commit; 2683 goto out_commit;
1577 } 2684 }
2685 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1578 } 2686 }
1579 2687
1580 *first_block_bh = dirdata_bh; 2688 *first_block_bh = dirdata_bh;
1581 dirdata_bh = NULL; 2689 dirdata_bh = NULL;
2690 if (ocfs2_supports_indexed_dirs(osb)) {
2691 unsigned int off;
2692
2693 /*
2694 * We need to return the correct block within the
2695 * cluster which should hold our entry.
2696 */
2697 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
2698 &lookup->dl_hinfo);
2699 get_bh(dx_leaves[off]);
2700 lookup->dl_dx_leaf_bh = dx_leaves[off];
2701 }
1582 2702
1583out_commit: 2703out_commit:
1584 if (ret < 0 && did_quota) 2704 if (ret < 0 && did_quota)
1585 vfs_dq_free_space_nodirty(dir, 2705 vfs_dq_free_space_nodirty(dir, bytes_allocated);
1586 ocfs2_clusters_to_bytes(osb->sb, 2)); 2706
1587 ocfs2_commit_trans(osb, handle); 2707 ocfs2_commit_trans(osb, handle);
1588 2708
1589out_sem: 2709out_sem:
@@ -1592,8 +2712,17 @@ out_sem:
1592out: 2712out:
1593 if (data_ac) 2713 if (data_ac)
1594 ocfs2_free_alloc_context(data_ac); 2714 ocfs2_free_alloc_context(data_ac);
2715 if (meta_ac)
2716 ocfs2_free_alloc_context(meta_ac);
2717
2718 if (dx_leaves) {
2719 for (i = 0; i < num_dx_leaves; i++)
2720 brelse(dx_leaves[i]);
2721 kfree(dx_leaves);
2722 }
1595 2723
1596 brelse(dirdata_bh); 2724 brelse(dirdata_bh);
2725 brelse(dx_root_bh);
1597 2726
1598 return ret; 2727 return ret;
1599} 2728}
@@ -1668,6 +2797,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1668 struct inode *dir, 2797 struct inode *dir,
1669 struct buffer_head *parent_fe_bh, 2798 struct buffer_head *parent_fe_bh,
1670 unsigned int blocks_wanted, 2799 unsigned int blocks_wanted,
2800 struct ocfs2_dir_lookup_result *lookup,
1671 struct buffer_head **new_de_bh) 2801 struct buffer_head **new_de_bh)
1672{ 2802{
1673 int status = 0; 2803 int status = 0;
@@ -1687,7 +2817,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1687 2817
1688 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2818 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1689 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 2819 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1690 blocks_wanted, &new_bh); 2820 blocks_wanted, lookup,
2821 &new_bh);
1691 if (status) { 2822 if (status) {
1692 mlog_errno(status); 2823 mlog_errno(status);
1693 goto bail; 2824 goto bail;
@@ -1975,6 +3106,487 @@ bail:
1975 return status; 3106 return status;
1976} 3107}
1977 3108
3109static int dx_leaf_sort_cmp(const void *a, const void *b)
3110{
3111 const struct ocfs2_dx_entry *entry1 = a;
3112 const struct ocfs2_dx_entry *entry2 = b;
3113 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3114 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3115 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3116 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3117
3118 if (major_hash1 > major_hash2)
3119 return 1;
3120 if (major_hash1 < major_hash2)
3121 return -1;
3122
3123 /*
3124 * It is not strictly necessary to sort by minor
3125 */
3126 if (minor_hash1 > minor_hash2)
3127 return 1;
3128 if (minor_hash1 < minor_hash2)
3129 return -1;
3130 return 0;
3131}
3132
3133static void dx_leaf_sort_swap(void *a, void *b, int size)
3134{
3135 struct ocfs2_dx_entry *entry1 = a;
3136 struct ocfs2_dx_entry *entry2 = b;
3137 struct ocfs2_dx_entry tmp;
3138
3139 BUG_ON(size != sizeof(*entry1));
3140
3141 tmp = *entry1;
3142 *entry1 = *entry2;
3143 *entry2 = tmp;
3144}
3145
3146static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3147{
3148 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3149 int i, num = le16_to_cpu(dl_list->de_num_used);
3150
3151 for (i = 0; i < (num - 1); i++) {
3152 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3153 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3154 return 0;
3155 }
3156
3157 return 1;
3158}
3159
3160/*
3161 * Find the optimal value to split this leaf on. This expects the leaf
3162 * entries to be in sorted order.
3163 *
3164 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3165 * the hash we want to insert.
3166 *
3167 * This function is only concerned with the major hash - that which
3168 * determines which cluster an item belongs to.
3169 */
3170static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3171 u32 leaf_cpos, u32 insert_hash,
3172 u32 *split_hash)
3173{
3174 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3175 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3176 int allsame;
3177
3178 /*
3179 * There's a couple rare, but nasty corner cases we have to
3180 * check for here. All of them involve a leaf where all value
3181 * have the same hash, which is what we look for first.
3182 *
3183 * Most of the time, all of the above is false, and we simply
3184 * pick the median value for a split.
3185 */
3186 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3187 if (allsame) {
3188 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3189
3190 if (val == insert_hash) {
3191 /*
3192 * No matter where we would choose to split,
3193 * the new entry would want to occupy the same
3194 * block as these. Since there's no space left
3195 * in their existing block, we know there
3196 * won't be space after the split.
3197 */
3198 return -ENOSPC;
3199 }
3200
3201 if (val == leaf_cpos) {
3202 /*
3203 * Because val is the same as leaf_cpos (which
3204 * is the smallest value this leaf can have),
3205 * yet is not equal to insert_hash, then we
3206 * know that insert_hash *must* be larger than
3207 * val (and leaf_cpos). At least cpos+1 in value.
3208 *
3209 * We also know then, that there cannot be an
3210 * adjacent extent (otherwise we'd be looking
3211 * at it). Choosing this value gives us a
3212 * chance to get some contiguousness.
3213 */
3214 *split_hash = leaf_cpos + 1;
3215 return 0;
3216 }
3217
3218 if (val > insert_hash) {
3219 /*
3220 * val can not be the same as insert hash, and
3221 * also must be larger than leaf_cpos. Also,
3222 * we know that there can't be a leaf between
3223 * cpos and val, otherwise the entries with
3224 * hash 'val' would be there.
3225 */
3226 *split_hash = val;
3227 return 0;
3228 }
3229
3230 *split_hash = insert_hash;
3231 return 0;
3232 }
3233
3234 /*
3235 * Since the records are sorted and the checks above
3236 * guaranteed that not all records in this block are the same,
3237 * we simple travel forward, from the median, and pick the 1st
3238 * record whose value is larger than leaf_cpos.
3239 */
3240 for (i = (num_used / 2); i < num_used; i++)
3241 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3242 leaf_cpos)
3243 break;
3244
3245 BUG_ON(i == num_used); /* Should be impossible */
3246 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3247 return 0;
3248}
3249
3250/*
3251 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3252 * larger than split_hash into new_dx_leaves. We use a temporary
3253 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3254 *
3255 * Since the block offset inside a leaf (cluster) is a constant mask
3256 * of minor_hash, we can optimize - an item at block offset X within
3257 * the original cluster, will be at offset X within the new cluster.
3258 */
3259static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3260 handle_t *handle,
3261 struct ocfs2_dx_leaf *tmp_dx_leaf,
3262 struct buffer_head **orig_dx_leaves,
3263 struct buffer_head **new_dx_leaves,
3264 int num_dx_leaves)
3265{
3266 int i, j, num_used;
3267 u32 major_hash;
3268 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3269 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3270 struct ocfs2_dx_entry *dx_entry;
3271
3272 tmp_list = &tmp_dx_leaf->dl_list;
3273
3274 for (i = 0; i < num_dx_leaves; i++) {
3275 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3276 orig_list = &orig_dx_leaf->dl_list;
3277 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3278 new_list = &new_dx_leaf->dl_list;
3279
3280 num_used = le16_to_cpu(orig_list->de_num_used);
3281
3282 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3283 tmp_list->de_num_used = cpu_to_le16(0);
3284 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3285
3286 for (j = 0; j < num_used; j++) {
3287 dx_entry = &orig_list->de_entries[j];
3288 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3289 if (major_hash >= split_hash)
3290 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3291 dx_entry);
3292 else
3293 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3294 dx_entry);
3295 }
3296 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3297
3298 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3299 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3300 }
3301}
3302
3303static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3304 struct ocfs2_dx_root_block *dx_root)
3305{
3306 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3307
3308 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3309 credits += ocfs2_quota_trans_credits(osb->sb);
3310 return credits;
3311}
3312
3313/*
3314 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3315 * half our entries into.
3316 */
3317static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3318 struct buffer_head *dx_root_bh,
3319 struct buffer_head *dx_leaf_bh,
3320 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3321 u64 leaf_blkno)
3322{
3323 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3324 int credits, ret, i, num_used, did_quota = 0;
3325 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3326 u64 orig_leaves_start;
3327 int num_dx_leaves;
3328 struct buffer_head **orig_dx_leaves = NULL;
3329 struct buffer_head **new_dx_leaves = NULL;
3330 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3331 struct ocfs2_extent_tree et;
3332 handle_t *handle = NULL;
3333 struct ocfs2_dx_root_block *dx_root;
3334 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3335
3336 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3337 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3338 (unsigned long long)leaf_blkno, insert_hash);
3339
3340 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3341
3342 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3343 /*
3344 * XXX: This is a rather large limit. We should use a more
3345 * realistic value.
3346 */
3347 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3348 return -ENOSPC;
3349
3350 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3351 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3352 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
3353 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3354 (unsigned long long)leaf_blkno, num_used);
3355 ret = -EIO;
3356 goto out;
3357 }
3358
3359 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3360 if (!orig_dx_leaves) {
3361 ret = -ENOMEM;
3362 mlog_errno(ret);
3363 goto out;
3364 }
3365
3366 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3367 if (!new_dx_leaves) {
3368 ret = -ENOMEM;
3369 mlog_errno(ret);
3370 goto out;
3371 }
3372
3373 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3374 if (ret) {
3375 if (ret != -ENOSPC)
3376 mlog_errno(ret);
3377 goto out;
3378 }
3379
3380 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3381 handle = ocfs2_start_trans(osb, credits);
3382 if (IS_ERR(handle)) {
3383 ret = PTR_ERR(handle);
3384 handle = NULL;
3385 mlog_errno(ret);
3386 goto out;
3387 }
3388
3389 if (vfs_dq_alloc_space_nodirty(dir,
3390 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3391 ret = -EDQUOT;
3392 goto out_commit;
3393 }
3394 did_quota = 1;
3395
3396 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3397 OCFS2_JOURNAL_ACCESS_WRITE);
3398 if (ret) {
3399 mlog_errno(ret);
3400 goto out_commit;
3401 }
3402
3403 /*
3404 * This block is changing anyway, so we can sort it in place.
3405 */
3406 sort(dx_leaf->dl_list.de_entries, num_used,
3407 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3408 dx_leaf_sort_swap);
3409
3410 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3411 if (ret) {
3412 mlog_errno(ret);
3413 goto out_commit;
3414 }
3415
3416 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3417 &split_hash);
3418 if (ret) {
3419 mlog_errno(ret);
3420 goto out_commit;
3421 }
3422
3423 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3424 leaf_cpos, split_hash, insert_hash);
3425
3426 /*
3427 * We have to carefully order operations here. There are items
3428 * which want to be in the new cluster before insert, but in
3429 * order to put those items in the new cluster, we alter the
3430 * old cluster. A failure to insert gets nasty.
3431 *
3432 * So, start by reserving writes to the old
3433 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3434 * the new cluster for us, before inserting it. The insert
3435 * won't happen if there's an error before that. Once the
3436 * insert is done then, we can transfer from one leaf into the
3437 * other without fear of hitting any error.
3438 */
3439
3440 /*
3441 * The leaf transfer wants some scratch space so that we don't
3442 * wind up doing a bunch of expensive memmove().
3443 */
3444 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3445 if (!tmp_dx_leaf) {
3446 ret = -ENOMEM;
3447 mlog_errno(ret);
3448 goto out_commit;
3449 }
3450
3451 orig_leaves_start = leaf_blkno & ~(osb->s_clustersize_bits -
3452 osb->sb->s_blocksize_bits);
3453 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3454 orig_dx_leaves);
3455 if (ret) {
3456 mlog_errno(ret);
3457 goto out_commit;
3458 }
3459
3460 for (i = 0; i < num_dx_leaves; i++) {
3461 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3462 OCFS2_JOURNAL_ACCESS_WRITE);
3463 if (ret) {
3464 mlog_errno(ret);
3465 goto out_commit;
3466 }
3467 }
3468
3469 cpos = split_hash;
3470 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3471 data_ac, meta_ac, new_dx_leaves,
3472 num_dx_leaves);
3473 if (ret) {
3474 mlog_errno(ret);
3475 goto out_commit;
3476 }
3477
3478 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3479 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3480
3481out_commit:
3482 if (ret < 0 && did_quota)
3483 vfs_dq_free_space_nodirty(dir,
3484 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3485
3486 ocfs2_commit_trans(osb, handle);
3487
3488out:
3489 if (orig_dx_leaves || new_dx_leaves) {
3490 for (i = 0; i < num_dx_leaves; i++) {
3491 if (orig_dx_leaves)
3492 brelse(orig_dx_leaves[i]);
3493 if (new_dx_leaves)
3494 brelse(new_dx_leaves[i]);
3495 }
3496 kfree(orig_dx_leaves);
3497 kfree(new_dx_leaves);
3498 }
3499
3500 if (meta_ac)
3501 ocfs2_free_alloc_context(meta_ac);
3502 if (data_ac)
3503 ocfs2_free_alloc_context(data_ac);
3504
3505 kfree(tmp_dx_leaf);
3506 return ret;
3507}
3508
3509static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
3510 struct buffer_head *di_bh, const char *name,
3511 int namelen,
3512 struct ocfs2_dir_lookup_result *lookup)
3513{
3514 int ret, rebalanced = 0;
3515 struct buffer_head *dx_root_bh = NULL;
3516 struct ocfs2_dx_root_block *dx_root;
3517 struct buffer_head *dx_leaf_bh = NULL;
3518 struct ocfs2_dx_leaf *dx_leaf;
3519 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3520 u64 blkno;
3521 u32 leaf_cpos;
3522
3523 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
3524 if (ret) {
3525 mlog_errno(ret);
3526 goto out;
3527 }
3528
3529 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3530
3531restart_search:
3532 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
3533 &leaf_cpos, &blkno);
3534 if (ret) {
3535 mlog_errno(ret);
3536 goto out;
3537 }
3538
3539 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
3540 if (ret) {
3541 mlog_errno(ret);
3542 goto out;
3543 }
3544
3545 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3546
3547 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
3548 le16_to_cpu(dx_leaf->dl_list.de_count)) {
3549 if (rebalanced) {
3550 /*
3551 * Rebalancing should have provided us with
3552 * space in an appropriate leaf.
3553 *
3554 * XXX: Is this an abnormal condition then?
3555 * Should we print a message here?
3556 */
3557 ret = -ENOSPC;
3558 goto out;
3559 }
3560
3561 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
3562 &lookup->dl_hinfo, leaf_cpos,
3563 blkno);
3564 if (ret) {
3565 if (ret != -ENOSPC)
3566 mlog_errno(ret);
3567 goto out;
3568 }
3569
3570 /*
3571 * Restart the lookup. The rebalance might have
3572 * changed which block our item fits into. Mark our
3573 * progress, so we only execute this once.
3574 */
3575 brelse(dx_leaf_bh);
3576 dx_leaf_bh = NULL;
3577 rebalanced = 1;
3578 goto restart_search;
3579 }
3580
3581 lookup->dl_dx_leaf_bh = dx_leaf_bh;
3582 dx_leaf_bh = NULL;
3583
3584out:
3585 brelse(dx_leaf_bh);
3586 brelse(dx_root_bh);
3587 return ret;
3588}
3589
1978/* 3590/*
1979 * Get a directory ready for insert. Any directory allocation required 3591 * Get a directory ready for insert. Any directory allocation required
1980 * happens here. Success returns zero, and enough context in the dir 3592 * happens here. Success returns zero, and enough context in the dir
@@ -2001,6 +3613,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2001 goto out; 3613 goto out;
2002 } 3614 }
2003 3615
3616 /*
3617 * Do this up front to reduce confusion.
3618 *
3619 * The directory might start inline, then be turned into an
3620 * indexed one, in which case we'd need to hash deep inside
3621 * ocfs2_find_dir_space_id(). Since
3622 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
3623 * done, there seems no point in spreading out the calls. We
3624 * can optimize away the case where the file system doesn't
3625 * support indexing.
3626 */
3627 if (ocfs2_supports_indexed_dirs(osb))
3628 ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
3629
3630 if (ocfs2_dir_indexed(dir)) {
3631 ret = ocfs2_find_dir_space_dx(osb, dir, parent_fe_bh, name,
3632 namelen, lookup);
3633 if (ret) {
3634 mlog_errno(ret);
3635 goto out;
3636 }
3637
3638 /*
3639 * We intentionally fall through so that the unindexed
3640 * tree can also be prepared.
3641 */
3642 }
3643
2004 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3644 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2005 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, 3645 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
2006 namelen, &bh, &blocks_wanted); 3646 namelen, &bh, &blocks_wanted);
@@ -2019,7 +3659,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2019 BUG_ON(bh); 3659 BUG_ON(bh);
2020 3660
2021 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, 3661 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
2022 &bh); 3662 lookup, &bh);
2023 if (ret) { 3663 if (ret) {
2024 if (ret != -ENOSPC) 3664 if (ret != -ENOSPC)
2025 mlog_errno(ret); 3665 mlog_errno(ret);
@@ -2035,3 +3675,145 @@ out:
2035 brelse(bh); 3675 brelse(bh);
2036 return ret; 3676 return ret;
2037} 3677}
3678
3679static int ocfs2_dx_dir_remove_index(struct inode *dir,
3680 struct buffer_head *di_bh,
3681 struct buffer_head *dx_root_bh)
3682{
3683 int ret;
3684 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
3685 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3686 struct ocfs2_dx_root_block *dx_root;
3687 struct inode *dx_alloc_inode = NULL;
3688 struct buffer_head *dx_alloc_bh = NULL;
3689 handle_t *handle;
3690 u64 blk;
3691 u16 bit;
3692 u64 bg_blkno;
3693
3694 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
3695
3696 dx_alloc_inode = ocfs2_get_system_file_inode(osb,
3697 EXTENT_ALLOC_SYSTEM_INODE,
3698 le16_to_cpu(dx_root->dr_suballoc_slot));
3699 if (!dx_alloc_inode) {
3700 ret = -ENOMEM;
3701 mlog_errno(ret);
3702 goto out;
3703 }
3704 mutex_lock(&dx_alloc_inode->i_mutex);
3705
3706 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
3707 if (ret) {
3708 mlog_errno(ret);
3709 goto out_mutex;
3710 }
3711
3712 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
3713 if (IS_ERR(handle)) {
3714 ret = PTR_ERR(handle);
3715 mlog_errno(ret);
3716 goto out_unlock;
3717 }
3718
3719 ret = ocfs2_journal_access_di(handle, dir, di_bh,
3720 OCFS2_JOURNAL_ACCESS_WRITE);
3721 if (ret) {
3722 mlog_errno(ret);
3723 goto out_commit;
3724 }
3725
3726 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
3727 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
3728 di->i_dx_root = cpu_to_le64(0ULL);
3729
3730 ocfs2_journal_dirty(handle, di_bh);
3731
3732 blk = le64_to_cpu(dx_root->dr_blkno);
3733 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
3734 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
3735 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
3736 bit, bg_blkno, 1);
3737 if (ret)
3738 mlog_errno(ret);
3739
3740out_commit:
3741 ocfs2_commit_trans(osb, handle);
3742
3743out_unlock:
3744 ocfs2_inode_unlock(dx_alloc_inode, 1);
3745
3746out_mutex:
3747 mutex_unlock(&dx_alloc_inode->i_mutex);
3748 brelse(dx_alloc_bh);
3749out:
3750 iput(dx_alloc_inode);
3751 return ret;
3752}
3753
3754int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
3755{
3756 int ret;
3757 unsigned int uninitialized_var(clen);
3758 u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
3759 u64 uninitialized_var(blkno);
3760 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
3761 struct buffer_head *dx_root_bh = NULL;
3762 struct ocfs2_dx_root_block *dx_root;
3763 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3764 struct ocfs2_cached_dealloc_ctxt dealloc;
3765 struct ocfs2_extent_tree et;
3766
3767 ocfs2_init_dealloc_ctxt(&dealloc);
3768
3769 if (!ocfs2_dir_indexed(dir))
3770 return 0;
3771
3772 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
3773 if (ret) {
3774 mlog_errno(ret);
3775 goto out;
3776 }
3777
3778 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3779
3780 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3781
3782 /* XXX: What if dr_clusters is too large? */
3783 while (le32_to_cpu(dx_root->dr_clusters)) {
3784 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
3785 major_hash, &cpos, &blkno, &clen);
3786 if (ret) {
3787 mlog_errno(ret);
3788 goto out;
3789 }
3790
3791 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
3792
3793 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
3794 &dealloc);
3795 if (ret) {
3796 mlog_errno(ret);
3797 goto out;
3798 }
3799
3800 if (cpos == 0)
3801 break;
3802
3803 major_hash = cpos - 1;
3804 }
3805
3806 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
3807 if (ret) {
3808 mlog_errno(ret);
3809 goto out;
3810 }
3811
3812 ocfs2_remove_from_cache(dir, dx_root_bh);
3813out:
3814 ocfs2_schedule_truncate_log_flush(osb, 1);
3815 ocfs2_run_deallocs(osb, &dealloc);
3816
3817 brelse(dx_root_bh);
3818 return ret;
3819}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 505d3af9bba5..d273aaef2ec2 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,9 +26,21 @@
26#ifndef OCFS2_DIR_H 26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H 27#define OCFS2_DIR_H
28 28
29struct ocfs2_dx_hinfo {
30 u32 major_hash;
31 u32 minor_hash;
32};
33
29struct ocfs2_dir_lookup_result { 34struct ocfs2_dir_lookup_result {
30 struct buffer_head *dl_leaf_bh; 35 struct buffer_head *dl_leaf_bh; /* Unindexed leaf
31 struct ocfs2_dir_entry *dl_entry; 36 * block */
37 struct ocfs2_dir_entry *dl_entry; /* Target dirent in
38 * unindexed leaf */
39
40 struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */
41 struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in
42 * indexed leaf */
43 struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */
32}; 44};
33void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res); 45void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
34 46
@@ -85,7 +97,10 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
85 struct inode *parent, 97 struct inode *parent,
86 struct inode *inode, 98 struct inode *inode,
87 struct buffer_head *fe_bh, 99 struct buffer_head *fe_bh,
88 struct ocfs2_alloc_context *data_ac); 100 struct ocfs2_alloc_context *data_ac,
101 struct ocfs2_alloc_context *meta_ac);
102
103int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
89 104
90struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, 105struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
91 void *data); 106 void *data);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..d273c4a51a00 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "alloc.h" 40#include "alloc.h"
41#include "dir.h"
41#include "blockcheck.h" 42#include "blockcheck.h"
42#include "dlmglue.h" 43#include "dlmglue.h"
43#include "extent_map.h" 44#include "extent_map.h"
@@ -606,7 +607,7 @@ static int ocfs2_remove_inode(struct inode *inode,
606 } 607 }
607 608
608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 609 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb)); 610 ocfs2_quota_trans_credits(inode->i_sb));
610 if (IS_ERR(handle)) { 611 if (IS_ERR(handle)) {
611 status = PTR_ERR(handle); 612 status = PTR_ERR(handle);
612 mlog_errno(status); 613 mlog_errno(status);
@@ -740,6 +741,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
740 goto bail_unlock_dir; 741 goto bail_unlock_dir;
741 } 742 }
742 743
744 /* Remove any dir index tree */
745 if (S_ISDIR(inode->i_mode)) {
746 status = ocfs2_dx_dir_truncate(inode, di_bh);
747 if (status) {
748 mlog_errno(status);
749 goto bail_unlock_dir;
750 }
751 }
752
743 /*Free extended attribute resources associated with this inode.*/ 753 /*Free extended attribute resources associated with this inode.*/
744 status = ocfs2_xattr_remove(inode, di_bh); 754 status = ocfs2_xattr_remove(inode, di_bh);
745 if (status < 0) { 755 if (status < 0) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4c8f3557fe93..a70d49da2ceb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -484,6 +484,22 @@ static struct ocfs2_triggers dq_triggers = {
484 }, 484 },
485}; 485};
486 486
487static struct ocfs2_triggers dr_triggers = {
488 .ot_triggers = {
489 .t_commit = ocfs2_commit_trigger,
490 .t_abort = ocfs2_abort_trigger,
491 },
492 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
493};
494
495static struct ocfs2_triggers dl_triggers = {
496 .ot_triggers = {
497 .t_commit = ocfs2_commit_trigger,
498 .t_abort = ocfs2_abort_trigger,
499 },
500 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
501};
502
487static int __ocfs2_journal_access(handle_t *handle, 503static int __ocfs2_journal_access(handle_t *handle,
488 struct inode *inode, 504 struct inode *inode,
489 struct buffer_head *bh, 505 struct buffer_head *bh,
@@ -588,6 +604,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
588 type); 604 type);
589} 605}
590 606
607int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
608 struct buffer_head *bh, int type)
609{
610 return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
611 type);
612}
613
614int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
615 struct buffer_head *bh, int type)
616{
617 return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
618 type);
619}
620
591int ocfs2_journal_access(handle_t *handle, struct inode *inode, 621int ocfs2_journal_access(handle_t *handle, struct inode *inode,
592 struct buffer_head *bh, int type) 622 struct buffer_head *bh, int type)
593{ 623{
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 21601ee3f25a..4939c04290ca 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -277,6 +277,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
277/* dirblock */ 277/* dirblock */
278int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 278int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
279 struct buffer_head *bh, int type); 279 struct buffer_head *bh, int type);
280/* ocfs2_dx_root_block */
281int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
282 struct buffer_head *bh, int type);
283/* ocfs2_dx_leaf */
284int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
285 struct buffer_head *bh, int type);
280/* Anything that has no ecc */ 286/* Anything that has no ecc */
281int ocfs2_journal_access(handle_t *handle, struct inode *inode, 287int ocfs2_journal_access(handle_t *handle, struct inode *inode,
282 struct buffer_head *bh, int type); 288 struct buffer_head *bh, int type);
@@ -382,11 +388,26 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
382 * bitmap block for the new bit) */ 388 * bitmap block for the new bit) */
383#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 389#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
384 390
385/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 391static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
386 * group descriptor + mkdir/symlink blocks + quota update */ 392{
387static inline int ocfs2_mknod_credits(struct super_block *sb) 393 /* 1 block for index, 2 allocs (data, metadata), 1 clusters
394 * worth of blocks for initial extent. */
395 return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
396 ocfs2_clusters_to_blocks(sb, 1);
397}
398
399/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
400 * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
401 * blocks + quota update */
402static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
403 int xattr_credits)
388{ 404{
389 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + 405 int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
406
407 if (is_dir)
408 dir_credits += ocfs2_add_dir_index_credits(sb);
409
410 return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
390 ocfs2_quota_trans_credits(sb); 411 ocfs2_quota_trans_credits(sb);
391} 412}
392 413
@@ -399,31 +420,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
399#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 420#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
400 421
401/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 422/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
402 * update on dir */ 423 * update on dir + index leaf */
403static inline int ocfs2_link_credits(struct super_block *sb) 424static inline int ocfs2_link_credits(struct super_block *sb)
404{ 425{
405 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + 426 return 2*OCFS2_INODE_UPDATE_CREDITS + 2 +
406 ocfs2_quota_trans_credits(sb); 427 ocfs2_quota_trans_credits(sb);
407} 428}
408 429
409/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 430/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
410 * dir inode link */ 431 * dir inode link + dir inode index leaf */
411static inline int ocfs2_unlink_credits(struct super_block *sb) 432static inline int ocfs2_unlink_credits(struct super_block *sb)
412{ 433{
413 /* The quota update from ocfs2_link_credits is unused here... */ 434 /* The quota update from ocfs2_link_credits is unused here... */
414 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); 435 return 2 * OCFS2_INODE_UPDATE_CREDITS + 2 + ocfs2_link_credits(sb);
415} 436}
416 437
417/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 438/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
418 * inode alloc group descriptor */ 439 * inode alloc group descriptor + orphan dir index leaf */
419#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) 440#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
420 441
421/* dinode update, old dir dinode update, new dir dinode update, old 442/* dinode update, old dir dinode update, new dir dinode update, old
422 * dir dir entry, new dir dir entry, dir entry update for renaming 443 * dir dir entry, new dir dir entry, dir entry update for renaming
423 * directory + target unlink */ 444 * directory + target unlink + 3 x dir index leaves */
424static inline int ocfs2_rename_credits(struct super_block *sb) 445static inline int ocfs2_rename_credits(struct super_block *sb)
425{ 446{
426 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); 447 return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
427} 448}
428 449
429/* global bitmap dinode, group desc., relinked group, 450/* global bitmap dinode, group desc., relinked group,
@@ -433,6 +454,10 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
433 + OCFS2_INODE_UPDATE_CREDITS \ 454 + OCFS2_INODE_UPDATE_CREDITS \
434 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) 455 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
435 456
457/* inode update, removal of dx root block from allocator */
458#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
459 OCFS2_SUBALLOC_FREE)
460
436/* 461/*
437 * Please note that the caller must make sure that root_el is the root 462 * Please note that the caller must make sure that root_el is the root
438 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise 463 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -468,7 +493,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
468 493
469static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 494static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
470{ 495{
471 int blocks = ocfs2_mknod_credits(sb); 496 int blocks = ocfs2_mknod_credits(sb, 0, 0);
472 497
473 /* links can be longer than one block so we may update many 498 /* links can be longer than one block so we may update many
474 * within our single allocated extent. */ 499 * within our single allocated extent. */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d3a5a09d88ff..0c5507193200 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -231,8 +231,9 @@ static int ocfs2_mknod(struct inode *dir,
231 struct inode *inode = NULL; 231 struct inode *inode = NULL;
232 struct ocfs2_alloc_context *inode_ac = NULL; 232 struct ocfs2_alloc_context *inode_ac = NULL;
233 struct ocfs2_alloc_context *data_ac = NULL; 233 struct ocfs2_alloc_context *data_ac = NULL;
234 struct ocfs2_alloc_context *xattr_ac = NULL; 234 struct ocfs2_alloc_context *meta_ac = NULL;
235 int want_clusters = 0; 235 int want_clusters = 0;
236 int want_meta = 0;
236 int xattr_credits = 0; 237 int xattr_credits = 0;
237 struct ocfs2_security_xattr_info si = { 238 struct ocfs2_security_xattr_info si = {
238 .enable = 1, 239 .enable = 1,
@@ -308,17 +309,31 @@ static int ocfs2_mknod(struct inode *dir,
308 309
309 /* calculate meta data/clusters for setting security and acl xattr */ 310 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, 311 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters, 312 &si, &want_clusters,
312 &xattr_credits, &xattr_ac); 313 &xattr_credits, &want_meta);
313 if (status < 0) { 314 if (status < 0) {
314 mlog_errno(status); 315 mlog_errno(status);
315 goto leave; 316 goto leave;
316 } 317 }
317 318
318 /* Reserve a cluster if creating an extent based directory. */ 319 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) 320 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
320 want_clusters += 1; 321 want_clusters += 1;
321 322
323 /* Dir indexing requires extra space as well */
324 if (ocfs2_supports_indexed_dirs(osb)) {
325 want_clusters++;
326 want_meta++;
327 }
328 }
329
330 status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
331 if (status < 0) {
332 if (status != -ENOSPC)
333 mlog_errno(status);
334 goto leave;
335 }
336
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); 337 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) { 338 if (status < 0) {
324 if (status != -ENOSPC) 339 if (status != -ENOSPC)
@@ -326,8 +341,9 @@ static int ocfs2_mknod(struct inode *dir,
326 goto leave; 341 goto leave;
327 } 342 }
328 343
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + 344 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
330 xattr_credits); 345 S_ISDIR(mode),
346 xattr_credits));
331 if (IS_ERR(handle)) { 347 if (IS_ERR(handle)) {
332 status = PTR_ERR(handle); 348 status = PTR_ERR(handle);
333 handle = NULL; 349 handle = NULL;
@@ -355,7 +371,7 @@ static int ocfs2_mknod(struct inode *dir,
355 371
356 if (S_ISDIR(mode)) { 372 if (S_ISDIR(mode)) {
357 status = ocfs2_fill_new_dir(osb, handle, dir, inode, 373 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
358 new_fe_bh, data_ac); 374 new_fe_bh, data_ac, meta_ac);
359 if (status < 0) { 375 if (status < 0) {
360 mlog_errno(status); 376 mlog_errno(status);
361 goto leave; 377 goto leave;
@@ -377,7 +393,7 @@ static int ocfs2_mknod(struct inode *dir,
377 } 393 }
378 394
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, 395 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac); 396 meta_ac, data_ac);
381 if (status < 0) { 397 if (status < 0) {
382 mlog_errno(status); 398 mlog_errno(status);
383 goto leave; 399 goto leave;
@@ -385,7 +401,7 @@ static int ocfs2_mknod(struct inode *dir,
385 401
386 if (si.enable) { 402 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, 403 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac); 404 meta_ac, data_ac);
389 if (status < 0) { 405 if (status < 0) {
390 mlog_errno(status); 406 mlog_errno(status);
391 goto leave; 407 goto leave;
@@ -440,8 +456,8 @@ leave:
440 if (data_ac) 456 if (data_ac)
441 ocfs2_free_alloc_context(data_ac); 457 ocfs2_free_alloc_context(data_ac);
442 458
443 if (xattr_ac) 459 if (meta_ac)
444 ocfs2_free_alloc_context(xattr_ac); 460 ocfs2_free_alloc_context(meta_ac);
445 461
446 mlog_exit(status); 462 mlog_exit(status);
447 463
@@ -463,6 +479,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
463 struct ocfs2_extent_list *fel; 479 struct ocfs2_extent_list *fel;
464 u64 fe_blkno = 0; 480 u64 fe_blkno = 0;
465 u16 suballoc_bit; 481 u16 suballoc_bit;
482 u16 feat;
466 483
467 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 484 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
468 inode->i_mode, (unsigned long)dev, dentry->d_name.len, 485 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -526,11 +543,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
526 fe->i_dtime = 0; 543 fe->i_dtime = 0;
527 544
528 /* 545 /*
529 * If supported, directories start with inline data. 546 * If supported, directories start with inline data. If inline
547 * isn't supported, but indexing is, we start them as indexed.
530 */ 548 */
549 feat = le16_to_cpu(fe->i_dyn_features);
531 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { 550 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
532 u16 feat = le16_to_cpu(fe->i_dyn_features);
533
534 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 551 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
535 552
536 fe->id2.i_data.id_count = cpu_to_le16( 553 fe->id2.i_data.id_count = cpu_to_le16(
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a7c8369c402f..3749c32c2fc4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -340,6 +340,9 @@ struct ocfs2_super
340 340
341 /* used to protect metaecc calculation check of xattr. */ 341 /* used to protect metaecc calculation check of xattr. */
342 spinlock_t osb_xattr_lock; 342 spinlock_t osb_xattr_lock;
343
344 unsigned int osb_dx_mask;
345 u32 osb_dx_seed[4];
343}; 346};
344 347
345#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 348#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -398,6 +401,13 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
398 return 0; 401 return 0;
399} 402}
400 403
404static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
405{
406 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
407 return 1;
408 return 0;
409}
410
401/* set / clear functions because cluster events can make these happen 411/* set / clear functions because cluster events can make these happen
402 * in parallel so we want the transitions to be atomic. this also 412 * in parallel so we want the transitions to be atomic. this also
403 * means that any future flags osb_flags must be protected by spinlock 413 * means that any future flags osb_flags must be protected by spinlock
@@ -478,6 +488,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
478#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 488#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
479 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 489 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
480 490
491#define OCFS2_IS_VALID_DX_ROOT(ptr) \
492 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
493
494#define OCFS2_IS_VALID_DX_LEAF(ptr) \
495 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
496
481static inline unsigned long ino_from_blkno(struct super_block *sb, 497static inline unsigned long ino_from_blkno(struct super_block *sb,
482 u64 blkno) 498 u64 blkno)
483{ 499{
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..036eb03950a3 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -151,6 +153,9 @@
151/* Support for extended attributes */ 153/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 154#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 155
156/* Support for indexed directores */
157#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
158
154/* Metadata checksum and error correction */ 159/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 160#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 161
@@ -628,8 +633,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 633/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 634 for this fs*/
630 __le16 s_reserved0; 635 __le16 s_reserved0;
631 __le32 s_reserved1; 636 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 637 * s_uuid_hash serves as seed[3]. */
638/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 639/*140*/
634 640
635 /* 641 /*
@@ -705,7 +711,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 711 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 712 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 713/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 714/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
715 __le64 i_reserved2[5];
709/*B8*/ union { 716/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 717 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 718 64bit union */
@@ -781,6 +788,75 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 788/*40*/
782}; 789};
783 790
791 /*
792 * A directory entry in the indexed tree. We don't store the full name here,
793 * but instead provide a pointer to the full dirent in the unindexed tree.
794 *
795 * We also store name_len here so as to reduce the number of leaf blocks we
796 * need to search in case of collisions.
797 */
798struct ocfs2_dx_entry {
799 __le32 dx_major_hash; /* Used to find logical
800 * cluster in index */
801 __le32 dx_minor_hash; /* Lower bits used to find
802 * block in cluster */
803 __le64 dx_dirent_blk; /* Physical block in unindexed
804 * tree holding this dirent. */
805};
806
807struct ocfs2_dx_entry_list {
808 __le32 de_reserved;
809 __le16 de_count; /* Maximum number of entries
810 * possible in de_entries */
811 __le16 de_num_used; /* Current number of
812 * de_entries entries */
813 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
814 * in a packed array of
815 * length de_num_used */
816};
817
818/*
819 * A directory indexing block. Each indexed directory has one of these,
820 * pointed to by ocfs2_dinode.
821 *
822 * This block stores an indexed btree root, and a set of free space
823 * start-of-list pointers.
824 */
825struct ocfs2_dx_root_block {
826 __u8 dr_signature[8]; /* Signature for verification */
827 struct ocfs2_block_check dr_check; /* Error checking */
828 __le16 dr_suballoc_slot; /* Slot suballocator this
829 * block belongs to. */
830 __le16 dr_suballoc_bit; /* Bit offset in suballocator
831 * block group */
832 __le32 dr_fs_generation; /* Must match super block */
833 __le64 dr_blkno; /* Offset on disk, in blocks */
834 __le64 dr_last_eb_blk; /* Pointer to last
835 * extent block */
836 __le32 dr_clusters; /* Clusters allocated
837 * to the indexed tree. */
838 __le32 dr_reserved1;
839 __le64 dr_dir_blkno; /* Pointer to parent inode */
840 __le64 dr_reserved2;
841 __le64 dr_reserved3[16];
842 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
843 * bits for maximum space
844 * efficiency. */
845};
846
847/*
848 * The header of a leaf block in the indexed tree.
849 */
850struct ocfs2_dx_leaf {
851 __u8 dl_signature[8];/* Signature for verification */
852 struct ocfs2_block_check dl_check; /* Error checking */
853 __le64 dl_blkno; /* Offset on disk, in blocks */
854 __le32 dl_fs_generation;/* Must match super block */
855 __le32 dl_reserved0;
856 __le64 dl_reserved1;
857 struct ocfs2_dx_entry_list dl_list;
858};
859
784/* 860/*
785 * On disk allocator group structure for OCFS2 861 * On disk allocator group structure for OCFS2
786 */ 862 */
@@ -1112,6 +1188,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1112 return size / sizeof(struct ocfs2_extent_rec); 1188 return size / sizeof(struct ocfs2_extent_rec);
1113} 1189}
1114 1190
1191static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1192{
1193 int size;
1194
1195 size = sb->s_blocksize -
1196 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1197
1198 return size / sizeof(struct ocfs2_extent_rec);
1199}
1200
1115static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1201static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1116{ 1202{
1117 int size; 1203 int size;
@@ -1132,6 +1218,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1132 return size / sizeof(struct ocfs2_extent_rec); 1218 return size / sizeof(struct ocfs2_extent_rec);
1133} 1219}
1134 1220
1221static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1222{
1223 int size;
1224
1225 size = sb->s_blocksize -
1226 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1227
1228 return size / sizeof(struct ocfs2_dx_entry);
1229}
1230
1135static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1231static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1136{ 1232{
1137 u16 size; 1233 u16 size;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4eaf0e602bd0..53892d874ffc 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1918,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1920 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1921 osb->sb = sb; 1927 osb->sb = sb;
1922 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1923 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || 555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
558 if (ret) {
559 mlog_errno(ret);
560 return ret;
561 }
562 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
563 } 559 }
564 560
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,