aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/dir.c
diff options
context:
space:
mode:
authorMark Fasheh <mfasheh@suse.com>2008-11-12 19:27:44 -0500
committerMark Fasheh <mfasheh@suse.com>2009-04-03 14:39:15 -0400
commit9b7895efac906d66d19856194e1ba61f37e231a4 (patch)
tree1ee6d2630cf3617251638170dcaceef41ddda8ec /fs/ocfs2/dir.c
parent4a12ca3a00a244e1fd1e673d151ea38b71e11d55 (diff)
ocfs2: Add a name indexed b-tree to directory inodes
This patch makes use of Ocfs2's flexible btree code to add an additional tree to directory inodes. The new tree stores an array of small, fixed-length records in each leaf block. Each record stores a hash value, and pointer to a block in the traditional (unindexed) directory tree where a dirent with the given name hash resides. Lookup exclusively uses this tree to find dirents, thus providing us with constant time name lookups. Some of the hashing code was copied from ext3. Unfortunately, it has lots of unfixed checkpatch errors. I left that as-is so that tracking changes would be easier. Signed-off-by: Mark Fasheh <mfasheh@suse.com> Acked-by: Joel Becker <joel.becker@oracle.com>
Diffstat (limited to 'fs/ocfs2/dir.c')
-rw-r--r--fs/ocfs2/dir.c1908
1 files changed, 1845 insertions, 63 deletions
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 76ffb5c10b3e..0b8c88b47a4e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h>
44 45
45#define MLOG_MASK_PREFIX ML_NAMEI 46#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
71 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 73 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
72}; 74};
73 75
74static int ocfs2_extend_dir(struct ocfs2_super *osb,
75 struct inode *dir,
76 struct buffer_head *parent_fe_bh,
77 unsigned int blocks_wanted,
78 struct buffer_head **new_de_bh);
79static int ocfs2_do_extend_dir(struct super_block *sb, 76static int ocfs2_do_extend_dir(struct super_block *sb,
80 handle_t *handle, 77 handle_t *handle,
81 struct inode *dir, 78 struct inode *dir,
@@ -155,6 +152,105 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
155void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) 152void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
156{ 153{
157 brelse(res->dl_leaf_bh); 154 brelse(res->dl_leaf_bh);
155 brelse(res->dl_dx_leaf_bh);
156}
157
158static int ocfs2_dir_indexed(struct inode *inode)
159{
160 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
161 return 1;
162 return 0;
163}
164
165/*
166 * Hashing code adapted from ext3
167 */
168#define DELTA 0x9E3779B9
169
170static void TEA_transform(__u32 buf[4], __u32 const in[])
171{
172 __u32 sum = 0;
173 __u32 b0 = buf[0], b1 = buf[1];
174 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
175 int n = 16;
176
177 do {
178 sum += DELTA;
179 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
180 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
181 } while (--n);
182
183 buf[0] += b0;
184 buf[1] += b1;
185}
186
187static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
188{
189 __u32 pad, val;
190 int i;
191
192 pad = (__u32)len | ((__u32)len << 8);
193 pad |= pad << 16;
194
195 val = pad;
196 if (len > num*4)
197 len = num * 4;
198 for (i = 0; i < len; i++) {
199 if ((i % 4) == 0)
200 val = pad;
201 val = msg[i] + (val << 8);
202 if ((i % 4) == 3) {
203 *buf++ = val;
204 val = pad;
205 num--;
206 }
207 }
208 if (--num >= 0)
209 *buf++ = val;
210 while (--num >= 0)
211 *buf++ = pad;
212}
213
214static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
215 struct ocfs2_dx_hinfo *hinfo)
216{
217 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
218 const char *p;
219 __u32 in[8], buf[4];
220
221 /*
222 * XXX: Is this really necessary, if the index is never looked
223 * at by readdir? Is a hash value of '0' a bad idea?
224 */
225 if ((len == 1 && !strncmp(".", name, 1)) ||
226 (len == 2 && !strncmp("..", name, 2))) {
227 buf[0] = buf[1] = 0;
228 goto out;
229 }
230
231#ifdef OCFS2_DEBUG_DX_DIRS
232 /*
233 * This makes it very easy to debug indexing problems. We
234 * should never allow this to be selected without hand editing
235 * this file though.
236 */
237 buf[0] = buf[1] = len;
238 goto out;
239#endif
240
241 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
242
243 p = name;
244 while (len > 0) {
245 str2hashbuf(p, len, in, 4);
246 TEA_transform(buf, in);
247 len -= 16;
248 p += 16;
249 }
250
251out:
252 hinfo->major_hash = buf[0];
253 hinfo->minor_hash = buf[1];
158} 254}
159 255
160/* 256/*
@@ -317,6 +413,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
317} 413}
318 414
319/* 415/*
416 * Validate a directory trailer.
417 *
418 * We check the trailer here rather than in ocfs2_validate_dir_block()
419 * because that function doesn't have the inode to test.
420 */
421static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
422{
423 int rc = 0;
424 struct ocfs2_dir_block_trailer *trailer;
425
426 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
427 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
428 rc = -EINVAL;
429 ocfs2_error(dir->i_sb,
430 "Invalid dirblock #%llu: "
431 "signature = %.*s\n",
432 (unsigned long long)bh->b_blocknr, 7,
433 trailer->db_signature);
434 goto out;
435 }
436 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
437 rc = -EINVAL;
438 ocfs2_error(dir->i_sb,
439 "Directory block #%llu has an invalid "
440 "db_blkno of %llu",
441 (unsigned long long)bh->b_blocknr,
442 (unsigned long long)le64_to_cpu(trailer->db_blkno));
443 goto out;
444 }
445 if (le64_to_cpu(trailer->db_parent_dinode) !=
446 OCFS2_I(dir)->ip_blkno) {
447 rc = -EINVAL;
448 ocfs2_error(dir->i_sb,
449 "Directory block #%llu on dinode "
450 "#%llu has an invalid parent_dinode "
451 "of %llu",
452 (unsigned long long)bh->b_blocknr,
453 (unsigned long long)OCFS2_I(dir)->ip_blkno,
454 (unsigned long long)le64_to_cpu(trailer->db_blkno));
455 goto out;
456 }
457out:
458 return rc;
459}
460
461/*
320 * This function forces all errors to -EIO for consistency with its 462 * This function forces all errors to -EIO for consistency with its
321 * predecessor, ocfs2_bread(). We haven't audited what returning the 463 * predecessor, ocfs2_bread(). We haven't audited what returning the
322 * real error codes would do to callers. We log the real codes with 464 * real error codes would do to callers. We log the real codes with
@@ -327,7 +469,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
327{ 469{
328 int rc = 0; 470 int rc = 0;
329 struct buffer_head *tmp = *bh; 471 struct buffer_head *tmp = *bh;
330 struct ocfs2_dir_block_trailer *trailer;
331 472
332 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 473 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
333 ocfs2_validate_dir_block); 474 ocfs2_validate_dir_block);
@@ -336,42 +477,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
336 goto out; 477 goto out;
337 } 478 }
338 479
339 /*
340 * We check the trailer here rather than in
341 * ocfs2_validate_dir_block() because that function doesn't have
342 * the inode to test.
343 */
344 if (!(flags & OCFS2_BH_READAHEAD) && 480 if (!(flags & OCFS2_BH_READAHEAD) &&
345 ocfs2_dir_has_trailer(inode)) { 481 ocfs2_dir_has_trailer(inode)) {
346 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); 482 rc = ocfs2_check_dir_trailer(inode, tmp);
347 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 483 if (rc) {
348 rc = -EINVAL; 484 if (!*bh)
349 ocfs2_error(inode->i_sb, 485 brelse(tmp);
350 "Invalid dirblock #%llu: " 486 mlog_errno(rc);
351 "signature = %.*s\n",
352 (unsigned long long)tmp->b_blocknr, 7,
353 trailer->db_signature);
354 goto out;
355 }
356 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
357 rc = -EINVAL;
358 ocfs2_error(inode->i_sb,
359 "Directory block #%llu has an invalid "
360 "db_blkno of %llu",
361 (unsigned long long)tmp->b_blocknr,
362 (unsigned long long)le64_to_cpu(trailer->db_blkno));
363 goto out;
364 }
365 if (le64_to_cpu(trailer->db_parent_dinode) !=
366 OCFS2_I(inode)->ip_blkno) {
367 rc = -EINVAL;
368 ocfs2_error(inode->i_sb,
369 "Directory block #%llu on dinode "
370 "#%llu has an invalid parent_dinode "
371 "of %llu",
372 (unsigned long long)tmp->b_blocknr,
373 (unsigned long long)OCFS2_I(inode)->ip_blkno,
374 (unsigned long long)le64_to_cpu(trailer->db_blkno));
375 goto out; 487 goto out;
376 } 488 }
377 } 489 }
@@ -384,6 +496,141 @@ out:
384 return rc ? -EIO : 0; 496 return rc ? -EIO : 0;
385} 497}
386 498
499/*
500 * Read the block at 'phys' which belongs to this directory
501 * inode. This function does no virtual->physical block translation -
502 * what's passed in is assumed to be a valid directory block.
503 */
504static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
505 struct buffer_head **bh)
506{
507 int ret;
508 struct buffer_head *tmp = *bh;
509
510 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
511 if (ret) {
512 mlog_errno(ret);
513 goto out;
514 }
515
516 if (ocfs2_supports_dir_trailer(dir)) {
517 ret = ocfs2_check_dir_trailer(dir, tmp);
518 if (ret) {
519 if (!*bh)
520 brelse(tmp);
521 mlog_errno(ret);
522 goto out;
523 }
524 }
525
526 if (!ret && !*bh)
527 *bh = tmp;
528out:
529 return ret;
530}
531
532static int ocfs2_validate_dx_root(struct super_block *sb,
533 struct buffer_head *bh)
534{
535 int ret;
536 struct ocfs2_dx_root_block *dx_root;
537
538 BUG_ON(!buffer_uptodate(bh));
539
540 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
541
542 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
543 if (ret) {
544 mlog(ML_ERROR,
545 "Checksum failed for dir index root block %llu\n",
546 (unsigned long long)bh->b_blocknr);
547 return ret;
548 }
549
550 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
551 ocfs2_error(sb,
552 "Dir Index Root # %llu has bad signature %.*s",
553 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
554 7, dx_root->dr_signature);
555 return -EINVAL;
556 }
557
558 return 0;
559}
560
561static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
562 struct buffer_head **dx_root_bh)
563{
564 int ret;
565 u64 blkno = le64_to_cpu(di->i_dx_root);
566 struct buffer_head *tmp = *dx_root_bh;
567
568 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
569
570 /* If ocfs2_read_block() got us a new bh, pass it up. */
571 if (!ret && !*dx_root_bh)
572 *dx_root_bh = tmp;
573
574 return ret;
575}
576
577static int ocfs2_validate_dx_leaf(struct super_block *sb,
578 struct buffer_head *bh)
579{
580 int ret;
581 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
582
583 BUG_ON(!buffer_uptodate(bh));
584
585 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
586 if (ret) {
587 mlog(ML_ERROR,
588 "Checksum failed for dir index leaf block %llu\n",
589 (unsigned long long)bh->b_blocknr);
590 return ret;
591 }
592
593 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
594 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
595 7, dx_leaf->dl_signature);
596 return -EROFS;
597 }
598
599 return 0;
600}
601
602static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
603 struct buffer_head **dx_leaf_bh)
604{
605 int ret;
606 struct buffer_head *tmp = *dx_leaf_bh;
607
608 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
609
610 /* If ocfs2_read_block() got us a new bh, pass it up. */
611 if (!ret && !*dx_leaf_bh)
612 *dx_leaf_bh = tmp;
613
614 return ret;
615}
616
617/*
618 * Read a series of dx_leaf blocks. This expects all buffer_head
619 * pointers to be NULL on function entry.
620 */
621static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
622 struct buffer_head **dx_leaf_bhs)
623{
624 int ret;
625
626 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
627 ocfs2_validate_dx_leaf);
628 if (ret)
629 mlog_errno(ret);
630
631 return ret;
632}
633
387static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 634static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
388 struct inode *dir, 635 struct inode *dir,
389 struct ocfs2_dir_entry **res_dir) 636 struct ocfs2_dir_entry **res_dir)
@@ -485,6 +732,270 @@ cleanup_and_exit:
485 return ret; 732 return ret;
486} 733}
487 734
735static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
736 struct ocfs2_extent_list *el,
737 u32 major_hash,
738 u32 *ret_cpos,
739 u64 *ret_phys_blkno,
740 unsigned int *ret_clen)
741{
742 int ret = 0, i, found;
743 struct buffer_head *eb_bh = NULL;
744 struct ocfs2_extent_block *eb;
745 struct ocfs2_extent_rec *rec = NULL;
746
747 if (el->l_tree_depth) {
748 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
749 if (ret) {
750 mlog_errno(ret);
751 goto out;
752 }
753
754 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
755 el = &eb->h_list;
756
757 if (el->l_tree_depth) {
758 ocfs2_error(inode->i_sb,
759 "Inode %lu has non zero tree depth in "
760 "btree tree block %llu\n", inode->i_ino,
761 (unsigned long long)eb_bh->b_blocknr);
762 ret = -EROFS;
763 goto out;
764 }
765 }
766
767 found = 0;
768 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
769 rec = &el->l_recs[i];
770
771 if (le32_to_cpu(rec->e_cpos) <= major_hash) {
772 found = 1;
773 break;
774 }
775 }
776
777 if (!found) {
778 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
779 "record (%u, %u, 0) in btree", inode->i_ino,
780 le32_to_cpu(rec->e_cpos),
781 ocfs2_rec_clusters(el, rec));
782 ret = -EROFS;
783 goto out;
784 }
785
786 if (ret_phys_blkno)
787 *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
788 if (ret_cpos)
789 *ret_cpos = le32_to_cpu(rec->e_cpos);
790 if (ret_clen)
791 *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
792
793out:
794 brelse(eb_bh);
795 return ret;
796}
797
798/*
799 * Returns the block index, from the start of the cluster which this
800 * hash belongs too.
801 */
802static unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
803 struct ocfs2_dx_hinfo *hinfo)
804{
805 u32 minor_hash = hinfo->minor_hash;
806 return minor_hash & osb->osb_dx_mask;
807}
808
809static int ocfs2_dx_dir_lookup(struct inode *inode,
810 struct ocfs2_extent_list *el,
811 struct ocfs2_dx_hinfo *hinfo,
812 u32 *ret_cpos,
813 u64 *ret_phys_blkno)
814{
815 int ret = 0;
816 unsigned int cend, uninitialized_var(clen);
817 u32 uninitialized_var(cpos);
818 u64 uninitialized_var(blkno);
819 u32 name_hash = hinfo->major_hash;
820
821 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
822 &clen);
823 if (ret) {
824 mlog_errno(ret);
825 goto out;
826 }
827
828 cend = cpos + clen;
829 if (name_hash >= cend) {
830 /* We want the last cluster */
831 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
832 cpos += clen - 1;
833 } else {
834 blkno += ocfs2_clusters_to_blocks(inode->i_sb,
835 name_hash - cpos);
836 cpos = name_hash;
837 }
838
839 /*
840 * We now have the cluster which should hold our entry. To
841 * find the exact block from the start of the cluster to
842 * search, we take the lower bits of the hash.
843 */
844 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
845
846 if (ret_phys_blkno)
847 *ret_phys_blkno = blkno;
848 if (ret_cpos)
849 *ret_cpos = cpos;
850
851out:
852
853 return ret;
854}
855
856static int ocfs2_dx_dir_search(const char *name, int namelen,
857 struct inode *dir,
858 struct ocfs2_extent_list *dr_el,
859 struct ocfs2_dir_lookup_result *res)
860{
861 int ret, i, found;
862 u64 uninitialized_var(phys);
863 struct buffer_head *dx_leaf_bh = NULL;
864 struct ocfs2_dx_leaf *dx_leaf;
865 struct ocfs2_dx_entry *dx_entry = NULL;
866 struct buffer_head *dir_ent_bh = NULL;
867 struct ocfs2_dir_entry *dir_ent = NULL;
868 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
869
870 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
871
872 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
873 if (ret) {
874 mlog_errno(ret);
875 goto out;
876 }
877
878 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
879 "returns: %llu\n",
880 (unsigned long long)OCFS2_I(dir)->ip_blkno,
881 namelen, name, hinfo->major_hash, hinfo->minor_hash,
882 (unsigned long long)phys);
883
884 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
885 if (ret) {
886 mlog_errno(ret);
887 goto out;
888 }
889
890 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
891
892 mlog(0, "leaf info: num_used: %d, count: %d\n",
893 le16_to_cpu(dx_leaf->dl_list.de_num_used),
894 le16_to_cpu(dx_leaf->dl_list.de_count));
895
896 /*
897 * Empty leaf is legal, so no need to check for that.
898 */
899 found = 0;
900 for (i = 0; i < le16_to_cpu(dx_leaf->dl_list.de_num_used); i++) {
901 dx_entry = &dx_leaf->dl_list.de_entries[i];
902
903 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
904 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
905 continue;
906
907 /*
908 * Search unindexed leaf block now. We're not
909 * guaranteed to find anything.
910 */
911 ret = ocfs2_read_dir_block_direct(dir,
912 le64_to_cpu(dx_entry->dx_dirent_blk),
913 &dir_ent_bh);
914 if (ret) {
915 mlog_errno(ret);
916 goto out;
917 }
918
919 /*
920 * XXX: We should check the unindexed block here,
921 * before using it.
922 */
923
924 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
925 0, dir_ent_bh->b_data,
926 dir->i_sb->s_blocksize, &dir_ent);
927 if (found == 1)
928 break;
929
930 if (found == -1) {
931 /* This means we found a bad directory entry. */
932 ret = -EIO;
933 mlog_errno(ret);
934 goto out;
935 }
936
937 brelse(dir_ent_bh);
938 dir_ent_bh = NULL;
939 }
940
941 if (found <= 0) {
942 ret = -ENOENT;
943 goto out;
944 }
945
946 res->dl_leaf_bh = dir_ent_bh;
947 res->dl_entry = dir_ent;
948 res->dl_dx_leaf_bh = dx_leaf_bh;
949 res->dl_dx_entry = dx_entry;
950
951 ret = 0;
952out:
953 if (ret) {
954 brelse(dx_leaf_bh);
955 brelse(dir_ent_bh);
956 }
957 return ret;
958}
959
960static int ocfs2_find_entry_dx(const char *name, int namelen,
961 struct inode *dir,
962 struct ocfs2_dir_lookup_result *lookup)
963{
964 int ret;
965 struct buffer_head *di_bh = NULL;
966 struct ocfs2_dinode *di;
967 struct buffer_head *dx_root_bh = NULL;
968 struct ocfs2_dx_root_block *dx_root;
969
970 ret = ocfs2_read_inode_block(dir, &di_bh);
971 if (ret) {
972 mlog_errno(ret);
973 goto out;
974 }
975
976 di = (struct ocfs2_dinode *)di_bh->b_data;
977
978 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
979 if (ret) {
980 mlog_errno(ret);
981 goto out;
982 }
983 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
984
985 ret = ocfs2_dx_dir_search(name, namelen, dir, &dx_root->dr_list,
986 lookup);
987 if (ret) {
988 if (ret != -ENOENT)
989 mlog_errno(ret);
990 goto out;
991 }
992
993out:
994 brelse(di_bh);
995 brelse(dx_root_bh);
996 return ret;
997}
998
488/* 999/*
489 * Try to find an entry of the provided name within 'dir'. 1000 * Try to find an entry of the provided name within 'dir'.
490 * 1001 *
@@ -493,10 +1004,11 @@ cleanup_and_exit:
493 * other directory manipulation functions. 1004 * other directory manipulation functions.
494 * 1005 *
495 * Caller can NOT assume anything about the contents of the 1006 * Caller can NOT assume anything about the contents of the
496 * buffer_heads - they are passed back only so that it can be passed into 1007 * buffer_heads - they are passed back only so that it can be passed
497 * any one of the manipulation functions (add entry, delete entry, 1008 * into any one of the manipulation functions (add entry, delete
498 * etc). As an example, bh in the extent directory case is a data 1009 * entry, etc). As an example, bh in the extent directory case is a
499 * block, in the inline-data case it actually points to an inode. 1010 * data block, in the inline-data case it actually points to an inode,
1011 * in the indexed directory case, multiple buffers are involved.
500 */ 1012 */
501int ocfs2_find_entry(const char *name, int namelen, 1013int ocfs2_find_entry(const char *name, int namelen,
502 struct inode *dir, struct ocfs2_dir_lookup_result *lookup) 1014 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
@@ -504,6 +1016,14 @@ int ocfs2_find_entry(const char *name, int namelen,
504 struct buffer_head *bh; 1016 struct buffer_head *bh;
505 struct ocfs2_dir_entry *res_dir = NULL; 1017 struct ocfs2_dir_entry *res_dir = NULL;
506 1018
1019 if (ocfs2_dir_indexed(dir))
1020 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1021
1022 /*
1023 * The unindexed dir code only uses part of the lookup
1024 * structure, so there's no reason to push it down further
1025 * than this.
1026 */
507 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1027 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
508 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); 1028 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
509 else 1029 else
@@ -553,6 +1073,10 @@ out:
553 return ret; 1073 return ret;
554} 1074}
555 1075
1076/*
1077 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1078 * previous entry
1079 */
556static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, 1080static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
557 struct ocfs2_dir_entry *de_del, 1081 struct ocfs2_dir_entry *de_del,
558 struct buffer_head *bh, char *first_de, 1082 struct buffer_head *bh, char *first_de,
@@ -602,6 +1126,79 @@ bail:
602 return status; 1126 return status;
603} 1127}
604 1128
1129static void ocfs2_dx_leaf_remove_entry(struct ocfs2_dx_leaf *dx_leaf, int index)
1130{
1131 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
1132 int num_used = le16_to_cpu(dl_list->de_num_used);
1133
1134 if (num_used == 1 || index == (num_used - 1))
1135 goto clear;
1136
1137 memmove(&dl_list->de_entries[index], &dl_list->de_entries[index + 1],
1138 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1139clear:
1140 num_used--;
1141 memset(&dl_list->de_entries[num_used], 0,
1142 sizeof(struct ocfs2_dx_entry));
1143 dl_list->de_num_used = cpu_to_le16(num_used);
1144}
1145
1146static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1147 struct ocfs2_dir_lookup_result *lookup)
1148{
1149 int ret, index;
1150 struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
1151 struct ocfs2_dx_leaf *dx_leaf;
1152 struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
1153
1154 dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
1155 /* Neither of these are a disk corruption - that should have
1156 * been caught by lookup, before we got here. */
1157 BUG_ON(le16_to_cpu(dx_leaf->dl_list.de_count) <= 0);
1158 BUG_ON(le16_to_cpu(dx_leaf->dl_list.de_num_used) <= 0);
1159
1160 index = (char *)dx_entry - (char *)dx_leaf->dl_list.de_entries;
1161 index /= sizeof(*dx_entry);
1162
1163 if (index >= le16_to_cpu(dx_leaf->dl_list.de_num_used)) {
1164 mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
1165 (unsigned long long)OCFS2_I(dir)->ip_blkno, index, dx_leaf,
1166 dx_entry);
1167 return -EIO;
1168 }
1169
1170 mlog(0, "Dir %llu: delete entry at index: %d\n",
1171 (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
1172
1173 /*
1174 * Add the index leaf into the journal before removing the
1175 * unindexed entry. If we get an error return from
1176 * __ocfs2_delete_entry(), then it hasn't removed the entry
1177 * yet. Likewise, successful return means we *must* remove the
1178 * indexed entry.
1179 */
1180 ret = ocfs2_journal_access_dl(handle, dir, lookup->dl_dx_leaf_bh,
1181 OCFS2_JOURNAL_ACCESS_WRITE);
1182 if (ret) {
1183 mlog_errno(ret);
1184 goto out;
1185 }
1186
1187 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1188 leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
1189 if (ret) {
1190 mlog_errno(ret);
1191 goto out;
1192 }
1193
1194 ocfs2_dx_leaf_remove_entry(dx_leaf, index);
1195
1196 ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
1197
1198out:
1199 return ret;
1200}
1201
605static inline int ocfs2_delete_entry_id(handle_t *handle, 1202static inline int ocfs2_delete_entry_id(handle_t *handle,
606 struct inode *dir, 1203 struct inode *dir,
607 struct ocfs2_dir_entry *de_del, 1204 struct ocfs2_dir_entry *de_del,
@@ -639,13 +1236,16 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
639} 1236}
640 1237
641/* 1238/*
642 * ocfs2_delete_entry deletes a directory entry by merging it with the 1239 * Delete a directory entry. Hide the details of directory
643 * previous entry 1240 * implementation from the caller.
644 */ 1241 */
645int ocfs2_delete_entry(handle_t *handle, 1242int ocfs2_delete_entry(handle_t *handle,
646 struct inode *dir, 1243 struct inode *dir,
647 struct ocfs2_dir_lookup_result *res) 1244 struct ocfs2_dir_lookup_result *res)
648{ 1245{
1246 if (ocfs2_dir_indexed(dir))
1247 return ocfs2_delete_entry_dx(handle, dir, res);
1248
649 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1249 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
650 return ocfs2_delete_entry_id(handle, dir, res->dl_entry, 1250 return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
651 res->dl_leaf_bh); 1251 res->dl_leaf_bh);
@@ -679,6 +1279,58 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
679 return 0; 1279 return 0;
680} 1280}
681 1281
1282static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1283 struct ocfs2_dx_entry *dx_new_entry)
1284{
1285 int i;
1286
1287 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1288 dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1289
1290 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1291}
1292
1293static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1294 struct ocfs2_dx_hinfo *hinfo,
1295 u64 dirent_blk,
1296 struct buffer_head *dx_leaf_bh)
1297{
1298 int ret, i;
1299 struct ocfs2_dx_entry *dx_entry;
1300 struct ocfs2_dx_leaf *dx_leaf;
1301
1302 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
1303 OCFS2_JOURNAL_ACCESS_WRITE);
1304 if (ret) {
1305 mlog_errno(ret);
1306 goto out;
1307 }
1308
1309 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
1310 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1311 dx_entry = &dx_leaf->dl_list.de_entries[i];
1312
1313 memset(dx_entry, 0, sizeof(*dx_entry));
1314 dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
1315 dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
1316 dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
1317
1318 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1319
1320 ocfs2_journal_dirty(handle, dx_leaf_bh);
1321
1322out:
1323 return ret;
1324}
1325
1326static int ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1327 struct ocfs2_dir_lookup_result *lookup)
1328{
1329 return __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
1330 lookup->dl_leaf_bh->b_blocknr,
1331 lookup->dl_dx_leaf_bh);
1332}
1333
682/* we don't always have a dentry for what we want to add, so people 1334/* we don't always have a dentry for what we want to add, so people
683 * like orphan dir can call this instead. 1335 * like orphan dir can call this instead.
684 * 1336 *
@@ -754,10 +1406,21 @@ int __ocfs2_add_entry(handle_t *handle,
754 status = ocfs2_journal_access_di(handle, dir, 1406 status = ocfs2_journal_access_di(handle, dir,
755 insert_bh, 1407 insert_bh,
756 OCFS2_JOURNAL_ACCESS_WRITE); 1408 OCFS2_JOURNAL_ACCESS_WRITE);
757 else 1409 else {
758 status = ocfs2_journal_access_db(handle, dir, 1410 status = ocfs2_journal_access_db(handle, dir,
759 insert_bh, 1411 insert_bh,
760 OCFS2_JOURNAL_ACCESS_WRITE); 1412 OCFS2_JOURNAL_ACCESS_WRITE);
1413 if (ocfs2_dir_indexed(dir)) {
1414 status = ocfs2_dx_dir_leaf_insert(dir,
1415 handle,
1416 lookup);
1417 if (status) {
1418 mlog_errno(status);
1419 goto bail;
1420 }
1421 }
1422 }
1423
761 /* By now the buffer is marked for journaling */ 1424 /* By now the buffer is marked for journaling */
762 offset += le16_to_cpu(de->rec_len); 1425 offset += le16_to_cpu(de->rec_len);
763 if (le64_to_cpu(de->inode)) { 1426 if (le64_to_cpu(de->inode)) {
@@ -887,6 +1550,10 @@ out:
887 return 0; 1550 return 0;
888} 1551}
889 1552
1553/*
1554 * NOTE: This function can be called against unindexed directories,
1555 * and indexed ones.
1556 */
890static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1557static int ocfs2_dir_foreach_blk_el(struct inode *inode,
891 u64 *f_version, 1558 u64 *f_version,
892 loff_t *f_pos, void *priv, 1559 loff_t *f_pos, void *priv,
@@ -1184,6 +1851,8 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1184 * routine to check that the specified directory is empty (for rmdir) 1851 * routine to check that the specified directory is empty (for rmdir)
1185 * 1852 *
1186 * Returns 1 if dir is empty, zero otherwise. 1853 * Returns 1 if dir is empty, zero otherwise.
1854 *
1855 * XXX: This is a performance problem
1187 */ 1856 */
1188int ocfs2_empty_dir(struct inode *inode) 1857int ocfs2_empty_dir(struct inode *inode)
1189{ 1858{
@@ -1285,7 +1954,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1285 struct inode *parent, 1954 struct inode *parent,
1286 struct inode *inode, 1955 struct inode *inode,
1287 struct buffer_head *fe_bh, 1956 struct buffer_head *fe_bh,
1288 struct ocfs2_alloc_context *data_ac) 1957 struct ocfs2_alloc_context *data_ac,
1958 struct buffer_head **ret_new_bh)
1289{ 1959{
1290 int status; 1960 int status;
1291 unsigned int size = osb->sb->s_blocksize; 1961 unsigned int size = osb->sb->s_blocksize;
@@ -1334,6 +2004,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1334 } 2004 }
1335 2005
1336 status = 0; 2006 status = 0;
2007 if (ret_new_bh) {
2008 *ret_new_bh = new_bh;
2009 new_bh = NULL;
2010 }
1337bail: 2011bail:
1338 brelse(new_bh); 2012 brelse(new_bh);
1339 2013
@@ -1341,20 +2015,382 @@ bail:
1341 return status; 2015 return status;
1342} 2016}
1343 2017
2018static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2019 handle_t *handle, struct inode *dir,
2020 struct buffer_head *di_bh,
2021 struct ocfs2_alloc_context *meta_ac,
2022 struct buffer_head **ret_dx_root_bh)
2023{
2024 int ret;
2025 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2026 u16 dr_suballoc_bit;
2027 u64 dr_blkno;
2028 unsigned int num_bits;
2029 struct buffer_head *dx_root_bh = NULL;
2030 struct ocfs2_dx_root_block *dx_root;
2031
2032 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
2033 &num_bits, &dr_blkno);
2034 if (ret) {
2035 mlog_errno(ret);
2036 goto out;
2037 }
2038
2039 mlog(0, "Dir %llu, attach new index block: %llu\n",
2040 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2041 (unsigned long long)dr_blkno);
2042
2043 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2044 if (dx_root_bh == NULL) {
2045 ret = -EIO;
2046 goto out;
2047 }
2048 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
2049
2050 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
2051 OCFS2_JOURNAL_ACCESS_CREATE);
2052 if (ret < 0) {
2053 mlog_errno(ret);
2054 goto out;
2055 }
2056
2057 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2058 memset(dx_root, 0, osb->sb->s_blocksize);
2059 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2060 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
2061 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2062 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2063 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2064 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2065 dx_root->dr_list.l_count =
2066 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2067
2068 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2069 if (ret)
2070 mlog_errno(ret);
2071
2072 ret = ocfs2_journal_access_di(handle, dir, di_bh,
2073 OCFS2_JOURNAL_ACCESS_CREATE);
2074 if (ret) {
2075 mlog_errno(ret);
2076 goto out;
2077 }
2078
2079 di->i_dx_root = cpu_to_le64(dr_blkno);
2080
2081 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2082 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2083
2084 ret = ocfs2_journal_dirty(handle, di_bh);
2085 if (ret)
2086 mlog_errno(ret);
2087
2088 *ret_dx_root_bh = dx_root_bh;
2089 dx_root_bh = NULL;
2090
2091out:
2092 brelse(dx_root_bh);
2093 return ret;
2094}
2095
2096static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2097 handle_t *handle, struct inode *dir,
2098 struct buffer_head **dx_leaves,
2099 int num_dx_leaves, u64 start_blk)
2100{
2101 int ret, i;
2102 struct ocfs2_dx_leaf *dx_leaf;
2103 struct buffer_head *bh;
2104
2105 for (i = 0; i < num_dx_leaves; i++) {
2106 bh = sb_getblk(osb->sb, start_blk + i);
2107 if (bh == NULL) {
2108 ret = -EIO;
2109 goto out;
2110 }
2111 dx_leaves[i] = bh;
2112
2113 ocfs2_set_new_buffer_uptodate(dir, bh);
2114
2115 ret = ocfs2_journal_access_dl(handle, dir, bh,
2116 OCFS2_JOURNAL_ACCESS_CREATE);
2117 if (ret < 0) {
2118 mlog_errno(ret);
2119 goto out;
2120 }
2121
2122 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2123
2124 memset(dx_leaf, 0, osb->sb->s_blocksize);
2125 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2126 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2127 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2128 dx_leaf->dl_list.de_count =
2129 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2130
2131 mlog(0,
2132 "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
2133 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2134 (unsigned long long)bh->b_blocknr,
2135 le16_to_cpu(dx_leaf->dl_list.de_count));
2136
2137 ocfs2_journal_dirty(handle, bh);
2138 }
2139
2140 ret = 0;
2141out:
2142 return ret;
2143}
2144
2145/*
2146 * Allocates and formats a new cluster for use in an indexed dir
2147 * leaf. This version will not do the extent insert, so that it can be
2148 * used by operations which need careful ordering.
2149 */
2150static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2151 u32 cpos, handle_t *handle,
2152 struct ocfs2_alloc_context *data_ac,
2153 struct buffer_head **dx_leaves,
2154 int num_dx_leaves, u64 *ret_phys_blkno)
2155{
2156 int ret;
2157 u32 phys, num;
2158 u64 phys_blkno;
2159 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2160
2161 /*
2162 * XXX: For create, this should claim cluster for the index
2163 * *before* the unindexed insert so that we have a better
2164 * chance of contiguousness as the directory grows in number
2165 * of entries.
2166 */
2167 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
2168 if (ret) {
2169 mlog_errno(ret);
2170 goto out;
2171 }
2172
2173 /*
2174 * Format the new cluster first. That way, we're inserting
2175 * valid data.
2176 */
2177 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2178 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2179 num_dx_leaves, phys_blkno);
2180 if (ret) {
2181 mlog_errno(ret);
2182 goto out;
2183 }
2184
2185 *ret_phys_blkno = phys_blkno;
2186out:
2187 return ret;
2188}
2189
2190static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2191 struct ocfs2_extent_tree *et,
2192 u32 cpos, handle_t *handle,
2193 struct ocfs2_alloc_context *data_ac,
2194 struct ocfs2_alloc_context *meta_ac,
2195 struct buffer_head **dx_leaves,
2196 int num_dx_leaves)
2197{
2198 int ret;
2199 u64 phys_blkno;
2200 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2201
2202 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2203 num_dx_leaves, &phys_blkno);
2204 if (ret) {
2205 mlog_errno(ret);
2206 goto out;
2207 }
2208
2209 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
2210 meta_ac);
2211 if (ret)
2212 mlog_errno(ret);
2213out:
2214 return ret;
2215}
2216
2217static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2218 int *ret_num_leaves)
2219{
2220 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2221 struct buffer_head **dx_leaves;
2222
2223 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2224 GFP_NOFS);
2225 if (dx_leaves && ret_num_leaves)
2226 *ret_num_leaves = num_dx_leaves;
2227
2228 return dx_leaves;
2229}
2230
2231static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2232 handle_t *handle,
2233 struct inode *parent,
2234 struct inode *inode,
2235 struct buffer_head *di_bh,
2236 struct ocfs2_alloc_context *data_ac,
2237 struct ocfs2_alloc_context *meta_ac)
2238{
2239 int ret, num_dx_leaves, i;
2240 struct buffer_head *leaf_bh = NULL;
2241 struct buffer_head *dx_root_bh = NULL;
2242 struct buffer_head **dx_leaves = NULL;
2243 struct ocfs2_extent_tree et;
2244 struct ocfs2_dx_hinfo hinfo;
2245 u64 insert_blkno;
2246
2247 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
2248 if (!dx_leaves) {
2249 ret = -ENOMEM;
2250 mlog_errno(ret);
2251 goto out;
2252 }
2253
2254 /*
2255 * Our strategy is to create the directory as though it were
2256 * unindexed, then add the index block. This works with very
2257 * little complication since the state of a new directory is a
2258 * very well known quantity.
2259 *
2260 * Essentially, we have two dirents ("." and ".."), in the 1st
2261 * block which need indexing.
2262 */
2263
2264 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2265 data_ac, &leaf_bh);
2266 if (ret) {
2267 mlog_errno(ret);
2268 goto out;
2269 }
2270
2271 /*
2272 * Allocate and format the index leaf first, before attaching
2273 * the index root. That way we're sure that the main bitmap
2274 * won't -enospc on us with a half-created dir index.
2275 *
2276 * The meta data allocation for our index block will not
2277 * -enospc on us unless there is a disk corruption.
2278 */
2279
2280 ret = __ocfs2_dx_dir_new_cluster(inode, 0, handle, data_ac, dx_leaves,
2281 num_dx_leaves, &insert_blkno);
2282 if (ret) {
2283 mlog_errno(ret);
2284 goto out;
2285 }
2286
2287 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2288 i = ocfs2_dx_dir_hash_idx(osb, &hinfo);
2289 ret = __ocfs2_dx_dir_leaf_insert(inode, handle, &hinfo,
2290 leaf_bh->b_blocknr, dx_leaves[i]);
2291 if (ret) {
2292 mlog_errno(ret);
2293 goto out;
2294 }
2295
2296 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2297 i = ocfs2_dx_dir_hash_idx(osb, &hinfo);
2298 ret = __ocfs2_dx_dir_leaf_insert(inode, handle, &hinfo,
2299 leaf_bh->b_blocknr, dx_leaves[i]);
2300 if (ret) {
2301 mlog_errno(ret);
2302 goto out;
2303 }
2304
2305 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, meta_ac,
2306 &dx_root_bh);
2307 if (ret) {
2308 mlog_errno(ret);
2309 goto out;
2310 }
2311
2312 /* This should never fail considering we start with an empty
2313 * dx_root. */
2314 ocfs2_init_dx_root_extent_tree(&et, inode, dx_root_bh);
2315 ret = ocfs2_insert_extent(osb, handle, inode, &et, 0,
2316 insert_blkno, 1, 0, NULL);
2317 if (ret)
2318 mlog_errno(ret);
2319
2320out:
2321 if (dx_leaves) {
2322 for (i = 0; i < num_dx_leaves; i++)
2323 brelse(dx_leaves[i]);
2324 kfree(dx_leaves);
2325 }
2326 brelse(dx_root_bh);
2327 brelse(leaf_bh);
2328 return ret;
2329}
2330
1344int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2331int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1345 handle_t *handle, 2332 handle_t *handle,
1346 struct inode *parent, 2333 struct inode *parent,
1347 struct inode *inode, 2334 struct inode *inode,
1348 struct buffer_head *fe_bh, 2335 struct buffer_head *fe_bh,
1349 struct ocfs2_alloc_context *data_ac) 2336 struct ocfs2_alloc_context *data_ac,
2337 struct ocfs2_alloc_context *meta_ac)
2338
1350{ 2339{
1351 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2340 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1352 2341
1353 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2342 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1354 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2343 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1355 2344
2345 if (ocfs2_supports_indexed_dirs(osb))
2346 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2347 data_ac, meta_ac);
2348
1356 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2349 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1357 data_ac); 2350 data_ac, NULL);
2351}
2352
2353static int ocfs2_dx_dir_index_block(struct inode *dir,
2354 handle_t *handle,
2355 struct buffer_head **dx_leaves,
2356 int num_dx_leaves,
2357 struct buffer_head *dirent_bh)
2358{
2359 int ret, namelen, i;
2360 char *de_buf, *limit;
2361 struct ocfs2_dir_entry *de;
2362 struct buffer_head *dx_leaf_bh;
2363 struct ocfs2_dx_hinfo hinfo;
2364 u64 dirent_blk = dirent_bh->b_blocknr;
2365
2366 de_buf = dirent_bh->b_data;
2367 limit = de_buf + dir->i_sb->s_blocksize;
2368
2369 while (de_buf < limit) {
2370 de = (struct ocfs2_dir_entry *)de_buf;
2371
2372 namelen = de->name_len;
2373 if (!namelen || !de->inode)
2374 goto inc;
2375
2376 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2377
2378 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2379 dx_leaf_bh = dx_leaves[i];
2380
2381 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2382 dirent_blk, dx_leaf_bh);
2383 if (ret) {
2384 mlog_errno(ret);
2385 goto out;
2386 }
2387
2388inc:
2389 de_buf += le16_to_cpu(de->rec_len);
2390 }
2391
2392out:
2393 return ret;
1358} 2394}
1359 2395
1360/* 2396/*
@@ -1401,29 +2437,57 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1401 */ 2437 */
1402static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2438static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1403 unsigned int blocks_wanted, 2439 unsigned int blocks_wanted,
2440 struct ocfs2_dir_lookup_result *lookup,
1404 struct buffer_head **first_block_bh) 2441 struct buffer_head **first_block_bh)
1405{ 2442{
1406 u32 alloc, bit_off, len; 2443 u32 alloc, dx_alloc, bit_off, len;
1407 struct super_block *sb = dir->i_sb; 2444 struct super_block *sb = dir->i_sb;
1408 int ret, credits = ocfs2_inline_to_extents_credits(sb); 2445 int ret, i, num_dx_leaves = 0,
1409 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 2446 credits = ocfs2_inline_to_extents_credits(sb);
2447 u64 dx_insert_blkno, blkno,
2448 bytes = blocks_wanted << sb->s_blocksize_bits;
1410 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2449 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1411 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2450 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1412 struct ocfs2_alloc_context *data_ac; 2451 struct ocfs2_alloc_context *data_ac;
2452 struct ocfs2_alloc_context *meta_ac = NULL;
1413 struct buffer_head *dirdata_bh = NULL; 2453 struct buffer_head *dirdata_bh = NULL;
2454 struct buffer_head *dx_root_bh = NULL;
2455 struct buffer_head **dx_leaves = NULL;
1414 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2456 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1415 handle_t *handle; 2457 handle_t *handle;
1416 struct ocfs2_extent_tree et; 2458 struct ocfs2_extent_tree et;
1417 int did_quota = 0; 2459 struct ocfs2_extent_tree dx_et;
2460 int did_quota = 0, bytes_allocated = 0;
1418 2461
1419 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2462 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1420 2463
1421 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2464 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2465 dx_alloc = 0;
2466
2467 if (ocfs2_supports_indexed_dirs(osb)) {
2468 /* Add one more cluster for an index leaf */
2469 dx_alloc++;
2470 credits += ocfs2_add_dir_index_credits(sb);
2471
2472 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, &num_dx_leaves);
2473 if (!dx_leaves) {
2474 ret = -ENOMEM;
2475 mlog_errno(ret);
2476 goto out;
2477 }
2478
2479 /* This gets us the dx_root */
2480 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485 }
1422 2486
1423 /* 2487 /*
1424 * We should never need more than 2 clusters for this - 2488 * We should never need more than 2 clusters for the unindexed
1425 * maximum dirent size is far less than one block. In fact, 2489 * tree - maximum dirent size is far less than one block. In
1426 * the only time we'd need more than one cluster is if 2490 * fact, the only time we'd need more than one cluster is if
1427 * blocksize == clustersize and the dirent won't fit in the 2491 * blocksize == clustersize and the dirent won't fit in the
1428 * extra space that the expansion to a single block gives. As 2492 * extra space that the expansion to a single block gives. As
1429 * of today, that only happens on 4k/4k file systems. 2493 * of today, that only happens on 4k/4k file systems.
@@ -1440,7 +2504,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1440 2504
1441 /* 2505 /*
1442 * Prepare for worst case allocation scenario of two separate 2506 * Prepare for worst case allocation scenario of two separate
1443 * extents. 2507 * extents in the unindexed tree.
1444 */ 2508 */
1445 if (alloc == 2) 2509 if (alloc == 2)
1446 credits += OCFS2_SUBALLOC_ALLOC; 2510 credits += OCFS2_SUBALLOC_ALLOC;
@@ -1453,11 +2517,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1453 } 2517 }
1454 2518
1455 if (vfs_dq_alloc_space_nodirty(dir, 2519 if (vfs_dq_alloc_space_nodirty(dir,
1456 ocfs2_clusters_to_bytes(osb->sb, alloc))) { 2520 ocfs2_clusters_to_bytes(osb->sb,
2521 alloc + dx_alloc))) {
1457 ret = -EDQUOT; 2522 ret = -EDQUOT;
1458 goto out_commit; 2523 goto out_commit;
1459 } 2524 }
1460 did_quota = 1; 2525 did_quota = 1;
2526
2527 if (ocfs2_supports_indexed_dirs(osb)) {
2528 /*
2529 * Allocate our index cluster first, to maximize the
2530 * possibility that unindexed leaves grow
2531 * contiguously.
2532 */
2533 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2534 dx_leaves, num_dx_leaves,
2535 &dx_insert_blkno);
2536 if (ret) {
2537 mlog_errno(ret);
2538 goto out_commit;
2539 }
2540 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2541 }
2542
1461 /* 2543 /*
1462 * Try to claim as many clusters as the bitmap can give though 2544 * Try to claim as many clusters as the bitmap can give though
1463 * if we only get one now, that's enough to continue. The rest 2545 * if we only get one now, that's enough to continue. The rest
@@ -1468,6 +2550,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1468 mlog_errno(ret); 2550 mlog_errno(ret);
1469 goto out_commit; 2551 goto out_commit;
1470 } 2552 }
2553 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1471 2554
1472 /* 2555 /*
1473 * Operations are carefully ordered so that we set up the new 2556 * Operations are carefully ordered so that we set up the new
@@ -1504,6 +2587,15 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1504 goto out_commit; 2587 goto out_commit;
1505 } 2588 }
1506 2589
2590 if (ocfs2_supports_indexed_dirs(osb)) {
2591 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
2592 num_dx_leaves, dirdata_bh);
2593 if (ret) {
2594 mlog_errno(ret);
2595 goto out_commit;
2596 }
2597 }
2598
1507 /* 2599 /*
1508 * Set extent, i_size, etc on the directory. After this, the 2600 * Set extent, i_size, etc on the directory. After this, the
1509 * inode should contain the same exact dirents as before and 2601 * inode should contain the same exact dirents as before and
@@ -1556,6 +2648,21 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1556 goto out_commit; 2648 goto out_commit;
1557 } 2649 }
1558 2650
2651 if (ocfs2_supports_indexed_dirs(osb)) {
2652 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
2653 meta_ac, &dx_root_bh);
2654 if (ret) {
2655 mlog_errno(ret);
2656 goto out_commit;
2657 }
2658
2659 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
2660 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
2661 dx_insert_blkno, 1, 0, NULL);
2662 if (ret)
2663 mlog_errno(ret);
2664 }
2665
1559 /* 2666 /*
1560 * We asked for two clusters, but only got one in the 1st 2667 * We asked for two clusters, but only got one in the 1st
1561 * pass. Claim the 2nd cluster as a separate extent. 2668 * pass. Claim the 2nd cluster as a separate extent.
@@ -1575,15 +2682,28 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1575 mlog_errno(ret); 2682 mlog_errno(ret);
1576 goto out_commit; 2683 goto out_commit;
1577 } 2684 }
2685 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1578 } 2686 }
1579 2687
1580 *first_block_bh = dirdata_bh; 2688 *first_block_bh = dirdata_bh;
1581 dirdata_bh = NULL; 2689 dirdata_bh = NULL;
2690 if (ocfs2_supports_indexed_dirs(osb)) {
2691 unsigned int off;
2692
2693 /*
2694 * We need to return the correct block within the
2695 * cluster which should hold our entry.
2696 */
2697 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
2698 &lookup->dl_hinfo);
2699 get_bh(dx_leaves[off]);
2700 lookup->dl_dx_leaf_bh = dx_leaves[off];
2701 }
1582 2702
1583out_commit: 2703out_commit:
1584 if (ret < 0 && did_quota) 2704 if (ret < 0 && did_quota)
1585 vfs_dq_free_space_nodirty(dir, 2705 vfs_dq_free_space_nodirty(dir, bytes_allocated);
1586 ocfs2_clusters_to_bytes(osb->sb, 2)); 2706
1587 ocfs2_commit_trans(osb, handle); 2707 ocfs2_commit_trans(osb, handle);
1588 2708
1589out_sem: 2709out_sem:
@@ -1592,8 +2712,17 @@ out_sem:
1592out: 2712out:
1593 if (data_ac) 2713 if (data_ac)
1594 ocfs2_free_alloc_context(data_ac); 2714 ocfs2_free_alloc_context(data_ac);
2715 if (meta_ac)
2716 ocfs2_free_alloc_context(meta_ac);
2717
2718 if (dx_leaves) {
2719 for (i = 0; i < num_dx_leaves; i++)
2720 brelse(dx_leaves[i]);
2721 kfree(dx_leaves);
2722 }
1595 2723
1596 brelse(dirdata_bh); 2724 brelse(dirdata_bh);
2725 brelse(dx_root_bh);
1597 2726
1598 return ret; 2727 return ret;
1599} 2728}
@@ -1668,6 +2797,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1668 struct inode *dir, 2797 struct inode *dir,
1669 struct buffer_head *parent_fe_bh, 2798 struct buffer_head *parent_fe_bh,
1670 unsigned int blocks_wanted, 2799 unsigned int blocks_wanted,
2800 struct ocfs2_dir_lookup_result *lookup,
1671 struct buffer_head **new_de_bh) 2801 struct buffer_head **new_de_bh)
1672{ 2802{
1673 int status = 0; 2803 int status = 0;
@@ -1687,7 +2817,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1687 2817
1688 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2818 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1689 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 2819 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1690 blocks_wanted, &new_bh); 2820 blocks_wanted, lookup,
2821 &new_bh);
1691 if (status) { 2822 if (status) {
1692 mlog_errno(status); 2823 mlog_errno(status);
1693 goto bail; 2824 goto bail;
@@ -1975,6 +3106,487 @@ bail:
1975 return status; 3106 return status;
1976} 3107}
1977 3108
3109static int dx_leaf_sort_cmp(const void *a, const void *b)
3110{
3111 const struct ocfs2_dx_entry *entry1 = a;
3112 const struct ocfs2_dx_entry *entry2 = b;
3113 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3114 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3115 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3116 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3117
3118 if (major_hash1 > major_hash2)
3119 return 1;
3120 if (major_hash1 < major_hash2)
3121 return -1;
3122
3123 /*
3124 * It is not strictly necessary to sort by minor
3125 */
3126 if (minor_hash1 > minor_hash2)
3127 return 1;
3128 if (minor_hash1 < minor_hash2)
3129 return -1;
3130 return 0;
3131}
3132
3133static void dx_leaf_sort_swap(void *a, void *b, int size)
3134{
3135 struct ocfs2_dx_entry *entry1 = a;
3136 struct ocfs2_dx_entry *entry2 = b;
3137 struct ocfs2_dx_entry tmp;
3138
3139 BUG_ON(size != sizeof(*entry1));
3140
3141 tmp = *entry1;
3142 *entry1 = *entry2;
3143 *entry2 = tmp;
3144}
3145
3146static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3147{
3148 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3149 int i, num = le16_to_cpu(dl_list->de_num_used);
3150
3151 for (i = 0; i < (num - 1); i++) {
3152 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3153 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3154 return 0;
3155 }
3156
3157 return 1;
3158}
3159
3160/*
3161 * Find the optimal value to split this leaf on. This expects the leaf
3162 * entries to be in sorted order.
3163 *
3164 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3165 * the hash we want to insert.
3166 *
3167 * This function is only concerned with the major hash - that which
3168 * determines which cluster an item belongs to.
3169 */
3170static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3171 u32 leaf_cpos, u32 insert_hash,
3172 u32 *split_hash)
3173{
3174 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3175 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3176 int allsame;
3177
3178 /*
3179 * There's a couple rare, but nasty corner cases we have to
3180 * check for here. All of them involve a leaf where all value
3181 * have the same hash, which is what we look for first.
3182 *
3183 * Most of the time, all of the above is false, and we simply
3184 * pick the median value for a split.
3185 */
3186 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3187 if (allsame) {
3188 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3189
3190 if (val == insert_hash) {
3191 /*
3192 * No matter where we would choose to split,
3193 * the new entry would want to occupy the same
3194 * block as these. Since there's no space left
3195 * in their existing block, we know there
3196 * won't be space after the split.
3197 */
3198 return -ENOSPC;
3199 }
3200
3201 if (val == leaf_cpos) {
3202 /*
3203 * Because val is the same as leaf_cpos (which
3204 * is the smallest value this leaf can have),
3205 * yet is not equal to insert_hash, then we
3206 * know that insert_hash *must* be larger than
3207 * val (and leaf_cpos). At least cpos+1 in value.
3208 *
3209 * We also know then, that there cannot be an
3210 * adjacent extent (otherwise we'd be looking
3211 * at it). Choosing this value gives us a
3212 * chance to get some contiguousness.
3213 */
3214 *split_hash = leaf_cpos + 1;
3215 return 0;
3216 }
3217
3218 if (val > insert_hash) {
3219 /*
3220 * val can not be the same as insert hash, and
3221 * also must be larger than leaf_cpos. Also,
3222 * we know that there can't be a leaf between
3223 * cpos and val, otherwise the entries with
3224 * hash 'val' would be there.
3225 */
3226 *split_hash = val;
3227 return 0;
3228 }
3229
3230 *split_hash = insert_hash;
3231 return 0;
3232 }
3233
3234 /*
3235 * Since the records are sorted and the checks above
3236 * guaranteed that not all records in this block are the same,
3237 * we simple travel forward, from the median, and pick the 1st
3238 * record whose value is larger than leaf_cpos.
3239 */
3240 for (i = (num_used / 2); i < num_used; i++)
3241 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3242 leaf_cpos)
3243 break;
3244
3245 BUG_ON(i == num_used); /* Should be impossible */
3246 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3247 return 0;
3248}
3249
3250/*
3251 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3252 * larger than split_hash into new_dx_leaves. We use a temporary
3253 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3254 *
3255 * Since the block offset inside a leaf (cluster) is a constant mask
3256 * of minor_hash, we can optimize - an item at block offset X within
3257 * the original cluster, will be at offset X within the new cluster.
3258 */
3259static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3260 handle_t *handle,
3261 struct ocfs2_dx_leaf *tmp_dx_leaf,
3262 struct buffer_head **orig_dx_leaves,
3263 struct buffer_head **new_dx_leaves,
3264 int num_dx_leaves)
3265{
3266 int i, j, num_used;
3267 u32 major_hash;
3268 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3269 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3270 struct ocfs2_dx_entry *dx_entry;
3271
3272 tmp_list = &tmp_dx_leaf->dl_list;
3273
3274 for (i = 0; i < num_dx_leaves; i++) {
3275 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3276 orig_list = &orig_dx_leaf->dl_list;
3277 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3278 new_list = &new_dx_leaf->dl_list;
3279
3280 num_used = le16_to_cpu(orig_list->de_num_used);
3281
3282 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3283 tmp_list->de_num_used = cpu_to_le16(0);
3284 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3285
3286 for (j = 0; j < num_used; j++) {
3287 dx_entry = &orig_list->de_entries[j];
3288 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3289 if (major_hash >= split_hash)
3290 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3291 dx_entry);
3292 else
3293 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3294 dx_entry);
3295 }
3296 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3297
3298 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3299 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3300 }
3301}
3302
3303static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3304 struct ocfs2_dx_root_block *dx_root)
3305{
3306 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3307
3308 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3309 credits += ocfs2_quota_trans_credits(osb->sb);
3310 return credits;
3311}
3312
3313/*
3314 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3315 * half our entries into.
3316 */
3317static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3318 struct buffer_head *dx_root_bh,
3319 struct buffer_head *dx_leaf_bh,
3320 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3321 u64 leaf_blkno)
3322{
3323 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3324 int credits, ret, i, num_used, did_quota = 0;
3325 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3326 u64 orig_leaves_start;
3327 int num_dx_leaves;
3328 struct buffer_head **orig_dx_leaves = NULL;
3329 struct buffer_head **new_dx_leaves = NULL;
3330 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3331 struct ocfs2_extent_tree et;
3332 handle_t *handle = NULL;
3333 struct ocfs2_dx_root_block *dx_root;
3334 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3335
3336 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3337 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3338 (unsigned long long)leaf_blkno, insert_hash);
3339
3340 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3341
3342 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3343 /*
3344 * XXX: This is a rather large limit. We should use a more
3345 * realistic value.
3346 */
3347 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3348 return -ENOSPC;
3349
3350 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3351 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3352 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
3353 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3354 (unsigned long long)leaf_blkno, num_used);
3355 ret = -EIO;
3356 goto out;
3357 }
3358
3359 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3360 if (!orig_dx_leaves) {
3361 ret = -ENOMEM;
3362 mlog_errno(ret);
3363 goto out;
3364 }
3365
3366 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3367 if (!new_dx_leaves) {
3368 ret = -ENOMEM;
3369 mlog_errno(ret);
3370 goto out;
3371 }
3372
3373 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3374 if (ret) {
3375 if (ret != -ENOSPC)
3376 mlog_errno(ret);
3377 goto out;
3378 }
3379
3380 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3381 handle = ocfs2_start_trans(osb, credits);
3382 if (IS_ERR(handle)) {
3383 ret = PTR_ERR(handle);
3384 handle = NULL;
3385 mlog_errno(ret);
3386 goto out;
3387 }
3388
3389 if (vfs_dq_alloc_space_nodirty(dir,
3390 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3391 ret = -EDQUOT;
3392 goto out_commit;
3393 }
3394 did_quota = 1;
3395
3396 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3397 OCFS2_JOURNAL_ACCESS_WRITE);
3398 if (ret) {
3399 mlog_errno(ret);
3400 goto out_commit;
3401 }
3402
3403 /*
3404 * This block is changing anyway, so we can sort it in place.
3405 */
3406 sort(dx_leaf->dl_list.de_entries, num_used,
3407 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3408 dx_leaf_sort_swap);
3409
3410 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3411 if (ret) {
3412 mlog_errno(ret);
3413 goto out_commit;
3414 }
3415
3416 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3417 &split_hash);
3418 if (ret) {
3419 mlog_errno(ret);
3420 goto out_commit;
3421 }
3422
3423 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3424 leaf_cpos, split_hash, insert_hash);
3425
3426 /*
3427 * We have to carefully order operations here. There are items
3428 * which want to be in the new cluster before insert, but in
3429 * order to put those items in the new cluster, we alter the
3430 * old cluster. A failure to insert gets nasty.
3431 *
3432 * So, start by reserving writes to the old
3433 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3434 * the new cluster for us, before inserting it. The insert
3435 * won't happen if there's an error before that. Once the
3436 * insert is done then, we can transfer from one leaf into the
3437 * other without fear of hitting any error.
3438 */
3439
3440 /*
3441 * The leaf transfer wants some scratch space so that we don't
3442 * wind up doing a bunch of expensive memmove().
3443 */
3444 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3445 if (!tmp_dx_leaf) {
3446 ret = -ENOMEM;
3447 mlog_errno(ret);
3448 goto out_commit;
3449 }
3450
3451 orig_leaves_start = leaf_blkno & ~(osb->s_clustersize_bits -
3452 osb->sb->s_blocksize_bits);
3453 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3454 orig_dx_leaves);
3455 if (ret) {
3456 mlog_errno(ret);
3457 goto out_commit;
3458 }
3459
3460 for (i = 0; i < num_dx_leaves; i++) {
3461 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3462 OCFS2_JOURNAL_ACCESS_WRITE);
3463 if (ret) {
3464 mlog_errno(ret);
3465 goto out_commit;
3466 }
3467 }
3468
3469 cpos = split_hash;
3470 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3471 data_ac, meta_ac, new_dx_leaves,
3472 num_dx_leaves);
3473 if (ret) {
3474 mlog_errno(ret);
3475 goto out_commit;
3476 }
3477
3478 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3479 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3480
3481out_commit:
3482 if (ret < 0 && did_quota)
3483 vfs_dq_free_space_nodirty(dir,
3484 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3485
3486 ocfs2_commit_trans(osb, handle);
3487
3488out:
3489 if (orig_dx_leaves || new_dx_leaves) {
3490 for (i = 0; i < num_dx_leaves; i++) {
3491 if (orig_dx_leaves)
3492 brelse(orig_dx_leaves[i]);
3493 if (new_dx_leaves)
3494 brelse(new_dx_leaves[i]);
3495 }
3496 kfree(orig_dx_leaves);
3497 kfree(new_dx_leaves);
3498 }
3499
3500 if (meta_ac)
3501 ocfs2_free_alloc_context(meta_ac);
3502 if (data_ac)
3503 ocfs2_free_alloc_context(data_ac);
3504
3505 kfree(tmp_dx_leaf);
3506 return ret;
3507}
3508
3509static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
3510 struct buffer_head *di_bh, const char *name,
3511 int namelen,
3512 struct ocfs2_dir_lookup_result *lookup)
3513{
3514 int ret, rebalanced = 0;
3515 struct buffer_head *dx_root_bh = NULL;
3516 struct ocfs2_dx_root_block *dx_root;
3517 struct buffer_head *dx_leaf_bh = NULL;
3518 struct ocfs2_dx_leaf *dx_leaf;
3519 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3520 u64 blkno;
3521 u32 leaf_cpos;
3522
3523 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
3524 if (ret) {
3525 mlog_errno(ret);
3526 goto out;
3527 }
3528
3529 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3530
3531restart_search:
3532 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
3533 &leaf_cpos, &blkno);
3534 if (ret) {
3535 mlog_errno(ret);
3536 goto out;
3537 }
3538
3539 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
3540 if (ret) {
3541 mlog_errno(ret);
3542 goto out;
3543 }
3544
3545 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3546
3547 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
3548 le16_to_cpu(dx_leaf->dl_list.de_count)) {
3549 if (rebalanced) {
3550 /*
3551 * Rebalancing should have provided us with
3552 * space in an appropriate leaf.
3553 *
3554 * XXX: Is this an abnormal condition then?
3555 * Should we print a message here?
3556 */
3557 ret = -ENOSPC;
3558 goto out;
3559 }
3560
3561 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
3562 &lookup->dl_hinfo, leaf_cpos,
3563 blkno);
3564 if (ret) {
3565 if (ret != -ENOSPC)
3566 mlog_errno(ret);
3567 goto out;
3568 }
3569
3570 /*
3571 * Restart the lookup. The rebalance might have
3572 * changed which block our item fits into. Mark our
3573 * progress, so we only execute this once.
3574 */
3575 brelse(dx_leaf_bh);
3576 dx_leaf_bh = NULL;
3577 rebalanced = 1;
3578 goto restart_search;
3579 }
3580
3581 lookup->dl_dx_leaf_bh = dx_leaf_bh;
3582 dx_leaf_bh = NULL;
3583
3584out:
3585 brelse(dx_leaf_bh);
3586 brelse(dx_root_bh);
3587 return ret;
3588}
3589
1978/* 3590/*
1979 * Get a directory ready for insert. Any directory allocation required 3591 * Get a directory ready for insert. Any directory allocation required
1980 * happens here. Success returns zero, and enough context in the dir 3592 * happens here. Success returns zero, and enough context in the dir
@@ -2001,6 +3613,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2001 goto out; 3613 goto out;
2002 } 3614 }
2003 3615
3616 /*
3617 * Do this up front to reduce confusion.
3618 *
3619 * The directory might start inline, then be turned into an
3620 * indexed one, in which case we'd need to hash deep inside
3621 * ocfs2_find_dir_space_id(). Since
3622 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
3623 * done, there seems no point in spreading out the calls. We
3624 * can optimize away the case where the file system doesn't
3625 * support indexing.
3626 */
3627 if (ocfs2_supports_indexed_dirs(osb))
3628 ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
3629
3630 if (ocfs2_dir_indexed(dir)) {
3631 ret = ocfs2_find_dir_space_dx(osb, dir, parent_fe_bh, name,
3632 namelen, lookup);
3633 if (ret) {
3634 mlog_errno(ret);
3635 goto out;
3636 }
3637
3638 /*
3639 * We intentionally fall through so that the unindexed
3640 * tree can also be prepared.
3641 */
3642 }
3643
2004 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3644 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2005 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, 3645 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
2006 namelen, &bh, &blocks_wanted); 3646 namelen, &bh, &blocks_wanted);
@@ -2019,7 +3659,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2019 BUG_ON(bh); 3659 BUG_ON(bh);
2020 3660
2021 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, 3661 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
2022 &bh); 3662 lookup, &bh);
2023 if (ret) { 3663 if (ret) {
2024 if (ret != -ENOSPC) 3664 if (ret != -ENOSPC)
2025 mlog_errno(ret); 3665 mlog_errno(ret);
@@ -2035,3 +3675,145 @@ out:
2035 brelse(bh); 3675 brelse(bh);
2036 return ret; 3676 return ret;
2037} 3677}
3678
3679static int ocfs2_dx_dir_remove_index(struct inode *dir,
3680 struct buffer_head *di_bh,
3681 struct buffer_head *dx_root_bh)
3682{
3683 int ret;
3684 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
3685 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3686 struct ocfs2_dx_root_block *dx_root;
3687 struct inode *dx_alloc_inode = NULL;
3688 struct buffer_head *dx_alloc_bh = NULL;
3689 handle_t *handle;
3690 u64 blk;
3691 u16 bit;
3692 u64 bg_blkno;
3693
3694 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
3695
3696 dx_alloc_inode = ocfs2_get_system_file_inode(osb,
3697 EXTENT_ALLOC_SYSTEM_INODE,
3698 le16_to_cpu(dx_root->dr_suballoc_slot));
3699 if (!dx_alloc_inode) {
3700 ret = -ENOMEM;
3701 mlog_errno(ret);
3702 goto out;
3703 }
3704 mutex_lock(&dx_alloc_inode->i_mutex);
3705
3706 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
3707 if (ret) {
3708 mlog_errno(ret);
3709 goto out_mutex;
3710 }
3711
3712 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
3713 if (IS_ERR(handle)) {
3714 ret = PTR_ERR(handle);
3715 mlog_errno(ret);
3716 goto out_unlock;
3717 }
3718
3719 ret = ocfs2_journal_access_di(handle, dir, di_bh,
3720 OCFS2_JOURNAL_ACCESS_WRITE);
3721 if (ret) {
3722 mlog_errno(ret);
3723 goto out_commit;
3724 }
3725
3726 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
3727 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
3728 di->i_dx_root = cpu_to_le64(0ULL);
3729
3730 ocfs2_journal_dirty(handle, di_bh);
3731
3732 blk = le64_to_cpu(dx_root->dr_blkno);
3733 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
3734 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
3735 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
3736 bit, bg_blkno, 1);
3737 if (ret)
3738 mlog_errno(ret);
3739
3740out_commit:
3741 ocfs2_commit_trans(osb, handle);
3742
3743out_unlock:
3744 ocfs2_inode_unlock(dx_alloc_inode, 1);
3745
3746out_mutex:
3747 mutex_unlock(&dx_alloc_inode->i_mutex);
3748 brelse(dx_alloc_bh);
3749out:
3750 iput(dx_alloc_inode);
3751 return ret;
3752}
3753
3754int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
3755{
3756 int ret;
3757 unsigned int uninitialized_var(clen);
3758 u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
3759 u64 uninitialized_var(blkno);
3760 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
3761 struct buffer_head *dx_root_bh = NULL;
3762 struct ocfs2_dx_root_block *dx_root;
3763 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3764 struct ocfs2_cached_dealloc_ctxt dealloc;
3765 struct ocfs2_extent_tree et;
3766
3767 ocfs2_init_dealloc_ctxt(&dealloc);
3768
3769 if (!ocfs2_dir_indexed(dir))
3770 return 0;
3771
3772 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
3773 if (ret) {
3774 mlog_errno(ret);
3775 goto out;
3776 }
3777
3778 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3779
3780 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3781
3782 /* XXX: What if dr_clusters is too large? */
3783 while (le32_to_cpu(dx_root->dr_clusters)) {
3784 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
3785 major_hash, &cpos, &blkno, &clen);
3786 if (ret) {
3787 mlog_errno(ret);
3788 goto out;
3789 }
3790
3791 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
3792
3793 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
3794 &dealloc);
3795 if (ret) {
3796 mlog_errno(ret);
3797 goto out;
3798 }
3799
3800 if (cpos == 0)
3801 break;
3802
3803 major_hash = cpos - 1;
3804 }
3805
3806 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
3807 if (ret) {
3808 mlog_errno(ret);
3809 goto out;
3810 }
3811
3812 ocfs2_remove_from_cache(dir, dx_root_bh);
3813out:
3814 ocfs2_schedule_truncate_log_flush(osb, 1);
3815 ocfs2_run_deallocs(osb, &dealloc);
3816
3817 brelse(dx_root_bh);
3818 return ret;
3819}