aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
author Len Brown <len.brown@intel.com> 2009-04-05 02:14:15 -0400
committer Len Brown <len.brown@intel.com> 2009-04-05 02:14:15 -0400
commit 478c6a43fcbc6c11609f8cee7c7b57223907754f (patch)
tree a7f7952099da60d33032aed6de9c0c56c9f8779e /fs/ocfs2
parent 8a3f257c704e02aee9869decd069a806b45be3f1 (diff)
parent 6bb597507f9839b13498781e481f5458aea33620 (diff)
Merge branch 'linus' into release
Conflicts: arch/x86/kernel/cpu/cpufreq/longhaul.c Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- fs/ocfs2/acl.c 2
-rw-r--r-- fs/ocfs2/alloc.c 57
-rw-r--r-- fs/ocfs2/alloc.h 3
-rw-r--r-- fs/ocfs2/aops.c 23
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 96
-rw-r--r-- fs/ocfs2/cluster/heartbeat.h 3
-rw-r--r-- fs/ocfs2/cluster/nodemanager.c 9
-rw-r--r-- fs/ocfs2/dcache.c 2
-rw-r--r-- fs/ocfs2/dcache.h 2
-rw-r--r-- fs/ocfs2/dir.c 2806
-rw-r--r-- fs/ocfs2/dir.h 57
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h 58
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c 87
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c 29
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c 387
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c 20
-rw-r--r-- fs/ocfs2/dlmglue.c 46
-rw-r--r-- fs/ocfs2/dlmglue.h 2
-rw-r--r-- fs/ocfs2/export.c 84
-rw-r--r-- fs/ocfs2/inode.c 48
-rw-r--r-- fs/ocfs2/inode.h 5
-rw-r--r-- fs/ocfs2/journal.c 173
-rw-r--r-- fs/ocfs2/journal.h 77
-rw-r--r-- fs/ocfs2/localalloc.c 86
-rw-r--r-- fs/ocfs2/mmap.c 6
-rw-r--r-- fs/ocfs2/namei.c 250
-rw-r--r-- fs/ocfs2/ocfs2.h 76
-rw-r--r-- fs/ocfs2/ocfs2_fs.h 136
-rw-r--r-- fs/ocfs2/ocfs2_lockid.h 4
-rw-r--r-- fs/ocfs2/suballoc.c 254
-rw-r--r-- fs/ocfs2/suballoc.h 4
-rw-r--r-- fs/ocfs2/super.c 188
-rw-r--r-- fs/ocfs2/xattr.c 8
-rw-r--r-- fs/ocfs2/xattr.h 2
34 files changed, 4396 insertions, 694 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
296 return PTR_ERR(acl); 296 return PTR_ERR(acl);
297 } 297 }
298 if (!acl) 298 if (!acl)
299 inode->i_mode &= ~current->fs->umask; 299 inode->i_mode &= ~current_umask();
300 } 300 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 302 struct posix_acl *clone;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295}; 295};
296 296
297static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 u64 blkno)
299{
300 struct ocfs2_dx_root_block *dx_root = et->et_object;
301
302 dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303}
304
305static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306{
307 struct ocfs2_dx_root_block *dx_root = et->et_object;
308
309 return le64_to_cpu(dx_root->dr_last_eb_blk);
310}
311
312static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 struct ocfs2_extent_tree *et,
314 u32 clusters)
315{
316 struct ocfs2_dx_root_block *dx_root = et->et_object;
317
318 le32_add_cpu(&dx_root->dr_clusters, clusters);
319}
320
321static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 struct ocfs2_extent_tree *et)
323{
324 struct ocfs2_dx_root_block *dx_root = et->et_object;
325
326 BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327
328 return 0;
329}
330
331static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332{
333 struct ocfs2_dx_root_block *dx_root = et->et_object;
334
335 et->et_root_el = &dx_root->dr_list;
336}
337
338static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
340 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
341 .eo_update_clusters = ocfs2_dx_root_update_clusters,
342 .eo_sanity_check = ocfs2_dx_root_sanity_check,
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344};
345
297static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
298 struct inode *inode, 347 struct inode *inode,
299 struct buffer_head *bh, 348 struct buffer_head *bh,
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 &ocfs2_xattr_value_et_ops); 388 &ocfs2_xattr_value_et_ops);
340} 389}
341 390
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode,
393 struct buffer_head *bh)
394{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops);
397}
398
342static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
343 u64 new_last_eb_blk) 400 u64 new_last_eb_blk)
344{ 401{
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 76 struct inode *inode,
77 struct ocfs2_xattr_value_buf *vb); 77 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode,
80 struct buffer_head *bh);
78 81
79/* 82/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be 83 * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e1709a679b7..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1956} 1956}
1957 1957
1958const struct address_space_operations ocfs2_aops = { 1958const struct address_space_operations ocfs2_aops = {
1959 .readpage = ocfs2_readpage, 1959 .readpage = ocfs2_readpage,
1960 .readpages = ocfs2_readpages, 1960 .readpages = ocfs2_readpages,
1961 .writepage = ocfs2_writepage, 1961 .writepage = ocfs2_writepage,
1962 .write_begin = ocfs2_write_begin, 1962 .write_begin = ocfs2_write_begin,
1963 .write_end = ocfs2_write_end, 1963 .write_end = ocfs2_write_end,
1964 .bmap = ocfs2_bmap, 1964 .bmap = ocfs2_bmap,
1965 .sync_page = block_sync_page, 1965 .sync_page = block_sync_page,
1966 .direct_IO = ocfs2_direct_IO, 1966 .direct_IO = ocfs2_direct_IO,
1967 .invalidatepage = ocfs2_invalidatepage, 1967 .invalidatepage = ocfs2_invalidatepage,
1968 .releasepage = ocfs2_releasepage, 1968 .releasepage = ocfs2_releasepage,
1969 .migratepage = buffer_migrate_page, 1969 .migratepage = buffer_migrate_page,
1970 .is_partially_uptodate = block_is_partially_uptodate,
1970}; 1971};
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
33#include <linux/random.h> 33#include <linux/random.h>
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h>
36 37
37#include "heartbeat.h" 38#include "heartbeat.h"
38#include "tcp.h" 39#include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events); 61static LIST_HEAD(o2hb_node_events);
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 62static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62 63
64#define O2HB_DEBUG_DIR "o2hb"
65#define O2HB_DEBUG_LIVENODES "livenodes"
66static struct dentry *o2hb_debug_dir;
67static struct dentry *o2hb_debug_livenodes;
68
63static LIST_HEAD(o2hb_all_regions); 69static LIST_HEAD(o2hb_all_regions);
64 70
65static struct o2hb_callback { 71static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
905 return 0; 911 return 0;
906} 912}
907 913
908void o2hb_init(void) 914#ifdef CONFIG_DEBUG_FS
915static int o2hb_debug_open(struct inode *inode, struct file *file)
916{
917 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
918 char *buf = NULL;
919 int i = -1;
920 int out = 0;
921
922 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
923 if (!buf)
924 goto bail;
925
926 o2hb_fill_node_map(map, sizeof(map));
927
928 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
929 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
930 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
931
932 i_size_write(inode, out);
933
934 file->private_data = buf;
935
936 return 0;
937bail:
938 return -ENOMEM;
939}
940
941static int o2hb_debug_release(struct inode *inode, struct file *file)
942{
943 kfree(file->private_data);
944 return 0;
945}
946
947static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
948 size_t nbytes, loff_t *ppos)
949{
950 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
951 i_size_read(file->f_mapping->host));
952}
953#else
954static int o2hb_debug_open(struct inode *inode, struct file *file)
955{
956 return 0;
957}
958static int o2hb_debug_release(struct inode *inode, struct file *file)
959{
960 return 0;
961}
962static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
963 size_t nbytes, loff_t *ppos)
964{
965 return 0;
966}
967#endif /* CONFIG_DEBUG_FS */
968
969static struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read,
973 .llseek = generic_file_llseek,
974};
975
976void o2hb_exit(void)
977{
978 if (o2hb_debug_livenodes)
979 debugfs_remove(o2hb_debug_livenodes);
980 if (o2hb_debug_dir)
981 debugfs_remove(o2hb_debug_dir);
982}
983
984int o2hb_init(void)
909{ 985{
910 int i; 986 int i;
911 987
@@ -918,6 +994,24 @@ void o2hb_init(void)
918 INIT_LIST_HEAD(&o2hb_node_events); 994 INIT_LIST_HEAD(&o2hb_node_events);
919 995
920 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 996 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
997
998 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
999 if (!o2hb_debug_dir) {
1000 mlog_errno(-ENOMEM);
1001 return -ENOMEM;
1002 }
1003
1004 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
1005 S_IFREG|S_IRUSR,
1006 o2hb_debug_dir, NULL,
1007 &o2hb_debug_fops);
1008 if (!o2hb_debug_livenodes) {
1009 mlog_errno(-ENOMEM);
1010 debugfs_remove(o2hb_debug_dir);
1011 return -ENOMEM;
1012 }
1013
1014 return 0;
921} 1015}
922 1016
923/* if we're already in a callback then we're already serialized by the sem */ 1017/* if we're already in a callback then we're already serialized by the sem */
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc); 75 struct o2hb_callback_func *hc);
76void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
77 unsigned bytes); 77 unsigned bytes);
78void o2hb_init(void); 78void o2hb_exit(void);
79int o2hb_init(void);
79int o2hb_check_node_heartbeating(u8 node_num); 80int o2hb_check_node_heartbeating(u8 node_num);
80int o2hb_check_node_heartbeating_from_callback(u8 node_num); 81int o2hb_check_node_heartbeating_from_callback(u8 node_num);
81int o2hb_check_local_node_heartbeating(void); 82int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
881 o2cb_sys_shutdown(); 881 o2cb_sys_shutdown();
882 882
883 o2net_exit(); 883 o2net_exit();
884 o2hb_exit();
884} 885}
885 886
886static int __init init_o2nm(void) 887static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
889 890
890 cluster_print_version(); 891 cluster_print_version();
891 892
892 o2hb_init(); 893 ret = o2hb_init();
894 if (ret)
895 goto out;
893 896
894 ret = o2net_init(); 897 ret = o2net_init();
895 if (ret) 898 if (ret)
896 goto out; 899 goto out_o2hb;
897 900
898 ret = o2net_register_hb_callbacks(); 901 ret = o2net_register_hb_callbacks();
899 if (ret) 902 if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
916 o2net_unregister_hb_callbacks(); 919 o2net_unregister_hb_callbacks();
917out_o2net: 920out_o2net:
918 o2net_exit(); 921 o2net_exit();
922out_o2hb:
923 o2hb_exit();
919out: 924out:
920 return ret; 925 return ret;
921} 926}
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index e9d7c2038c0f..7d604480557a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -455,7 +455,7 @@ out_move:
455 d_move(dentry, target); 455 d_move(dentry, target);
456} 456}
457 457
458struct dentry_operations ocfs2_dentry_ops = { 458const struct dentry_operations ocfs2_dentry_ops = {
459 .d_revalidate = ocfs2_dentry_revalidate, 459 .d_revalidate = ocfs2_dentry_revalidate,
460 .d_iput = ocfs2_dentry_iput, 460 .d_iput = ocfs2_dentry_iput,
461}; 461};
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index d06e16c06640..faa12e75f98d 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -26,7 +26,7 @@
26#ifndef OCFS2_DCACHE_H 26#ifndef OCFS2_DCACHE_H
27#define OCFS2_DCACHE_H 27#define OCFS2_DCACHE_H
28 28
29extern struct dentry_operations ocfs2_dentry_ops; 29extern const struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */ 32 /* Use count of dentry lock */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..e71160cda110 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h>
44 45
45#define MLOG_MASK_PREFIX ML_NAMEI 46#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
71 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 73 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
72}; 74};
73 75
74static int ocfs2_extend_dir(struct ocfs2_super *osb,
75 struct inode *dir,
76 struct buffer_head *parent_fe_bh,
77 unsigned int blocks_wanted,
78 struct buffer_head **new_de_bh);
79static int ocfs2_do_extend_dir(struct super_block *sb, 76static int ocfs2_do_extend_dir(struct super_block *sb,
80 handle_t *handle, 77 handle_t *handle,
81 struct inode *dir, 78 struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
83 struct ocfs2_alloc_context *data_ac, 80 struct ocfs2_alloc_context *data_ac,
84 struct ocfs2_alloc_context *meta_ac, 81 struct ocfs2_alloc_context *meta_ac,
85 struct buffer_head **new_bh); 82 struct buffer_head **new_bh);
83static int ocfs2_dir_indexed(struct inode *inode);
86 84
87/* 85/*
88 * These are distinct checks because future versions of the file system will 86 * These are distinct checks because future versions of the file system will
89 * want to have a trailing dirent structure independent of indexing. 87 * want to have a trailing dirent structure independent of indexing.
90 */ 88 */
91static int ocfs2_dir_has_trailer(struct inode *dir) 89static int ocfs2_supports_dir_trailer(struct inode *dir)
92{ 90{
91 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
92
93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
94 return 0; 94 return 0;
95 95
96 return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); 96 return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
97} 97}
98 98
99static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) 99/*
100 * "new" here refers to the point at which we're creating a new
101 * directory via "mkdir()", but also when we're expanding an inline
102 * directory. In either case, we don't yet have the indexing bit set
103 * on the directory, so the standard checks will fail when metaecc
104 * is turned off. Only directory-initialization type functions should
105 * use this then. Everything else wants ocfs2_supports_dir_trailer()
106 */
107static int ocfs2_new_dir_wants_trailer(struct inode *dir)
100{ 108{
101 return ocfs2_meta_ecc(osb); 109 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
110
111 return ocfs2_meta_ecc(osb) ||
112 ocfs2_supports_indexed_dirs(osb);
102} 113}
103 114
104static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) 115static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
130{ 141{
131 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); 142 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
132 143
133 if (!ocfs2_dir_has_trailer(dir)) 144 if (!ocfs2_supports_dir_trailer(dir))
134 return 0; 145 return 0;
135 146
136 if (offset != toff) 147 if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
140} 151}
141 152
142static void ocfs2_init_dir_trailer(struct inode *inode, 153static void ocfs2_init_dir_trailer(struct inode *inode,
143 struct buffer_head *bh) 154 struct buffer_head *bh, u16 rec_len)
144{ 155{
145 struct ocfs2_dir_block_trailer *trailer; 156 struct ocfs2_dir_block_trailer *trailer;
146 157
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
150 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); 161 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
151 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 162 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
152 trailer->db_blkno = cpu_to_le64(bh->b_blocknr); 163 trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
164 trailer->db_free_rec_len = cpu_to_le16(rec_len);
165}
166/*
167 * Link an unindexed block with a dir trailer structure into the index free
168 * list. This function will modify dirdata_bh, but assumes you've already
169 * passed it to the journal.
170 */
171static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
172 struct buffer_head *dx_root_bh,
173 struct buffer_head *dirdata_bh)
174{
175 int ret;
176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer;
178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) {
182 mlog_errno(ret);
183 goto out;
184 }
185 trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
186 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
187
188 trailer->db_free_next = dx_root->dr_free_blk;
189 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
190
191 ocfs2_journal_dirty(handle, dx_root_bh);
192
193out:
194 return ret;
195}
196
197static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
198{
199 return res->dl_prev_leaf_bh == NULL;
200}
201
202void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
203{
204 brelse(res->dl_dx_root_bh);
205 brelse(res->dl_leaf_bh);
206 brelse(res->dl_dx_leaf_bh);
207 brelse(res->dl_prev_leaf_bh);
208}
209
210static int ocfs2_dir_indexed(struct inode *inode)
211{
212 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
213 return 1;
214 return 0;
215}
216
217static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
218{
219 return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
220}
221
222/*
223 * Hashing code adapted from ext3
224 */
225#define DELTA 0x9E3779B9
226
227static void TEA_transform(__u32 buf[4], __u32 const in[])
228{
229 __u32 sum = 0;
230 __u32 b0 = buf[0], b1 = buf[1];
231 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
232 int n = 16;
233
234 do {
235 sum += DELTA;
236 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
237 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
238 } while (--n);
239
240 buf[0] += b0;
241 buf[1] += b1;
242}
243
244static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
245{
246 __u32 pad, val;
247 int i;
248
249 pad = (__u32)len | ((__u32)len << 8);
250 pad |= pad << 16;
251
252 val = pad;
253 if (len > num*4)
254 len = num * 4;
255 for (i = 0; i < len; i++) {
256 if ((i % 4) == 0)
257 val = pad;
258 val = msg[i] + (val << 8);
259 if ((i % 4) == 3) {
260 *buf++ = val;
261 val = pad;
262 num--;
263 }
264 }
265 if (--num >= 0)
266 *buf++ = val;
267 while (--num >= 0)
268 *buf++ = pad;
269}
270
271static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
272 struct ocfs2_dx_hinfo *hinfo)
273{
274 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
275 const char *p;
276 __u32 in[8], buf[4];
277
278 /*
279 * XXX: Is this really necessary, if the index is never looked
280 * at by readdir? Is a hash value of '0' a bad idea?
281 */
282 if ((len == 1 && !strncmp(".", name, 1)) ||
283 (len == 2 && !strncmp("..", name, 2))) {
284 buf[0] = buf[1] = 0;
285 goto out;
286 }
287
288#ifdef OCFS2_DEBUG_DX_DIRS
289 /*
290 * This makes it very easy to debug indexing problems. We
291 * should never allow this to be selected without hand editing
292 * this file though.
293 */
294 buf[0] = buf[1] = len;
295 goto out;
296#endif
297
298 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
299
300 p = name;
301 while (len > 0) {
302 str2hashbuf(p, len, in, 4);
303 TEA_transform(buf, in);
304 len -= 16;
305 p += 16;
306 }
307
308out:
309 hinfo->major_hash = buf[0];
310 hinfo->minor_hash = buf[1];
153} 311}
154 312
155/* 313/*
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
312} 470}
313 471
314/* 472/*
473 * Validate a directory trailer.
474 *
475 * We check the trailer here rather than in ocfs2_validate_dir_block()
476 * because that function doesn't have the inode to test.
477 */
478static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
479{
480 int rc = 0;
481 struct ocfs2_dir_block_trailer *trailer;
482
483 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
484 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
485 rc = -EINVAL;
486 ocfs2_error(dir->i_sb,
487 "Invalid dirblock #%llu: "
488 "signature = %.*s\n",
489 (unsigned long long)bh->b_blocknr, 7,
490 trailer->db_signature);
491 goto out;
492 }
493 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
494 rc = -EINVAL;
495 ocfs2_error(dir->i_sb,
496 "Directory block #%llu has an invalid "
497 "db_blkno of %llu",
498 (unsigned long long)bh->b_blocknr,
499 (unsigned long long)le64_to_cpu(trailer->db_blkno));
500 goto out;
501 }
502 if (le64_to_cpu(trailer->db_parent_dinode) !=
503 OCFS2_I(dir)->ip_blkno) {
504 rc = -EINVAL;
505 ocfs2_error(dir->i_sb,
506 "Directory block #%llu on dinode "
507 "#%llu has an invalid parent_dinode "
508 "of %llu",
509 (unsigned long long)bh->b_blocknr,
510 (unsigned long long)OCFS2_I(dir)->ip_blkno,
511 (unsigned long long)le64_to_cpu(trailer->db_blkno));
512 goto out;
513 }
514out:
515 return rc;
516}
517
518/*
315 * This function forces all errors to -EIO for consistency with its 519 * This function forces all errors to -EIO for consistency with its
316 * predecessor, ocfs2_bread(). We haven't audited what returning the 520 * predecessor, ocfs2_bread(). We haven't audited what returning the
317 * real error codes would do to callers. We log the real codes with 521 * real error codes would do to callers. We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
322{ 526{
323 int rc = 0; 527 int rc = 0;
324 struct buffer_head *tmp = *bh; 528 struct buffer_head *tmp = *bh;
325 struct ocfs2_dir_block_trailer *trailer;
326 529
327 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 530 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
328 ocfs2_validate_dir_block); 531 ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
331 goto out; 534 goto out;
332 } 535 }
333 536
334 /*
335 * We check the trailer here rather than in
336 * ocfs2_validate_dir_block() because that function doesn't have
337 * the inode to test.
338 */
339 if (!(flags & OCFS2_BH_READAHEAD) && 537 if (!(flags & OCFS2_BH_READAHEAD) &&
340 ocfs2_dir_has_trailer(inode)) { 538 ocfs2_supports_dir_trailer(inode)) {
341 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); 539 rc = ocfs2_check_dir_trailer(inode, tmp);
342 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 540 if (rc) {
343 rc = -EINVAL; 541 if (!*bh)
344 ocfs2_error(inode->i_sb, 542 brelse(tmp);
345 "Invalid dirblock #%llu: " 543 mlog_errno(rc);
346 "signature = %.*s\n",
347 (unsigned long long)tmp->b_blocknr, 7,
348 trailer->db_signature);
349 goto out;
350 }
351 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
352 rc = -EINVAL;
353 ocfs2_error(inode->i_sb,
354 "Directory block #%llu has an invalid "
355 "db_blkno of %llu",
356 (unsigned long long)tmp->b_blocknr,
357 (unsigned long long)le64_to_cpu(trailer->db_blkno));
358 goto out;
359 }
360 if (le64_to_cpu(trailer->db_parent_dinode) !=
361 OCFS2_I(inode)->ip_blkno) {
362 rc = -EINVAL;
363 ocfs2_error(inode->i_sb,
364 "Directory block #%llu on dinode "
365 "#%llu has an invalid parent_dinode "
366 "of %llu",
367 (unsigned long long)tmp->b_blocknr,
368 (unsigned long long)OCFS2_I(inode)->ip_blkno,
369 (unsigned long long)le64_to_cpu(trailer->db_blkno));
370 goto out; 544 goto out;
371 } 545 }
372 } 546 }
@@ -379,6 +553,141 @@ out:
379 return rc ? -EIO : 0; 553 return rc ? -EIO : 0;
380} 554}
381 555
556/*
557 * Read the block at 'phys' which belongs to this directory
558 * inode. This function does no virtual->physical block translation -
559 * what's passed in is assumed to be a valid directory block.
560 */
561static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
562 struct buffer_head **bh)
563{
564 int ret;
565 struct buffer_head *tmp = *bh;
566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
568 if (ret) {
569 mlog_errno(ret);
570 goto out;
571 }
572
573 if (ocfs2_supports_dir_trailer(dir)) {
574 ret = ocfs2_check_dir_trailer(dir, tmp);
575 if (ret) {
576 if (!*bh)
577 brelse(tmp);
578 mlog_errno(ret);
579 goto out;
580 }
581 }
582
583 if (!ret && !*bh)
584 *bh = tmp;
585out:
586 return ret;
587}
588
589static int ocfs2_validate_dx_root(struct super_block *sb,
590 struct buffer_head *bh)
591{
592 int ret;
593 struct ocfs2_dx_root_block *dx_root;
594
595 BUG_ON(!buffer_uptodate(bh));
596
597 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
598
599 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
600 if (ret) {
601 mlog(ML_ERROR,
602 "Checksum failed for dir index root block %llu\n",
603 (unsigned long long)bh->b_blocknr);
604 return ret;
605 }
606
607 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
608 ocfs2_error(sb,
609 "Dir Index Root # %llu has bad signature %.*s",
610 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
611 7, dx_root->dr_signature);
612 return -EINVAL;
613 }
614
615 return 0;
616}
617
618static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
619 struct buffer_head **dx_root_bh)
620{
621 int ret;
622 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh;
624
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
626
627 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh)
629 *dx_root_bh = tmp;
630
631 return ret;
632}
633
634static int ocfs2_validate_dx_leaf(struct super_block *sb,
635 struct buffer_head *bh)
636{
637 int ret;
638 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
639
640 BUG_ON(!buffer_uptodate(bh));
641
642 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
643 if (ret) {
644 mlog(ML_ERROR,
645 "Checksum failed for dir index leaf block %llu\n",
646 (unsigned long long)bh->b_blocknr);
647 return ret;
648 }
649
650 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
651 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
652 7, dx_leaf->dl_signature);
653 return -EROFS;
654 }
655
656 return 0;
657}
658
659static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
660 struct buffer_head **dx_leaf_bh)
661{
662 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh;
664
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
666
667 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh)
669 *dx_leaf_bh = tmp;
670
671 return ret;
672}
673
674/*
675 * Read a series of dx_leaf blocks. This expects all buffer_head
676 * pointers to be NULL on function entry.
677 */
678static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
679 struct buffer_head **dx_leaf_bhs)
680{
681 int ret;
682
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf);
685 if (ret)
686 mlog_errno(ret);
687
688 return ret;
689}
690
382static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 691static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
383 struct inode *dir, 692 struct inode *dir,
384 struct ocfs2_dir_entry **res_dir) 693 struct ocfs2_dir_entry **res_dir)
@@ -480,39 +789,340 @@ cleanup_and_exit:
480 return ret; 789 return ret;
481} 790}
482 791
792static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
793 struct ocfs2_extent_list *el,
794 u32 major_hash,
795 u32 *ret_cpos,
796 u64 *ret_phys_blkno,
797 unsigned int *ret_clen)
798{
799 int ret = 0, i, found;
800 struct buffer_head *eb_bh = NULL;
801 struct ocfs2_extent_block *eb;
802 struct ocfs2_extent_rec *rec = NULL;
803
804 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
806 if (ret) {
807 mlog_errno(ret);
808 goto out;
809 }
810
811 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
812 el = &eb->h_list;
813
814 if (el->l_tree_depth) {
815 ocfs2_error(inode->i_sb,
816 "Inode %lu has non zero tree depth in "
817 "btree tree block %llu\n", inode->i_ino,
818 (unsigned long long)eb_bh->b_blocknr);
819 ret = -EROFS;
820 goto out;
821 }
822 }
823
824 found = 0;
825 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
826 rec = &el->l_recs[i];
827
828 if (le32_to_cpu(rec->e_cpos) <= major_hash) {
829 found = 1;
830 break;
831 }
832 }
833
834 if (!found) {
835 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
836 "record (%u, %u, 0) in btree", inode->i_ino,
837 le32_to_cpu(rec->e_cpos),
838 ocfs2_rec_clusters(el, rec));
839 ret = -EROFS;
840 goto out;
841 }
842
843 if (ret_phys_blkno)
844 *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
845 if (ret_cpos)
846 *ret_cpos = le32_to_cpu(rec->e_cpos);
847 if (ret_clen)
848 *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
849
850out:
851 brelse(eb_bh);
852 return ret;
853}
854
855/*
856 * Returns the block index, from the start of the cluster which this
857 * hash belongs too.
858 */
859static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
860 u32 minor_hash)
861{
862 return minor_hash & osb->osb_dx_mask;
863}
864
865static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
866 struct ocfs2_dx_hinfo *hinfo)
867{
868 return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
869}
870
871static int ocfs2_dx_dir_lookup(struct inode *inode,
872 struct ocfs2_extent_list *el,
873 struct ocfs2_dx_hinfo *hinfo,
874 u32 *ret_cpos,
875 u64 *ret_phys_blkno)
876{
877 int ret = 0;
878 unsigned int cend, uninitialized_var(clen);
879 u32 uninitialized_var(cpos);
880 u64 uninitialized_var(blkno);
881 u32 name_hash = hinfo->major_hash;
882
883 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
884 &clen);
885 if (ret) {
886 mlog_errno(ret);
887 goto out;
888 }
889
890 cend = cpos + clen;
891 if (name_hash >= cend) {
892 /* We want the last cluster */
893 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
894 cpos += clen - 1;
895 } else {
896 blkno += ocfs2_clusters_to_blocks(inode->i_sb,
897 name_hash - cpos);
898 cpos = name_hash;
899 }
900
901 /*
902 * We now have the cluster which should hold our entry. To
903 * find the exact block from the start of the cluster to
904 * search, we take the lower bits of the hash.
905 */
906 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
907
908 if (ret_phys_blkno)
909 *ret_phys_blkno = blkno;
910 if (ret_cpos)
911 *ret_cpos = cpos;
912
913out:
914
915 return ret;
916}
917
918static int ocfs2_dx_dir_search(const char *name, int namelen,
919 struct inode *dir,
920 struct ocfs2_dx_root_block *dx_root,
921 struct ocfs2_dir_lookup_result *res)
922{
923 int ret, i, found;
924 u64 uninitialized_var(phys);
925 struct buffer_head *dx_leaf_bh = NULL;
926 struct ocfs2_dx_leaf *dx_leaf;
927 struct ocfs2_dx_entry *dx_entry = NULL;
928 struct buffer_head *dir_ent_bh = NULL;
929 struct ocfs2_dir_entry *dir_ent = NULL;
930 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
931 struct ocfs2_extent_list *dr_el;
932 struct ocfs2_dx_entry_list *entry_list;
933
934 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
935
936 if (ocfs2_dx_root_inline(dx_root)) {
937 entry_list = &dx_root->dr_entries;
938 goto search;
939 }
940
941 dr_el = &dx_root->dr_list;
942
943 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
944 if (ret) {
945 mlog_errno(ret);
946 goto out;
947 }
948
949 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
950 "returns: %llu\n",
951 (unsigned long long)OCFS2_I(dir)->ip_blkno,
952 namelen, name, hinfo->major_hash, hinfo->minor_hash,
953 (unsigned long long)phys);
954
955 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
956 if (ret) {
957 mlog_errno(ret);
958 goto out;
959 }
960
961 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
962
963 mlog(0, "leaf info: num_used: %d, count: %d\n",
964 le16_to_cpu(dx_leaf->dl_list.de_num_used),
965 le16_to_cpu(dx_leaf->dl_list.de_count));
966
967 entry_list = &dx_leaf->dl_list;
968
969search:
970 /*
971 * Empty leaf is legal, so no need to check for that.
972 */
973 found = 0;
974 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
975 dx_entry = &entry_list->de_entries[i];
976
977 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
978 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
979 continue;
980
981 /*
982 * Search unindexed leaf block now. We're not
983 * guaranteed to find anything.
984 */
985 ret = ocfs2_read_dir_block_direct(dir,
986 le64_to_cpu(dx_entry->dx_dirent_blk),
987 &dir_ent_bh);
988 if (ret) {
989 mlog_errno(ret);
990 goto out;
991 }
992
993 /*
994 * XXX: We should check the unindexed block here,
995 * before using it.
996 */
997
998 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
999 0, dir_ent_bh->b_data,
1000 dir->i_sb->s_blocksize, &dir_ent);
1001 if (found == 1)
1002 break;
1003
1004 if (found == -1) {
1005 /* This means we found a bad directory entry. */
1006 ret = -EIO;
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 brelse(dir_ent_bh);
1012 dir_ent_bh = NULL;
1013 }
1014
1015 if (found <= 0) {
1016 ret = -ENOENT;
1017 goto out;
1018 }
1019
1020 res->dl_leaf_bh = dir_ent_bh;
1021 res->dl_entry = dir_ent;
1022 res->dl_dx_leaf_bh = dx_leaf_bh;
1023 res->dl_dx_entry = dx_entry;
1024
1025 ret = 0;
1026out:
1027 if (ret) {
1028 brelse(dx_leaf_bh);
1029 brelse(dir_ent_bh);
1030 }
1031 return ret;
1032}
1033
1034static int ocfs2_find_entry_dx(const char *name, int namelen,
1035 struct inode *dir,
1036 struct ocfs2_dir_lookup_result *lookup)
1037{
1038 int ret;
1039 struct buffer_head *di_bh = NULL;
1040 struct ocfs2_dinode *di;
1041 struct buffer_head *dx_root_bh = NULL;
1042 struct ocfs2_dx_root_block *dx_root;
1043
1044 ret = ocfs2_read_inode_block(dir, &di_bh);
1045 if (ret) {
1046 mlog_errno(ret);
1047 goto out;
1048 }
1049
1050 di = (struct ocfs2_dinode *)di_bh->b_data;
1051
1052 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1053 if (ret) {
1054 mlog_errno(ret);
1055 goto out;
1056 }
1057 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1058
1059 ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1060 if (ret) {
1061 if (ret != -ENOENT)
1062 mlog_errno(ret);
1063 goto out;
1064 }
1065
1066 lookup->dl_dx_root_bh = dx_root_bh;
1067 dx_root_bh = NULL;
1068out:
1069 brelse(di_bh);
1070 brelse(dx_root_bh);
1071 return ret;
1072}
1073
483/* 1074/*
484 * Try to find an entry of the provided name within 'dir'. 1075 * Try to find an entry of the provided name within 'dir'.
485 * 1076 *
486 * If nothing was found, NULL is returned. Otherwise, a buffer_head 1077 * If nothing was found, -ENOENT is returned. Otherwise, zero is
487 * and pointer to the dir entry are passed back. 1078 * returned and the struct 'res' will contain information useful to
1079 * other directory manipulation functions.
488 * 1080 *
489 * Caller can NOT assume anything about the contents of the 1081 * Caller can NOT assume anything about the contents of the
490 * buffer_head - it is passed back only so that it can be passed into 1082 * buffer_heads - they are passed back only so that it can be passed
491 * any one of the manipulation functions (add entry, delete entry, 1083 * into any one of the manipulation functions (add entry, delete
492 * etc). As an example, bh in the extent directory case is a data 1084 * entry, etc). As an example, bh in the extent directory case is a
493 * block, in the inline-data case it actually points to an inode. 1085 * data block, in the inline-data case it actually points to an inode,
1086 * in the indexed directory case, multiple buffers are involved.
494 */ 1087 */
495struct buffer_head *ocfs2_find_entry(const char *name, int namelen, 1088int ocfs2_find_entry(const char *name, int namelen,
496 struct inode *dir, 1089 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
497 struct ocfs2_dir_entry **res_dir)
498{ 1090{
499 *res_dir = NULL; 1091 struct buffer_head *bh;
1092 struct ocfs2_dir_entry *res_dir = NULL;
500 1093
1094 if (ocfs2_dir_indexed(dir))
1095 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1096
1097 /*
1098 * The unindexed dir code only uses part of the lookup
1099 * structure, so there's no reason to push it down further
1100 * than this.
1101 */
501 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1102 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
502 return ocfs2_find_entry_id(name, namelen, dir, res_dir); 1103 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1104 else
1105 bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1106
1107 if (bh == NULL)
1108 return -ENOENT;
503 1109
504 return ocfs2_find_entry_el(name, namelen, dir, res_dir); 1110 lookup->dl_leaf_bh = bh;
1111 lookup->dl_entry = res_dir;
1112 return 0;
505} 1113}
506 1114
507/* 1115/*
508 * Update inode number and type of a previously found directory entry. 1116 * Update inode number and type of a previously found directory entry.
509 */ 1117 */
510int ocfs2_update_entry(struct inode *dir, handle_t *handle, 1118int ocfs2_update_entry(struct inode *dir, handle_t *handle,
511 struct buffer_head *de_bh, struct ocfs2_dir_entry *de, 1119 struct ocfs2_dir_lookup_result *res,
512 struct inode *new_entry_inode) 1120 struct inode *new_entry_inode)
513{ 1121{
514 int ret; 1122 int ret;
515 ocfs2_journal_access_func access = ocfs2_journal_access_db; 1123 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1124 struct ocfs2_dir_entry *de = res->dl_entry;
1125 struct buffer_head *de_bh = res->dl_leaf_bh;
516 1126
517 /* 1127 /*
518 * The same code works fine for both inline-data and extent 1128 * The same code works fine for both inline-data and extent
@@ -538,6 +1148,10 @@ out:
538 return ret; 1148 return ret;
539} 1149}
540 1150
1151/*
1152 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1153 * previous entry
1154 */
541static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, 1155static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
542 struct ocfs2_dir_entry *de_del, 1156 struct ocfs2_dir_entry *de_del,
543 struct buffer_head *bh, char *first_de, 1157 struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
587 return status; 1201 return status;
588} 1202}
589 1203
1204static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1205{
1206 unsigned int hole;
1207
1208 if (le64_to_cpu(de->inode) == 0)
1209 hole = le16_to_cpu(de->rec_len);
1210 else
1211 hole = le16_to_cpu(de->rec_len) -
1212 OCFS2_DIR_REC_LEN(de->name_len);
1213
1214 return hole;
1215}
1216
1217static int ocfs2_find_max_rec_len(struct super_block *sb,
1218 struct buffer_head *dirblock_bh)
1219{
1220 int size, this_hole, largest_hole = 0;
1221 char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1222 struct ocfs2_dir_entry *de;
1223
1224 trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1225 size = ocfs2_dir_trailer_blk_off(sb);
1226 limit = start + size;
1227 de_buf = start;
1228 de = (struct ocfs2_dir_entry *)de_buf;
1229 do {
1230 if (de_buf != trailer) {
1231 this_hole = ocfs2_figure_dirent_hole(de);
1232 if (this_hole > largest_hole)
1233 largest_hole = this_hole;
1234 }
1235
1236 de_buf += le16_to_cpu(de->rec_len);
1237 de = (struct ocfs2_dir_entry *)de_buf;
1238 } while (de_buf < limit);
1239
1240 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1241 return largest_hole;
1242 return 0;
1243}
1244
1245static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1246 int index)
1247{
1248 int num_used = le16_to_cpu(entry_list->de_num_used);
1249
1250 if (num_used == 1 || index == (num_used - 1))
1251 goto clear;
1252
1253 memmove(&entry_list->de_entries[index],
1254 &entry_list->de_entries[index + 1],
1255 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1256clear:
1257 num_used--;
1258 memset(&entry_list->de_entries[num_used], 0,
1259 sizeof(struct ocfs2_dx_entry));
1260 entry_list->de_num_used = cpu_to_le16(num_used);
1261}
1262
1263static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1264 struct ocfs2_dir_lookup_result *lookup)
1265{
1266 int ret, index, max_rec_len, add_to_free_list = 0;
1267 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1268 struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
1269 struct ocfs2_dx_leaf *dx_leaf;
1270 struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
1271 struct ocfs2_dir_block_trailer *trailer;
1272 struct ocfs2_dx_root_block *dx_root;
1273 struct ocfs2_dx_entry_list *entry_list;
1274
1275 /*
1276 * This function gets a bit messy because we might have to
1277 * modify the root block, regardless of whether the indexed
1278 * entries are stored inline.
1279 */
1280
1281 /*
1282 * *Only* set 'entry_list' here, based on where we're looking
1283 * for the indexed entries. Later, we might still want to
1284 * journal both blocks, based on free list state.
1285 */
1286 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
1287 if (ocfs2_dx_root_inline(dx_root)) {
1288 entry_list = &dx_root->dr_entries;
1289 } else {
1290 dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
1291 entry_list = &dx_leaf->dl_list;
1292 }
1293
1294 /* Neither of these are a disk corruption - that should have
1295 * been caught by lookup, before we got here. */
1296 BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
1297 BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
1298
1299 index = (char *)dx_entry - (char *)entry_list->de_entries;
1300 index /= sizeof(*dx_entry);
1301
1302 if (index >= le16_to_cpu(entry_list->de_num_used)) {
1303 mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
1304 (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
1305 entry_list, dx_entry);
1306 return -EIO;
1307 }
1308
1309 /*
1310 * We know that removal of this dirent will leave enough room
1311 * for a new one, so add this block to the free list if it
1312 * isn't already there.
1313 */
1314 trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
1315 if (trailer->db_free_rec_len == 0)
1316 add_to_free_list = 1;
1317
1318 /*
1319 * Add the block holding our index into the journal before
1320 * removing the unindexed entry. If we get an error return
1321 * from __ocfs2_delete_entry(), then it hasn't removed the
1322 * entry yet. Likewise, successful return means we *must*
1323 * remove the indexed entry.
1324 *
1325 * We're also careful to journal the root tree block here as
1326 * the entry count needs to be updated. Also, we might be
1327 * adding to the start of the free list.
1328 */
1329 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (ret) {
1332 mlog_errno(ret);
1333 goto out;
1334 }
1335
1336 if (!ocfs2_dx_root_inline(dx_root)) {
1337 ret = ocfs2_journal_access_dl(handle, dir,
1338 lookup->dl_dx_leaf_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) {
1341 mlog_errno(ret);
1342 goto out;
1343 }
1344 }
1345
1346 mlog(0, "Dir %llu: delete entry at index: %d\n",
1347 (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
1348
1349 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1350 leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
1351 if (ret) {
1352 mlog_errno(ret);
1353 goto out;
1354 }
1355
1356 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
1357 trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1358 if (add_to_free_list) {
1359 trailer->db_free_next = dx_root->dr_free_blk;
1360 dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
1361 ocfs2_journal_dirty(handle, dx_root_bh);
1362 }
1363
1364 /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
1365 ocfs2_journal_dirty(handle, leaf_bh);
1366
1367 le32_add_cpu(&dx_root->dr_num_entries, -1);
1368 ocfs2_journal_dirty(handle, dx_root_bh);
1369
1370 ocfs2_dx_list_remove_entry(entry_list, index);
1371
1372 if (!ocfs2_dx_root_inline(dx_root))
1373 ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
1374
1375out:
1376 return ret;
1377}
1378
590static inline int ocfs2_delete_entry_id(handle_t *handle, 1379static inline int ocfs2_delete_entry_id(handle_t *handle,
591 struct inode *dir, 1380 struct inode *dir,
592 struct ocfs2_dir_entry *de_del, 1381 struct ocfs2_dir_entry *de_del,
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
624} 1413}
625 1414
626/* 1415/*
627 * ocfs2_delete_entry deletes a directory entry by merging it with the 1416 * Delete a directory entry. Hide the details of directory
628 * previous entry 1417 * implementation from the caller.
629 */ 1418 */
630int ocfs2_delete_entry(handle_t *handle, 1419int ocfs2_delete_entry(handle_t *handle,
631 struct inode *dir, 1420 struct inode *dir,
632 struct ocfs2_dir_entry *de_del, 1421 struct ocfs2_dir_lookup_result *res)
633 struct buffer_head *bh)
634{ 1422{
1423 if (ocfs2_dir_indexed(dir))
1424 return ocfs2_delete_entry_dx(handle, dir, res);
1425
635 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1426 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
636 return ocfs2_delete_entry_id(handle, dir, de_del, bh); 1427 return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
1428 res->dl_leaf_bh);
637 1429
638 return ocfs2_delete_entry_el(handle, dir, de_del, bh); 1430 return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
1431 res->dl_leaf_bh);
639} 1432}
640 1433
641/* 1434/*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
663 return 0; 1456 return 0;
664} 1457}
665 1458
1459static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1460 struct ocfs2_dx_entry *dx_new_entry)
1461{
1462 int i;
1463
1464 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1465 dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1466
1467 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1468}
1469
1470static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
1471 struct ocfs2_dx_hinfo *hinfo,
1472 u64 dirent_blk)
1473{
1474 int i;
1475 struct ocfs2_dx_entry *dx_entry;
1476
1477 i = le16_to_cpu(entry_list->de_num_used);
1478 dx_entry = &entry_list->de_entries[i];
1479
1480 memset(dx_entry, 0, sizeof(*dx_entry));
1481 dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
1482 dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
1483 dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
1484
1485 le16_add_cpu(&entry_list->de_num_used, 1);
1486}
1487
1488static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1489 struct ocfs2_dx_hinfo *hinfo,
1490 u64 dirent_blk,
1491 struct buffer_head *dx_leaf_bh)
1492{
1493 int ret;
1494 struct ocfs2_dx_leaf *dx_leaf;
1495
1496 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
1497 OCFS2_JOURNAL_ACCESS_WRITE);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out;
1501 }
1502
1503 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
1504 ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
1505 ocfs2_journal_dirty(handle, dx_leaf_bh);
1506
1507out:
1508 return ret;
1509}
1510
1511static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
1512 struct ocfs2_dx_hinfo *hinfo,
1513 u64 dirent_blk,
1514 struct ocfs2_dx_root_block *dx_root)
1515{
1516 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
1517}
1518
1519static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1520 struct ocfs2_dir_lookup_result *lookup)
1521{
1522 int ret = 0;
1523 struct ocfs2_dx_root_block *dx_root;
1524 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1525
1526 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
1527 OCFS2_JOURNAL_ACCESS_WRITE);
1528 if (ret) {
1529 mlog_errno(ret);
1530 goto out;
1531 }
1532
1533 dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
1534 if (ocfs2_dx_root_inline(dx_root)) {
1535 ocfs2_dx_inline_root_insert(dir, handle,
1536 &lookup->dl_hinfo,
1537 lookup->dl_leaf_bh->b_blocknr,
1538 dx_root);
1539 } else {
1540 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
1541 lookup->dl_leaf_bh->b_blocknr,
1542 lookup->dl_dx_leaf_bh);
1543 if (ret)
1544 goto out;
1545 }
1546
1547 le32_add_cpu(&dx_root->dr_num_entries, 1);
1548 ocfs2_journal_dirty(handle, dx_root_bh);
1549
1550out:
1551 return ret;
1552}
1553
1554static void ocfs2_remove_block_from_free_list(struct inode *dir,
1555 handle_t *handle,
1556 struct ocfs2_dir_lookup_result *lookup)
1557{
1558 struct ocfs2_dir_block_trailer *trailer, *prev;
1559 struct ocfs2_dx_root_block *dx_root;
1560 struct buffer_head *bh;
1561
1562 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1563
1564 if (ocfs2_free_list_at_root(lookup)) {
1565 bh = lookup->dl_dx_root_bh;
1566 dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
1567 dx_root->dr_free_blk = trailer->db_free_next;
1568 } else {
1569 bh = lookup->dl_prev_leaf_bh;
1570 prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
1571 prev->db_free_next = trailer->db_free_next;
1572 }
1573
1574 trailer->db_free_rec_len = cpu_to_le16(0);
1575 trailer->db_free_next = cpu_to_le64(0);
1576
1577 ocfs2_journal_dirty(handle, bh);
1578 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1579}
1580
1581/*
1582 * This expects that a journal write has been reserved on
1583 * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
1584 */
1585static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
1586 struct ocfs2_dir_lookup_result *lookup)
1587{
1588 int max_rec_len;
1589 struct ocfs2_dir_block_trailer *trailer;
1590
1591 /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
1592 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
1593 if (max_rec_len) {
1594 /*
1595 * There's still room in this block, so no need to remove it
1596 * from the free list. In this case, we just want to update
1597 * the rec len accounting.
1598 */
1599 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1600 trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1601 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1602 } else {
1603 ocfs2_remove_block_from_free_list(dir, handle, lookup);
1604 }
1605}
1606
666/* we don't always have a dentry for what we want to add, so people 1607/* we don't always have a dentry for what we want to add, so people
667 * like orphan dir can call this instead. 1608 * like orphan dir can call this instead.
668 * 1609 *
669 * If you pass me insert_bh, I'll skip the search of the other dir 1610 * The lookup context must have been filled from
670 * blocks and put the record in there. 1611 * ocfs2_prepare_dir_for_insert.
671 */ 1612 */
672int __ocfs2_add_entry(handle_t *handle, 1613int __ocfs2_add_entry(handle_t *handle,
673 struct inode *dir, 1614 struct inode *dir,
674 const char *name, int namelen, 1615 const char *name, int namelen,
675 struct inode *inode, u64 blkno, 1616 struct inode *inode, u64 blkno,
676 struct buffer_head *parent_fe_bh, 1617 struct buffer_head *parent_fe_bh,
677 struct buffer_head *insert_bh) 1618 struct ocfs2_dir_lookup_result *lookup)
678{ 1619{
679 unsigned long offset; 1620 unsigned long offset;
680 unsigned short rec_len; 1621 unsigned short rec_len;
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
683 struct super_block *sb = dir->i_sb; 1624 struct super_block *sb = dir->i_sb;
684 int retval, status; 1625 int retval, status;
685 unsigned int size = sb->s_blocksize; 1626 unsigned int size = sb->s_blocksize;
1627 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
686 char *data_start = insert_bh->b_data; 1628 char *data_start = insert_bh->b_data;
687 1629
688 mlog_entry_void(); 1630 mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
690 if (!namelen) 1632 if (!namelen)
691 return -EINVAL; 1633 return -EINVAL;
692 1634
693 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1635 if (ocfs2_dir_indexed(dir)) {
1636 struct buffer_head *bh;
1637
1638 /*
1639 * An indexed dir may require that we update the free space
1640 * list. Reserve a write to the previous node in the list so
1641 * that we don't fail later.
1642 *
1643 * XXX: This can be either a dx_root_block, or an unindexed
1644 * directory tree leaf block.
1645 */
1646 if (ocfs2_free_list_at_root(lookup)) {
1647 bh = lookup->dl_dx_root_bh;
1648 retval = ocfs2_journal_access_dr(handle, dir, bh,
1649 OCFS2_JOURNAL_ACCESS_WRITE);
1650 } else {
1651 bh = lookup->dl_prev_leaf_bh;
1652 retval = ocfs2_journal_access_db(handle, dir, bh,
1653 OCFS2_JOURNAL_ACCESS_WRITE);
1654 }
1655 if (retval) {
1656 mlog_errno(retval);
1657 return retval;
1658 }
1659 } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
694 data_start = di->id2.i_data.id_data; 1660 data_start = di->id2.i_data.id_data;
695 size = i_size_read(dir); 1661 size = i_size_read(dir);
696 1662
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
737 status = ocfs2_journal_access_di(handle, dir, 1703 status = ocfs2_journal_access_di(handle, dir,
738 insert_bh, 1704 insert_bh,
739 OCFS2_JOURNAL_ACCESS_WRITE); 1705 OCFS2_JOURNAL_ACCESS_WRITE);
740 else 1706 else {
741 status = ocfs2_journal_access_db(handle, dir, 1707 status = ocfs2_journal_access_db(handle, dir,
742 insert_bh, 1708 insert_bh,
743 OCFS2_JOURNAL_ACCESS_WRITE); 1709 OCFS2_JOURNAL_ACCESS_WRITE);
1710
1711 if (ocfs2_dir_indexed(dir)) {
1712 status = ocfs2_dx_dir_insert(dir,
1713 handle,
1714 lookup);
1715 if (status) {
1716 mlog_errno(status);
1717 goto bail;
1718 }
1719 }
1720 }
1721
744 /* By now the buffer is marked for journaling */ 1722 /* By now the buffer is marked for journaling */
745 offset += le16_to_cpu(de->rec_len); 1723 offset += le16_to_cpu(de->rec_len);
746 if (le64_to_cpu(de->inode)) { 1724 if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
761 de->name_len = namelen; 1739 de->name_len = namelen;
762 memcpy(de->name, name, namelen); 1740 memcpy(de->name, name, namelen);
763 1741
1742 if (ocfs2_dir_indexed(dir))
1743 ocfs2_recalc_free_list(dir, handle, lookup);
1744
764 dir->i_version++; 1745 dir->i_version++;
765 status = ocfs2_journal_dirty(handle, insert_bh); 1746 status = ocfs2_journal_dirty(handle, insert_bh);
766 retval = 0; 1747 retval = 0;
@@ -870,6 +1851,10 @@ out:
870 return 0; 1851 return 0;
871} 1852}
872 1853
1854/*
1855 * NOTE: This function can be called against unindexed directories,
1856 * and indexed ones.
1857 */
873static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1858static int ocfs2_dir_foreach_blk_el(struct inode *inode,
874 u64 *f_version, 1859 u64 *f_version,
875 loff_t *f_pos, void *priv, 1860 loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
1071 int namelen, 2056 int namelen,
1072 u64 *blkno, 2057 u64 *blkno,
1073 struct inode *inode, 2058 struct inode *inode,
1074 struct buffer_head **dirent_bh, 2059 struct ocfs2_dir_lookup_result *lookup)
1075 struct ocfs2_dir_entry **dirent)
1076{ 2060{
1077 int status = -ENOENT; 2061 int status = -ENOENT;
1078 2062
1079 mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", 2063 mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
1080 namelen, name, blkno, inode, dirent_bh, dirent); 2064 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1081 2065
1082 *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); 2066 status = ocfs2_find_entry(name, namelen, inode, lookup);
1083 if (!*dirent_bh || !*dirent) { 2067 if (status)
1084 status = -ENOENT;
1085 goto leave; 2068 goto leave;
1086 }
1087 2069
1088 *blkno = le64_to_cpu((*dirent)->inode); 2070 *blkno = le64_to_cpu(lookup->dl_entry->inode);
1089 2071
1090 status = 0; 2072 status = 0;
1091leave: 2073leave:
1092 if (status < 0) {
1093 *dirent = NULL;
1094 brelse(*dirent_bh);
1095 *dirent_bh = NULL;
1096 }
1097 2074
1098 mlog_exit(status);
1099 return status; 2075 return status;
1100} 2076}
1101 2077
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
1107 int namelen, u64 *blkno) 2083 int namelen, u64 *blkno)
1108{ 2084{
1109 int ret; 2085 int ret;
1110 struct buffer_head *bh = NULL; 2086 struct ocfs2_dir_lookup_result lookup = { NULL, };
1111 struct ocfs2_dir_entry *dirent = NULL;
1112 2087
1113 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent); 2088 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
1114 brelse(bh); 2089 ocfs2_free_dir_lookup_result(&lookup);
1115 2090
1116 return ret; 2091 return ret;
1117} 2092}
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
1128 int namelen) 2103 int namelen)
1129{ 2104{
1130 int ret; 2105 int ret;
1131 struct buffer_head *dirent_bh = NULL; 2106 struct ocfs2_dir_lookup_result lookup = { NULL, };
1132 struct ocfs2_dir_entry *dirent = NULL;
1133 2107
1134 mlog_entry("dir %llu, name '%.*s'\n", 2108 mlog_entry("dir %llu, name '%.*s'\n",
1135 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2109 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
1136 2110
1137 ret = -EEXIST; 2111 ret = -EEXIST;
1138 dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); 2112 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
1139 if (dirent_bh)
1140 goto bail; 2113 goto bail;
1141 2114
1142 ret = 0; 2115 ret = 0;
1143bail: 2116bail:
1144 brelse(dirent_bh); 2117 ocfs2_free_dir_lookup_result(&lookup);
1145 2118
1146 mlog_exit(ret); 2119 mlog_exit(ret);
1147 return ret; 2120 return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
1151 unsigned seen_dot; 2124 unsigned seen_dot;
1152 unsigned seen_dot_dot; 2125 unsigned seen_dot_dot;
1153 unsigned seen_other; 2126 unsigned seen_other;
2127 unsigned dx_dir;
1154}; 2128};
1155static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, 2129static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1156 loff_t pos, u64 ino, unsigned type) 2130 loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1160 /* 2134 /*
1161 * Check the positions of "." and ".." records to be sure 2135 * Check the positions of "." and ".." records to be sure
1162 * they're in the correct place. 2136 * they're in the correct place.
2137 *
2138 * Indexed directories don't need to proceed past the first
2139 * two entries, so we end the scan after seeing '..'. Despite
2140 * that, we allow the scan to proceed in the event that we
2141 * have a corrupted indexed directory (no dot or dot dot
2142 * entries). This allows us to double check for existing
2143 * entries which might not have been found in the index.
1163 */ 2144 */
1164 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { 2145 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
1165 p->seen_dot = 1; 2146 p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1169 if (name_len == 2 && !strncmp("..", name, 2) && 2150 if (name_len == 2 && !strncmp("..", name, 2) &&
1170 pos == OCFS2_DIR_REC_LEN(1)) { 2151 pos == OCFS2_DIR_REC_LEN(1)) {
1171 p->seen_dot_dot = 1; 2152 p->seen_dot_dot = 1;
2153
2154 if (p->dx_dir && p->seen_dot)
2155 return 1;
2156
1172 return 0; 2157 return 0;
1173 } 2158 }
1174 2159
1175 p->seen_other = 1; 2160 p->seen_other = 1;
1176 return 1; 2161 return 1;
1177} 2162}
2163
2164static int ocfs2_empty_dir_dx(struct inode *inode,
2165 struct ocfs2_empty_dir_priv *priv)
2166{
2167 int ret;
2168 struct buffer_head *di_bh = NULL;
2169 struct buffer_head *dx_root_bh = NULL;
2170 struct ocfs2_dinode *di;
2171 struct ocfs2_dx_root_block *dx_root;
2172
2173 priv->dx_dir = 1;
2174
2175 ret = ocfs2_read_inode_block(inode, &di_bh);
2176 if (ret) {
2177 mlog_errno(ret);
2178 goto out;
2179 }
2180 di = (struct ocfs2_dinode *)di_bh->b_data;
2181
2182 ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
2183 if (ret) {
2184 mlog_errno(ret);
2185 goto out;
2186 }
2187 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2188
2189 if (le32_to_cpu(dx_root->dr_num_entries) != 2)
2190 priv->seen_other = 1;
2191
2192out:
2193 brelse(di_bh);
2194 brelse(dx_root_bh);
2195 return ret;
2196}
2197
1178/* 2198/*
1179 * routine to check that the specified directory is empty (for rmdir) 2199 * routine to check that the specified directory is empty (for rmdir)
1180 * 2200 *
1181 * Returns 1 if dir is empty, zero otherwise. 2201 * Returns 1 if dir is empty, zero otherwise.
2202 *
2203 * XXX: This is a performance problem for unindexed directories.
1182 */ 2204 */
1183int ocfs2_empty_dir(struct inode *inode) 2205int ocfs2_empty_dir(struct inode *inode)
1184{ 2206{
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
1188 2210
1189 memset(&priv, 0, sizeof(priv)); 2211 memset(&priv, 0, sizeof(priv));
1190 2212
2213 if (ocfs2_dir_indexed(inode)) {
2214 ret = ocfs2_empty_dir_dx(inode, &priv);
2215 if (ret)
2216 mlog_errno(ret);
2217 /*
2218 * We still run ocfs2_dir_foreach to get the checks
2219 * for "." and "..".
2220 */
2221 }
2222
1191 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2223 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
1192 if (ret) 2224 if (ret)
1193 mlog_errno(ret); 2225 mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1280 struct inode *parent, 2312 struct inode *parent,
1281 struct inode *inode, 2313 struct inode *inode,
1282 struct buffer_head *fe_bh, 2314 struct buffer_head *fe_bh,
1283 struct ocfs2_alloc_context *data_ac) 2315 struct ocfs2_alloc_context *data_ac,
2316 struct buffer_head **ret_new_bh)
1284{ 2317{
1285 int status; 2318 int status;
1286 unsigned int size = osb->sb->s_blocksize; 2319 unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1289 2322
1290 mlog_entry_void(); 2323 mlog_entry_void();
1291 2324
1292 if (ocfs2_supports_dir_trailer(osb)) 2325 if (ocfs2_new_dir_wants_trailer(inode))
1293 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2326 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
1294 2327
1295 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 2328 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1310 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 2343 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1311 2344
1312 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); 2345 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
1313 if (ocfs2_supports_dir_trailer(osb)) 2346 if (ocfs2_new_dir_wants_trailer(inode)) {
1314 ocfs2_init_dir_trailer(inode, new_bh); 2347 int size = le16_to_cpu(de->rec_len);
2348
2349 /*
2350 * Figure out the size of the hole left over after
2351 * insertion of '.' and '..'. The trailer wants this
2352 * information.
2353 */
2354 size -= OCFS2_DIR_REC_LEN(2);
2355 size -= sizeof(struct ocfs2_dir_block_trailer);
2356
2357 ocfs2_init_dir_trailer(inode, new_bh, size);
2358 }
1315 2359
1316 status = ocfs2_journal_dirty(handle, new_bh); 2360 status = ocfs2_journal_dirty(handle, new_bh);
1317 if (status < 0) { 2361 if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1329 } 2373 }
1330 2374
1331 status = 0; 2375 status = 0;
2376 if (ret_new_bh) {
2377 *ret_new_bh = new_bh;
2378 new_bh = NULL;
2379 }
1332bail: 2380bail:
1333 brelse(new_bh); 2381 brelse(new_bh);
1334 2382
@@ -1336,20 +2384,427 @@ bail:
1336 return status; 2384 return status;
1337} 2385}
1338 2386
2387static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2388 handle_t *handle, struct inode *dir,
2389 struct buffer_head *di_bh,
2390 struct buffer_head *dirdata_bh,
2391 struct ocfs2_alloc_context *meta_ac,
2392 int dx_inline, u32 num_entries,
2393 struct buffer_head **ret_dx_root_bh)
2394{
2395 int ret;
2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2397 u16 dr_suballoc_bit;
2398 u64 dr_blkno;
2399 unsigned int num_bits;
2400 struct buffer_head *dx_root_bh = NULL;
2401 struct ocfs2_dx_root_block *dx_root;
2402 struct ocfs2_dir_block_trailer *trailer =
2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2404
2405 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
2406 &num_bits, &dr_blkno);
2407 if (ret) {
2408 mlog_errno(ret);
2409 goto out;
2410 }
2411
2412 mlog(0, "Dir %llu, attach new index block: %llu\n",
2413 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2414 (unsigned long long)dr_blkno);
2415
2416 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2417 if (dx_root_bh == NULL) {
2418 ret = -EIO;
2419 goto out;
2420 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
2422
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429
2430 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2431 memset(dx_root, 0, osb->sb->s_blocksize);
2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2433 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
2434 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2435 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2436 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2437 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2438 dx_root->dr_num_entries = cpu_to_le32(num_entries);
2439 if (le16_to_cpu(trailer->db_free_rec_len))
2440 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
2441 else
2442 dx_root->dr_free_blk = cpu_to_le64(0);
2443
2444 if (dx_inline) {
2445 dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
2446 dx_root->dr_entries.de_count =
2447 cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
2448 } else {
2449 dx_root->dr_list.l_count =
2450 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2451 }
2452
2453 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2454 if (ret)
2455 mlog_errno(ret);
2456
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) {
2460 mlog_errno(ret);
2461 goto out;
2462 }
2463
2464 di->i_dx_root = cpu_to_le64(dr_blkno);
2465
2466 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2467 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2468
2469 ret = ocfs2_journal_dirty(handle, di_bh);
2470 if (ret)
2471 mlog_errno(ret);
2472
2473 *ret_dx_root_bh = dx_root_bh;
2474 dx_root_bh = NULL;
2475
2476out:
2477 brelse(dx_root_bh);
2478 return ret;
2479}
2480
2481static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2482 handle_t *handle, struct inode *dir,
2483 struct buffer_head **dx_leaves,
2484 int num_dx_leaves, u64 start_blk)
2485{
2486 int ret, i;
2487 struct ocfs2_dx_leaf *dx_leaf;
2488 struct buffer_head *bh;
2489
2490 for (i = 0; i < num_dx_leaves; i++) {
2491 bh = sb_getblk(osb->sb, start_blk + i);
2492 if (bh == NULL) {
2493 ret = -EIO;
2494 goto out;
2495 }
2496 dx_leaves[i] = bh;
2497
2498 ocfs2_set_new_buffer_uptodate(dir, bh);
2499
2500 ret = ocfs2_journal_access_dl(handle, dir, bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) {
2503 mlog_errno(ret);
2504 goto out;
2505 }
2506
2507 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2508
2509 memset(dx_leaf, 0, osb->sb->s_blocksize);
2510 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2511 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2512 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2513 dx_leaf->dl_list.de_count =
2514 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2515
2516 mlog(0,
2517 "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
2518 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2519 (unsigned long long)bh->b_blocknr,
2520 le16_to_cpu(dx_leaf->dl_list.de_count));
2521
2522 ocfs2_journal_dirty(handle, bh);
2523 }
2524
2525 ret = 0;
2526out:
2527 return ret;
2528}
2529
2530/*
2531 * Allocates and formats a new cluster for use in an indexed dir
2532 * leaf. This version will not do the extent insert, so that it can be
2533 * used by operations which need careful ordering.
2534 */
2535static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2536 u32 cpos, handle_t *handle,
2537 struct ocfs2_alloc_context *data_ac,
2538 struct buffer_head **dx_leaves,
2539 int num_dx_leaves, u64 *ret_phys_blkno)
2540{
2541 int ret;
2542 u32 phys, num;
2543 u64 phys_blkno;
2544 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2545
2546 /*
2547 * XXX: For create, this should claim cluster for the index
2548 * *before* the unindexed insert so that we have a better
2549 * chance of contiguousness as the directory grows in number
2550 * of entries.
2551 */
2552 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
2553 if (ret) {
2554 mlog_errno(ret);
2555 goto out;
2556 }
2557
2558 /*
2559 * Format the new cluster first. That way, we're inserting
2560 * valid data.
2561 */
2562 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2563 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2564 num_dx_leaves, phys_blkno);
2565 if (ret) {
2566 mlog_errno(ret);
2567 goto out;
2568 }
2569
2570 *ret_phys_blkno = phys_blkno;
2571out:
2572 return ret;
2573}
2574
2575static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2576 struct ocfs2_extent_tree *et,
2577 u32 cpos, handle_t *handle,
2578 struct ocfs2_alloc_context *data_ac,
2579 struct ocfs2_alloc_context *meta_ac,
2580 struct buffer_head **dx_leaves,
2581 int num_dx_leaves)
2582{
2583 int ret;
2584 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno);
2589 if (ret) {
2590 mlog_errno(ret);
2591 goto out;
2592 }
2593
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
2595 meta_ac);
2596 if (ret)
2597 mlog_errno(ret);
2598out:
2599 return ret;
2600}
2601
2602static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2603 int *ret_num_leaves)
2604{
2605 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2606 struct buffer_head **dx_leaves;
2607
2608 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2609 GFP_NOFS);
2610 if (dx_leaves && ret_num_leaves)
2611 *ret_num_leaves = num_dx_leaves;
2612
2613 return dx_leaves;
2614}
2615
2616static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2617 handle_t *handle,
2618 struct inode *parent,
2619 struct inode *inode,
2620 struct buffer_head *di_bh,
2621 struct ocfs2_alloc_context *data_ac,
2622 struct ocfs2_alloc_context *meta_ac)
2623{
2624 int ret;
2625 struct buffer_head *leaf_bh = NULL;
2626 struct buffer_head *dx_root_bh = NULL;
2627 struct ocfs2_dx_hinfo hinfo;
2628 struct ocfs2_dx_root_block *dx_root;
2629 struct ocfs2_dx_entry_list *entry_list;
2630
2631 /*
2632 * Our strategy is to create the directory as though it were
2633 * unindexed, then add the index block. This works with very
2634 * little complication since the state of a new directory is a
2635 * very well known quantity.
2636 *
2637 * Essentially, we have two dirents ("." and ".."), in the 1st
2638 * block which need indexing. These are easily inserted into
2639 * the index block.
2640 */
2641
2642 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2643 data_ac, &leaf_bh);
2644 if (ret) {
2645 mlog_errno(ret);
2646 goto out;
2647 }
2648
2649 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
2650 meta_ac, 1, 2, &dx_root_bh);
2651 if (ret) {
2652 mlog_errno(ret);
2653 goto out;
2654 }
2655 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2656 entry_list = &dx_root->dr_entries;
2657
2658 /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
2659 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2660 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2661
2662 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2663 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2664
2665out:
2666 brelse(dx_root_bh);
2667 brelse(leaf_bh);
2668 return ret;
2669}
2670
1339int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2671int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1340 handle_t *handle, 2672 handle_t *handle,
1341 struct inode *parent, 2673 struct inode *parent,
1342 struct inode *inode, 2674 struct inode *inode,
1343 struct buffer_head *fe_bh, 2675 struct buffer_head *fe_bh,
1344 struct ocfs2_alloc_context *data_ac) 2676 struct ocfs2_alloc_context *data_ac,
2677 struct ocfs2_alloc_context *meta_ac)
2678
1345{ 2679{
1346 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2680 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1347 2681
1348 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2682 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1349 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2683 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1350 2684
2685 if (ocfs2_supports_indexed_dirs(osb))
2686 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2687 data_ac, meta_ac);
2688
1351 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2689 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1352 data_ac); 2690 data_ac, NULL);
2691}
2692
2693static int ocfs2_dx_dir_index_block(struct inode *dir,
2694 handle_t *handle,
2695 struct buffer_head **dx_leaves,
2696 int num_dx_leaves,
2697 u32 *num_dx_entries,
2698 struct buffer_head *dirent_bh)
2699{
2700 int ret, namelen, i;
2701 char *de_buf, *limit;
2702 struct ocfs2_dir_entry *de;
2703 struct buffer_head *dx_leaf_bh;
2704 struct ocfs2_dx_hinfo hinfo;
2705 u64 dirent_blk = dirent_bh->b_blocknr;
2706
2707 de_buf = dirent_bh->b_data;
2708 limit = de_buf + dir->i_sb->s_blocksize;
2709
2710 while (de_buf < limit) {
2711 de = (struct ocfs2_dir_entry *)de_buf;
2712
2713 namelen = de->name_len;
2714 if (!namelen || !de->inode)
2715 goto inc;
2716
2717 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2718
2719 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2720 dx_leaf_bh = dx_leaves[i];
2721
2722 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2723 dirent_blk, dx_leaf_bh);
2724 if (ret) {
2725 mlog_errno(ret);
2726 goto out;
2727 }
2728
2729 *num_dx_entries = *num_dx_entries + 1;
2730
2731inc:
2732 de_buf += le16_to_cpu(de->rec_len);
2733 }
2734
2735out:
2736 return ret;
2737}
2738
2739/*
2740 * XXX: This expects dx_root_bh to already be part of the transaction.
2741 */
2742static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2743 struct buffer_head *dx_root_bh,
2744 struct buffer_head *dirent_bh)
2745{
2746 char *de_buf, *limit;
2747 struct ocfs2_dx_root_block *dx_root;
2748 struct ocfs2_dir_entry *de;
2749 struct ocfs2_dx_hinfo hinfo;
2750 u64 dirent_blk = dirent_bh->b_blocknr;
2751
2752 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2753
2754 de_buf = dirent_bh->b_data;
2755 limit = de_buf + dir->i_sb->s_blocksize;
2756
2757 while (de_buf < limit) {
2758 de = (struct ocfs2_dir_entry *)de_buf;
2759
2760 if (!de->name_len || !de->inode)
2761 goto inc;
2762
2763 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2764
2765 mlog(0,
2766 "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
2767 (unsigned long long)dir->i_ino, hinfo.major_hash,
2768 hinfo.minor_hash,
2769 le16_to_cpu(dx_root->dr_entries.de_num_used),
2770 de->name_len, de->name);
2771
2772 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2773 dirent_blk);
2774
2775 le32_add_cpu(&dx_root->dr_num_entries, 1);
2776inc:
2777 de_buf += le16_to_cpu(de->rec_len);
2778 }
2779}
2780
2781/*
2782 * Count the number of inline directory entries in di_bh and compare
2783 * them against the number of entries we can hold in an inline dx root
2784 * block.
2785 */
2786static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2787 struct buffer_head *di_bh)
2788{
2789 int dirent_count = 0;
2790 char *de_buf, *limit;
2791 struct ocfs2_dir_entry *de;
2792 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2793
2794 de_buf = di->id2.i_data.id_data;
2795 limit = de_buf + i_size_read(dir);
2796
2797 while (de_buf < limit) {
2798 de = (struct ocfs2_dir_entry *)de_buf;
2799
2800 if (de->name_len && de->inode)
2801 dirent_count++;
2802
2803 de_buf += le16_to_cpu(de->rec_len);
2804 }
2805
2806 /* We are careful to leave room for one extra record. */
2807 return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
1353} 2808}
1354 2809
1355/* 2810/*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1358 * expansion from an inline directory to one with extents. The first dir block 2813 * expansion from an inline directory to one with extents. The first dir block
1359 * in that case is taken from the inline data portion of the inode block. 2814 * in that case is taken from the inline data portion of the inode block.
1360 * 2815 *
2816 * This will also return the largest amount of contiguous space for a dirent
2817 * in the block. That value is *not* necessarily the last dirent, even after
2818 * expansion. The directory indexing code wants this value for free space
2819 * accounting. We do this here since we're already walking the entire dir
2820 * block.
2821 *
1361 * We add the dir trailer if this filesystem wants it. 2822 * We add the dir trailer if this filesystem wants it.
1362 */ 2823 */
1363static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2824static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1364 struct super_block *sb) 2825 struct inode *dir)
1365{ 2826{
2827 struct super_block *sb = dir->i_sb;
1366 struct ocfs2_dir_entry *de; 2828 struct ocfs2_dir_entry *de;
1367 struct ocfs2_dir_entry *prev_de; 2829 struct ocfs2_dir_entry *prev_de;
1368 char *de_buf, *limit; 2830 char *de_buf, *limit;
1369 unsigned int new_size = sb->s_blocksize; 2831 unsigned int new_size = sb->s_blocksize;
1370 unsigned int bytes; 2832 unsigned int bytes, this_hole;
2833 unsigned int largest_hole = 0;
1371 2834
1372 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 2835 if (ocfs2_new_dir_wants_trailer(dir))
1373 new_size = ocfs2_dir_trailer_blk_off(sb); 2836 new_size = ocfs2_dir_trailer_blk_off(sb);
1374 2837
1375 bytes = new_size - old_size; 2838 bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1378 de_buf = start; 2841 de_buf = start;
1379 de = (struct ocfs2_dir_entry *)de_buf; 2842 de = (struct ocfs2_dir_entry *)de_buf;
1380 do { 2843 do {
2844 this_hole = ocfs2_figure_dirent_hole(de);
2845 if (this_hole > largest_hole)
2846 largest_hole = this_hole;
2847
1381 prev_de = de; 2848 prev_de = de;
1382 de_buf += le16_to_cpu(de->rec_len); 2849 de_buf += le16_to_cpu(de->rec_len);
1383 de = (struct ocfs2_dir_entry *)de_buf; 2850 de = (struct ocfs2_dir_entry *)de_buf;
1384 } while (de_buf < limit); 2851 } while (de_buf < limit);
1385 2852
1386 le16_add_cpu(&prev_de->rec_len, bytes); 2853 le16_add_cpu(&prev_de->rec_len, bytes);
2854
2855 /* We need to double check this after modification of the final
2856 * dirent. */
2857 this_hole = ocfs2_figure_dirent_hole(prev_de);
2858 if (this_hole > largest_hole)
2859 largest_hole = this_hole;
2860
2861 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2862 return largest_hole;
2863 return 0;
1387} 2864}
1388 2865
1389/* 2866/*
@@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1396 */ 2873 */
1397static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2874static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1398 unsigned int blocks_wanted, 2875 unsigned int blocks_wanted,
2876 struct ocfs2_dir_lookup_result *lookup,
1399 struct buffer_head **first_block_bh) 2877 struct buffer_head **first_block_bh)
1400{ 2878{
1401 u32 alloc, bit_off, len; 2879 u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
1402 struct super_block *sb = dir->i_sb; 2880 struct super_block *sb = dir->i_sb;
1403 int ret, credits = ocfs2_inline_to_extents_credits(sb); 2881 int ret, i, num_dx_leaves = 0, dx_inline = 0,
1404 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 2882 credits = ocfs2_inline_to_extents_credits(sb);
2883 u64 dx_insert_blkno, blkno,
2884 bytes = blocks_wanted << sb->s_blocksize_bits;
1405 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2885 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1406 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2886 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1407 struct ocfs2_alloc_context *data_ac; 2887 struct ocfs2_alloc_context *data_ac;
2888 struct ocfs2_alloc_context *meta_ac = NULL;
1408 struct buffer_head *dirdata_bh = NULL; 2889 struct buffer_head *dirdata_bh = NULL;
2890 struct buffer_head *dx_root_bh = NULL;
2891 struct buffer_head **dx_leaves = NULL;
1409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2892 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1410 handle_t *handle; 2893 handle_t *handle;
1411 struct ocfs2_extent_tree et; 2894 struct ocfs2_extent_tree et;
1412 int did_quota = 0; 2895 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0;
1413 2897
1414 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1415 2899
1416 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2900 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0;
2902
2903 if (ocfs2_supports_indexed_dirs(osb)) {
2904 credits += ocfs2_add_dir_index_credits(sb);
2905
2906 dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2907 if (!dx_inline) {
2908 /* Add one more cluster for an index leaf */
2909 dx_alloc++;
2910 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2911 &num_dx_leaves);
2912 if (!dx_leaves) {
2913 ret = -ENOMEM;
2914 mlog_errno(ret);
2915 goto out;
2916 }
2917 }
2918
2919 /* This gets us the dx_root */
2920 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2921 if (ret) {
2922 mlog_errno(ret);
2923 goto out;
2924 }
2925 }
1417 2926
1418 /* 2927 /*
1419 * We should never need more than 2 clusters for this - 2928 * We should never need more than 2 clusters for the unindexed
1420 * maximum dirent size is far less than one block. In fact, 2929 * tree - maximum dirent size is far less than one block. In
1421 * the only time we'd need more than one cluster is if 2930 * fact, the only time we'd need more than one cluster is if
1422 * blocksize == clustersize and the dirent won't fit in the 2931 * blocksize == clustersize and the dirent won't fit in the
1423 * extra space that the expansion to a single block gives. As 2932 * extra space that the expansion to a single block gives. As
1424 * of today, that only happens on 4k/4k file systems. 2933 * of today, that only happens on 4k/4k file systems.
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1435 2944
1436 /* 2945 /*
1437 * Prepare for worst case allocation scenario of two separate 2946 * Prepare for worst case allocation scenario of two separate
1438 * extents. 2947 * extents in the unindexed tree.
1439 */ 2948 */
1440 if (alloc == 2) 2949 if (alloc == 2)
1441 credits += OCFS2_SUBALLOC_ALLOC; 2950 credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1448 } 2957 }
1449 2958
1450 if (vfs_dq_alloc_space_nodirty(dir, 2959 if (vfs_dq_alloc_space_nodirty(dir,
1451 ocfs2_clusters_to_bytes(osb->sb, alloc))) { 2960 ocfs2_clusters_to_bytes(osb->sb,
2961 alloc + dx_alloc))) {
1452 ret = -EDQUOT; 2962 ret = -EDQUOT;
1453 goto out_commit; 2963 goto out_commit;
1454 } 2964 }
1455 did_quota = 1; 2965 did_quota = 1;
2966
2967 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2968 /*
2969 * Allocate our index cluster first, to maximize the
2970 * possibility that unindexed leaves grow
2971 * contiguously.
2972 */
2973 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2974 dx_leaves, num_dx_leaves,
2975 &dx_insert_blkno);
2976 if (ret) {
2977 mlog_errno(ret);
2978 goto out_commit;
2979 }
2980 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2981 }
2982
1456 /* 2983 /*
1457 * Try to claim as many clusters as the bitmap can give though 2984 * Try to claim as many clusters as the bitmap can give though
1458 * if we only get one now, that's enough to continue. The rest 2985 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1463 mlog_errno(ret); 2990 mlog_errno(ret);
1464 goto out_commit; 2991 goto out_commit;
1465 } 2992 }
2993 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1466 2994
1467 /* 2995 /*
1468 * Operations are carefully ordered so that we set up the new 2996 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1489 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 3017 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1490 memset(dirdata_bh->b_data + i_size_read(dir), 0, 3018 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1491 sb->s_blocksize - i_size_read(dir)); 3019 sb->s_blocksize - i_size_read(dir));
1492 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); 3020 i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
1493 if (ocfs2_supports_dir_trailer(osb)) 3021 if (ocfs2_new_dir_wants_trailer(dir)) {
1494 ocfs2_init_dir_trailer(dir, dirdata_bh); 3022 /*
3023 * Prepare the dir trailer up front. It will otherwise look
3024 * like a valid dirent. Even if inserting the index fails
3025 * (unlikely), then all we'll have done is given the first dir
3026 * block a small amount of fragmentation.
3027 */
3028 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3029 }
1495 3030
1496 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3031 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1497 if (ret) { 3032 if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1499 goto out_commit; 3034 goto out_commit;
1500 } 3035 }
1501 3036
3037 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3038 /*
3039 * Dx dirs with an external cluster need to do this up
3040 * front. Inline dx root's get handled later, after
3041 * we've allocated our root block. We get passed back
3042 * a total number of items so that dr_num_entries can
3043 * be correctly set once the dx_root has been
3044 * allocated.
3045 */
3046 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
3047 num_dx_leaves, &num_dx_entries,
3048 dirdata_bh);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out_commit;
3052 }
3053 }
3054
1502 /* 3055 /*
1503 * Set extent, i_size, etc on the directory. After this, the 3056 * Set extent, i_size, etc on the directory. After this, the
1504 * inode should contain the same exact dirents as before and 3057 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1551 goto out_commit; 3104 goto out_commit;
1552 } 3105 }
1553 3106
3107 if (ocfs2_supports_indexed_dirs(osb)) {
3108 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
3109 dirdata_bh, meta_ac, dx_inline,
3110 num_dx_entries, &dx_root_bh);
3111 if (ret) {
3112 mlog_errno(ret);
3113 goto out_commit;
3114 }
3115
3116 if (dx_inline) {
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh);
3119 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL);
3123 if (ret)
3124 mlog_errno(ret);
3125 }
3126 }
3127
1554 /* 3128 /*
1555 * We asked for two clusters, but only got one in the 1st 3129 * We asked for two clusters, but only got one in the 1st
1556 * pass. Claim the 2nd cluster as a separate extent. 3130 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1570 mlog_errno(ret); 3144 mlog_errno(ret);
1571 goto out_commit; 3145 goto out_commit;
1572 } 3146 }
3147 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1573 } 3148 }
1574 3149
1575 *first_block_bh = dirdata_bh; 3150 *first_block_bh = dirdata_bh;
1576 dirdata_bh = NULL; 3151 dirdata_bh = NULL;
3152 if (ocfs2_supports_indexed_dirs(osb)) {
3153 unsigned int off;
3154
3155 if (!dx_inline) {
3156 /*
3157 * We need to return the correct block within the
3158 * cluster which should hold our entry.
3159 */
3160 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
3161 &lookup->dl_hinfo);
3162 get_bh(dx_leaves[off]);
3163 lookup->dl_dx_leaf_bh = dx_leaves[off];
3164 }
3165 lookup->dl_dx_root_bh = dx_root_bh;
3166 dx_root_bh = NULL;
3167 }
1577 3168
1578out_commit: 3169out_commit:
1579 if (ret < 0 && did_quota) 3170 if (ret < 0 && did_quota)
1580 vfs_dq_free_space_nodirty(dir, 3171 vfs_dq_free_space_nodirty(dir, bytes_allocated);
1581 ocfs2_clusters_to_bytes(osb->sb, 2)); 3172
1582 ocfs2_commit_trans(osb, handle); 3173 ocfs2_commit_trans(osb, handle);
1583 3174
1584out_sem: 3175out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
1587out: 3178out:
1588 if (data_ac) 3179 if (data_ac)
1589 ocfs2_free_alloc_context(data_ac); 3180 ocfs2_free_alloc_context(data_ac);
3181 if (meta_ac)
3182 ocfs2_free_alloc_context(meta_ac);
3183
3184 if (dx_leaves) {
3185 for (i = 0; i < num_dx_leaves; i++)
3186 brelse(dx_leaves[i]);
3187 kfree(dx_leaves);
3188 }
1590 3189
1591 brelse(dirdata_bh); 3190 brelse(dirdata_bh);
3191 brelse(dx_root_bh);
1592 3192
1593 return ret; 3193 return ret;
1594} 3194}
@@ -1658,11 +3258,14 @@ bail:
1658 * is to be turned into an extent based one. The size of the dirent to 3258 * is to be turned into an extent based one. The size of the dirent to
1659 * insert might be larger than the space gained by growing to just one 3259 * insert might be larger than the space gained by growing to just one
1660 * block, so we may have to grow the inode by two blocks in that case. 3260 * block, so we may have to grow the inode by two blocks in that case.
3261 *
3262 * If the directory is already indexed, dx_root_bh must be provided.
1661 */ 3263 */
1662static int ocfs2_extend_dir(struct ocfs2_super *osb, 3264static int ocfs2_extend_dir(struct ocfs2_super *osb,
1663 struct inode *dir, 3265 struct inode *dir,
1664 struct buffer_head *parent_fe_bh, 3266 struct buffer_head *parent_fe_bh,
1665 unsigned int blocks_wanted, 3267 unsigned int blocks_wanted,
3268 struct ocfs2_dir_lookup_result *lookup,
1666 struct buffer_head **new_de_bh) 3269 struct buffer_head **new_de_bh)
1667{ 3270{
1668 int status = 0; 3271 int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1677 struct ocfs2_dir_entry * de; 3280 struct ocfs2_dir_entry * de;
1678 struct super_block *sb = osb->sb; 3281 struct super_block *sb = osb->sb;
1679 struct ocfs2_extent_tree et; 3282 struct ocfs2_extent_tree et;
3283 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1680 3284
1681 mlog_entry_void(); 3285 mlog_entry_void();
1682 3286
1683 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3287 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3288 /*
3289 * This would be a code error as an inline directory should
3290 * never have an index root.
3291 */
3292 BUG_ON(dx_root_bh);
3293
1684 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 3294 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1685 blocks_wanted, &new_bh); 3295 blocks_wanted, lookup,
3296 &new_bh);
1686 if (status) { 3297 if (status) {
1687 mlog_errno(status); 3298 mlog_errno(status);
1688 goto bail; 3299 goto bail;
1689 } 3300 }
1690 3301
3302 /* Expansion from inline to an indexed directory will
3303 * have given us this. */
3304 dx_root_bh = lookup->dl_dx_root_bh;
3305
1691 if (blocks_wanted == 1) { 3306 if (blocks_wanted == 1) {
1692 /* 3307 /*
1693 * If the new dirent will fit inside the space 3308 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1751 } 3366 }
1752 3367
1753do_extend: 3368do_extend:
3369 if (ocfs2_dir_indexed(dir))
3370 credits++; /* For attaching the new dirent block to the
3371 * dx_root */
3372
1754 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3373 down_write(&OCFS2_I(dir)->ip_alloc_sem);
1755 drop_alloc_sem = 1; 3374 drop_alloc_sem = 1;
1756 3375
@@ -1781,9 +3400,19 @@ do_extend:
1781 3400
1782 de = (struct ocfs2_dir_entry *) new_bh->b_data; 3401 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1783 de->inode = 0; 3402 de->inode = 0;
1784 if (ocfs2_dir_has_trailer(dir)) { 3403 if (ocfs2_supports_dir_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); 3404 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh); 3405
3406 ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
3407
3408 if (ocfs2_dir_indexed(dir)) {
3409 status = ocfs2_dx_dir_link_trailer(dir, handle,
3410 dx_root_bh, new_bh);
3411 if (status) {
3412 mlog_errno(status);
3413 goto bail;
3414 }
3415 }
1787 } else { 3416 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize); 3417 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 } 3418 }
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1839 * This calculates how many free bytes we'd have in block zero, should 3468 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree. 3469 * this function force expansion to an extent tree.
1841 */ 3470 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 3471 if (ocfs2_new_dir_wants_trailer(dir))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); 3472 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else 3473 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir); 3474 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
1970 return status; 3599 return status;
1971} 3600}
1972 3601
3602static int dx_leaf_sort_cmp(const void *a, const void *b)
3603{
3604 const struct ocfs2_dx_entry *entry1 = a;
3605 const struct ocfs2_dx_entry *entry2 = b;
3606 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3607 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3608 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3609 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3610
3611 if (major_hash1 > major_hash2)
3612 return 1;
3613 if (major_hash1 < major_hash2)
3614 return -1;
3615
3616 /*
3617 * It is not strictly necessary to sort by minor
3618 */
3619 if (minor_hash1 > minor_hash2)
3620 return 1;
3621 if (minor_hash1 < minor_hash2)
3622 return -1;
3623 return 0;
3624}
3625
3626static void dx_leaf_sort_swap(void *a, void *b, int size)
3627{
3628 struct ocfs2_dx_entry *entry1 = a;
3629 struct ocfs2_dx_entry *entry2 = b;
3630 struct ocfs2_dx_entry tmp;
3631
3632 BUG_ON(size != sizeof(*entry1));
3633
3634 tmp = *entry1;
3635 *entry1 = *entry2;
3636 *entry2 = tmp;
3637}
3638
3639static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3640{
3641 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3642 int i, num = le16_to_cpu(dl_list->de_num_used);
3643
3644 for (i = 0; i < (num - 1); i++) {
3645 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3646 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3647 return 0;
3648 }
3649
3650 return 1;
3651}
3652
3653/*
3654 * Find the optimal value to split this leaf on. This expects the leaf
3655 * entries to be in sorted order.
3656 *
3657 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3658 * the hash we want to insert.
3659 *
3660 * This function is only concerned with the major hash - that which
3661 * determines which cluster an item belongs to.
3662 */
3663static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3664 u32 leaf_cpos, u32 insert_hash,
3665 u32 *split_hash)
3666{
3667 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3668 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3669 int allsame;
3670
3671 /*
3672 * There's a couple rare, but nasty corner cases we have to
3673 * check for here. All of them involve a leaf where all value
3674 * have the same hash, which is what we look for first.
3675 *
3676 * Most of the time, all of the above is false, and we simply
3677 * pick the median value for a split.
3678 */
3679 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3680 if (allsame) {
3681 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3682
3683 if (val == insert_hash) {
3684 /*
3685 * No matter where we would choose to split,
3686 * the new entry would want to occupy the same
3687 * block as these. Since there's no space left
3688 * in their existing block, we know there
3689 * won't be space after the split.
3690 */
3691 return -ENOSPC;
3692 }
3693
3694 if (val == leaf_cpos) {
3695 /*
3696 * Because val is the same as leaf_cpos (which
3697 * is the smallest value this leaf can have),
3698 * yet is not equal to insert_hash, then we
3699 * know that insert_hash *must* be larger than
3700 * val (and leaf_cpos). At least cpos+1 in value.
3701 *
3702 * We also know then, that there cannot be an
3703 * adjacent extent (otherwise we'd be looking
3704 * at it). Choosing this value gives us a
3705 * chance to get some contiguousness.
3706 */
3707 *split_hash = leaf_cpos + 1;
3708 return 0;
3709 }
3710
3711 if (val > insert_hash) {
3712 /*
3713 * val can not be the same as insert hash, and
3714 * also must be larger than leaf_cpos. Also,
3715 * we know that there can't be a leaf between
3716 * cpos and val, otherwise the entries with
3717 * hash 'val' would be there.
3718 */
3719 *split_hash = val;
3720 return 0;
3721 }
3722
3723 *split_hash = insert_hash;
3724 return 0;
3725 }
3726
3727 /*
3728 * Since the records are sorted and the checks above
3729 * guaranteed that not all records in this block are the same,
3730 * we simple travel forward, from the median, and pick the 1st
3731 * record whose value is larger than leaf_cpos.
3732 */
3733 for (i = (num_used / 2); i < num_used; i++)
3734 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3735 leaf_cpos)
3736 break;
3737
3738 BUG_ON(i == num_used); /* Should be impossible */
3739 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3740 return 0;
3741}
3742
3743/*
3744 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3745 * larger than split_hash into new_dx_leaves. We use a temporary
3746 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3747 *
3748 * Since the block offset inside a leaf (cluster) is a constant mask
3749 * of minor_hash, we can optimize - an item at block offset X within
3750 * the original cluster, will be at offset X within the new cluster.
3751 */
3752static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3753 handle_t *handle,
3754 struct ocfs2_dx_leaf *tmp_dx_leaf,
3755 struct buffer_head **orig_dx_leaves,
3756 struct buffer_head **new_dx_leaves,
3757 int num_dx_leaves)
3758{
3759 int i, j, num_used;
3760 u32 major_hash;
3761 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3762 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3763 struct ocfs2_dx_entry *dx_entry;
3764
3765 tmp_list = &tmp_dx_leaf->dl_list;
3766
3767 for (i = 0; i < num_dx_leaves; i++) {
3768 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3769 orig_list = &orig_dx_leaf->dl_list;
3770 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3771 new_list = &new_dx_leaf->dl_list;
3772
3773 num_used = le16_to_cpu(orig_list->de_num_used);
3774
3775 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3776 tmp_list->de_num_used = cpu_to_le16(0);
3777 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3778
3779 for (j = 0; j < num_used; j++) {
3780 dx_entry = &orig_list->de_entries[j];
3781 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3782 if (major_hash >= split_hash)
3783 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3784 dx_entry);
3785 else
3786 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3787 dx_entry);
3788 }
3789 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3790
3791 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3792 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3793 }
3794}
3795
3796static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3797 struct ocfs2_dx_root_block *dx_root)
3798{
3799 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3800
3801 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3802 credits += ocfs2_quota_trans_credits(osb->sb);
3803 return credits;
3804}
3805
3806/*
3807 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3808 * half our entries into.
3809 */
3810static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3811 struct buffer_head *dx_root_bh,
3812 struct buffer_head *dx_leaf_bh,
3813 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3814 u64 leaf_blkno)
3815{
3816 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3817 int credits, ret, i, num_used, did_quota = 0;
3818 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3819 u64 orig_leaves_start;
3820 int num_dx_leaves;
3821 struct buffer_head **orig_dx_leaves = NULL;
3822 struct buffer_head **new_dx_leaves = NULL;
3823 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3824 struct ocfs2_extent_tree et;
3825 handle_t *handle = NULL;
3826 struct ocfs2_dx_root_block *dx_root;
3827 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3828
3829 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3830 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3831 (unsigned long long)leaf_blkno, insert_hash);
3832
3833 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3834
3835 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3836 /*
3837 * XXX: This is a rather large limit. We should use a more
3838 * realistic value.
3839 */
3840 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3841 return -ENOSPC;
3842
3843 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3844 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3845 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
3846 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3847 (unsigned long long)leaf_blkno, num_used);
3848 ret = -EIO;
3849 goto out;
3850 }
3851
3852 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3853 if (!orig_dx_leaves) {
3854 ret = -ENOMEM;
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3860 if (!new_dx_leaves) {
3861 ret = -ENOMEM;
3862 mlog_errno(ret);
3863 goto out;
3864 }
3865
3866 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3867 if (ret) {
3868 if (ret != -ENOSPC)
3869 mlog_errno(ret);
3870 goto out;
3871 }
3872
3873 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3874 handle = ocfs2_start_trans(osb, credits);
3875 if (IS_ERR(handle)) {
3876 ret = PTR_ERR(handle);
3877 handle = NULL;
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881
3882 if (vfs_dq_alloc_space_nodirty(dir,
3883 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3884 ret = -EDQUOT;
3885 goto out_commit;
3886 }
3887 did_quota = 1;
3888
3889 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3890 OCFS2_JOURNAL_ACCESS_WRITE);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out_commit;
3894 }
3895
3896 /*
3897 * This block is changing anyway, so we can sort it in place.
3898 */
3899 sort(dx_leaf->dl_list.de_entries, num_used,
3900 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3901 dx_leaf_sort_swap);
3902
3903 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3904 if (ret) {
3905 mlog_errno(ret);
3906 goto out_commit;
3907 }
3908
3909 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3910 &split_hash);
3911 if (ret) {
3912 mlog_errno(ret);
3913 goto out_commit;
3914 }
3915
3916 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3917 leaf_cpos, split_hash, insert_hash);
3918
3919 /*
3920 * We have to carefully order operations here. There are items
3921 * which want to be in the new cluster before insert, but in
3922 * order to put those items in the new cluster, we alter the
3923 * old cluster. A failure to insert gets nasty.
3924 *
3925 * So, start by reserving writes to the old
3926 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3927 * the new cluster for us, before inserting it. The insert
3928 * won't happen if there's an error before that. Once the
3929 * insert is done then, we can transfer from one leaf into the
3930 * other without fear of hitting any error.
3931 */
3932
3933 /*
3934 * The leaf transfer wants some scratch space so that we don't
3935 * wind up doing a bunch of expensive memmove().
3936 */
3937 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3938 if (!tmp_dx_leaf) {
3939 ret = -ENOMEM;
3940 mlog_errno(ret);
3941 goto out_commit;
3942 }
3943
3944 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3945 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3946 orig_dx_leaves);
3947 if (ret) {
3948 mlog_errno(ret);
3949 goto out_commit;
3950 }
3951
3952 for (i = 0; i < num_dx_leaves; i++) {
3953 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3954 OCFS2_JOURNAL_ACCESS_WRITE);
3955 if (ret) {
3956 mlog_errno(ret);
3957 goto out_commit;
3958 }
3959 }
3960
3961 cpos = split_hash;
3962 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3963 data_ac, meta_ac, new_dx_leaves,
3964 num_dx_leaves);
3965 if (ret) {
3966 mlog_errno(ret);
3967 goto out_commit;
3968 }
3969
3970 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3971 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3972
3973out_commit:
3974 if (ret < 0 && did_quota)
3975 vfs_dq_free_space_nodirty(dir,
3976 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3977
3978 ocfs2_commit_trans(osb, handle);
3979
3980out:
3981 if (orig_dx_leaves || new_dx_leaves) {
3982 for (i = 0; i < num_dx_leaves; i++) {
3983 if (orig_dx_leaves)
3984 brelse(orig_dx_leaves[i]);
3985 if (new_dx_leaves)
3986 brelse(new_dx_leaves[i]);
3987 }
3988 kfree(orig_dx_leaves);
3989 kfree(new_dx_leaves);
3990 }
3991
3992 if (meta_ac)
3993 ocfs2_free_alloc_context(meta_ac);
3994 if (data_ac)
3995 ocfs2_free_alloc_context(data_ac);
3996
3997 kfree(tmp_dx_leaf);
3998 return ret;
3999}
4000
4001static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
4002 struct buffer_head *di_bh,
4003 struct buffer_head *dx_root_bh,
4004 const char *name, int namelen,
4005 struct ocfs2_dir_lookup_result *lookup)
4006{
4007 int ret, rebalanced = 0;
4008 struct ocfs2_dx_root_block *dx_root;
4009 struct buffer_head *dx_leaf_bh = NULL;
4010 struct ocfs2_dx_leaf *dx_leaf;
4011 u64 blkno;
4012 u32 leaf_cpos;
4013
4014 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4015
4016restart_search:
4017 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
4018 &leaf_cpos, &blkno);
4019 if (ret) {
4020 mlog_errno(ret);
4021 goto out;
4022 }
4023
4024 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4025 if (ret) {
4026 mlog_errno(ret);
4027 goto out;
4028 }
4029
4030 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4031
4032 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4033 le16_to_cpu(dx_leaf->dl_list.de_count)) {
4034 if (rebalanced) {
4035 /*
4036 * Rebalancing should have provided us with
4037 * space in an appropriate leaf.
4038 *
4039 * XXX: Is this an abnormal condition then?
4040 * Should we print a message here?
4041 */
4042 ret = -ENOSPC;
4043 goto out;
4044 }
4045
4046 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4047 &lookup->dl_hinfo, leaf_cpos,
4048 blkno);
4049 if (ret) {
4050 if (ret != -ENOSPC)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4055 /*
4056 * Restart the lookup. The rebalance might have
4057 * changed which block our item fits into. Mark our
4058 * progress, so we only execute this once.
4059 */
4060 brelse(dx_leaf_bh);
4061 dx_leaf_bh = NULL;
4062 rebalanced = 1;
4063 goto restart_search;
4064 }
4065
4066 lookup->dl_dx_leaf_bh = dx_leaf_bh;
4067 dx_leaf_bh = NULL;
4068
4069out:
4070 brelse(dx_leaf_bh);
4071 return ret;
4072}
4073
4074static int ocfs2_search_dx_free_list(struct inode *dir,
4075 struct buffer_head *dx_root_bh,
4076 int namelen,
4077 struct ocfs2_dir_lookup_result *lookup)
4078{
4079 int ret = -ENOSPC;
4080 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4081 struct ocfs2_dir_block_trailer *db;
4082 u64 next_block;
4083 int rec_len = OCFS2_DIR_REC_LEN(namelen);
4084 struct ocfs2_dx_root_block *dx_root;
4085
4086 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4087 next_block = le64_to_cpu(dx_root->dr_free_blk);
4088
4089 while (next_block) {
4090 brelse(prev_leaf_bh);
4091 prev_leaf_bh = leaf_bh;
4092 leaf_bh = NULL;
4093
4094 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4095 if (ret) {
4096 mlog_errno(ret);
4097 goto out;
4098 }
4099
4100 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4101 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4102 lookup->dl_leaf_bh = leaf_bh;
4103 lookup->dl_prev_leaf_bh = prev_leaf_bh;
4104 leaf_bh = NULL;
4105 prev_leaf_bh = NULL;
4106 break;
4107 }
4108
4109 next_block = le64_to_cpu(db->db_free_next);
4110 }
4111
4112 if (!next_block)
4113 ret = -ENOSPC;
4114
4115out:
4116
4117 brelse(leaf_bh);
4118 brelse(prev_leaf_bh);
4119 return ret;
4120}
4121
4122static int ocfs2_expand_inline_dx_root(struct inode *dir,
4123 struct buffer_head *dx_root_bh)
4124{
4125 int ret, num_dx_leaves, i, j, did_quota = 0;
4126 struct buffer_head **dx_leaves = NULL;
4127 struct ocfs2_extent_tree et;
4128 u64 insert_blkno;
4129 struct ocfs2_alloc_context *data_ac = NULL;
4130 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4131 handle_t *handle = NULL;
4132 struct ocfs2_dx_root_block *dx_root;
4133 struct ocfs2_dx_entry_list *entry_list;
4134 struct ocfs2_dx_entry *dx_entry;
4135 struct ocfs2_dx_leaf *target_leaf;
4136
4137 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4138 if (ret) {
4139 mlog_errno(ret);
4140 goto out;
4141 }
4142
4143 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4144 if (!dx_leaves) {
4145 ret = -ENOMEM;
4146 mlog_errno(ret);
4147 goto out;
4148 }
4149
4150 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4151 if (IS_ERR(handle)) {
4152 ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 if (vfs_dq_alloc_space_nodirty(dir,
4158 ocfs2_clusters_to_bytes(osb->sb, 1))) {
4159 ret = -EDQUOT;
4160 goto out_commit;
4161 }
4162 did_quota = 1;
4163
4164 /*
4165 * We do this up front, before the allocation, so that a
4166 * failure to add the dx_root_bh to the journal won't result
4167 * us losing clusters.
4168 */
4169 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
4170 OCFS2_JOURNAL_ACCESS_WRITE);
4171 if (ret) {
4172 mlog_errno(ret);
4173 goto out_commit;
4174 }
4175
4176 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4177 num_dx_leaves, &insert_blkno);
4178 if (ret) {
4179 mlog_errno(ret);
4180 goto out_commit;
4181 }
4182
4183 /*
4184 * Transfer the entries from our dx_root into the appropriate
4185 * block
4186 */
4187 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4188 entry_list = &dx_root->dr_entries;
4189
4190 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4191 dx_entry = &entry_list->de_entries[i];
4192
4193 j = __ocfs2_dx_dir_hash_idx(osb,
4194 le32_to_cpu(dx_entry->dx_minor_hash));
4195 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4196
4197 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4198
4199 /* Each leaf has been passed to the journal already
4200 * via __ocfs2_dx_dir_new_cluster() */
4201 }
4202
4203 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4204 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4205 offsetof(struct ocfs2_dx_root_block, dr_list));
4206 dx_root->dr_list.l_count =
4207 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4208
4209 /* This should never fail considering we start with an empty
4210 * dx_root. */
4211 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4212 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
4213 insert_blkno, 1, 0, NULL);
4214 if (ret)
4215 mlog_errno(ret);
4216 did_quota = 0;
4217
4218 ocfs2_journal_dirty(handle, dx_root_bh);
4219
4220out_commit:
4221 if (ret < 0 && did_quota)
4222 vfs_dq_free_space_nodirty(dir,
4223 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4224
4225 ocfs2_commit_trans(osb, handle);
4226
4227out:
4228 if (data_ac)
4229 ocfs2_free_alloc_context(data_ac);
4230
4231 if (dx_leaves) {
4232 for (i = 0; i < num_dx_leaves; i++)
4233 brelse(dx_leaves[i]);
4234 kfree(dx_leaves);
4235 }
4236 return ret;
4237}
4238
4239static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4240{
4241 struct ocfs2_dx_root_block *dx_root;
4242 struct ocfs2_dx_entry_list *entry_list;
4243
4244 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4245 entry_list = &dx_root->dr_entries;
4246
4247 if (le16_to_cpu(entry_list->de_num_used) >=
4248 le16_to_cpu(entry_list->de_count))
4249 return -ENOSPC;
4250
4251 return 0;
4252}
4253
4254static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4255 struct buffer_head *di_bh,
4256 const char *name,
4257 int namelen,
4258 struct ocfs2_dir_lookup_result *lookup)
4259{
4260 int ret, free_dx_root = 1;
4261 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4262 struct buffer_head *dx_root_bh = NULL;
4263 struct buffer_head *leaf_bh = NULL;
4264 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4265 struct ocfs2_dx_root_block *dx_root;
4266
4267 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4268 if (ret) {
4269 mlog_errno(ret);
4270 goto out;
4271 }
4272
4273 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4274 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4275 ret = -ENOSPC;
4276 mlog_errno(ret);
4277 goto out;
4278 }
4279
4280 if (ocfs2_dx_root_inline(dx_root)) {
4281 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4282
4283 if (ret == 0)
4284 goto search_el;
4285
4286 /*
4287 * We ran out of room in the root block. Expand it to
4288 * an extent, then allow ocfs2_find_dir_space_dx to do
4289 * the rest.
4290 */
4291 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4292 if (ret) {
4293 mlog_errno(ret);
4294 goto out;
4295 }
4296 }
4297
4298 /*
4299 * Insert preparation for an indexed directory is split into two
4300 * steps. The call to find_dir_space_dx reserves room in the index for
4301 * an additional item. If we run out of space there, it's a real error
4302 * we can't continue on.
4303 */
4304 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4305 namelen, lookup);
4306 if (ret) {
4307 mlog_errno(ret);
4308 goto out;
4309 }
4310
4311search_el:
4312 /*
4313 * Next, we need to find space in the unindexed tree. This call
4314 * searches using the free space linked list. If the unindexed tree
4315 * lacks sufficient space, we'll expand it below. The expansion code
4316 * is smart enough to add any new blocks to the free space list.
4317 */
4318 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4319 if (ret && ret != -ENOSPC) {
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4325 lookup->dl_dx_root_bh = dx_root_bh;
4326 free_dx_root = 0;
4327
4328 if (ret == -ENOSPC) {
4329 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4330
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out;
4334 }
4335
4336 /*
4337 * We make the assumption here that new leaf blocks are added
4338 * to the front of our free list.
4339 */
4340 lookup->dl_prev_leaf_bh = NULL;
4341 lookup->dl_leaf_bh = leaf_bh;
4342 }
4343
4344out:
4345 if (free_dx_root)
4346 brelse(dx_root_bh);
4347 return ret;
4348}
4349
4350/*
4351 * Get a directory ready for insert. Any directory allocation required
4352 * happens here. Success returns zero, and enough context in the dir
4353 * lookup result that ocfs2_add_entry() will be able to complete the task
4354 * with minimal performance impact.
4355 */
1973int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 4356int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1974 struct inode *dir, 4357 struct inode *dir,
1975 struct buffer_head *parent_fe_bh, 4358 struct buffer_head *parent_fe_bh,
1976 const char *name, 4359 const char *name,
1977 int namelen, 4360 int namelen,
1978 struct buffer_head **ret_de_bh) 4361 struct ocfs2_dir_lookup_result *lookup)
1979{ 4362{
1980 int ret; 4363 int ret;
1981 unsigned int blocks_wanted = 1; 4364 unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1984 mlog(0, "getting ready to insert namelen %d into dir %llu\n", 4367 mlog(0, "getting ready to insert namelen %d into dir %llu\n",
1985 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); 4368 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
1986 4369
1987 *ret_de_bh = NULL;
1988
1989 if (!namelen) { 4370 if (!namelen) {
1990 ret = -EINVAL; 4371 ret = -EINVAL;
1991 mlog_errno(ret); 4372 mlog_errno(ret);
1992 goto out; 4373 goto out;
1993 } 4374 }
1994 4375
4376 /*
4377 * Do this up front to reduce confusion.
4378 *
4379 * The directory might start inline, then be turned into an
4380 * indexed one, in which case we'd need to hash deep inside
4381 * ocfs2_find_dir_space_id(). Since
4382 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
4383 * done, there seems no point in spreading out the calls. We
4384 * can optimize away the case where the file system doesn't
4385 * support indexing.
4386 */
4387 if (ocfs2_supports_indexed_dirs(osb))
4388 ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
4389
4390 if (ocfs2_dir_indexed(dir)) {
4391 ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
4392 name, namelen, lookup);
4393 if (ret)
4394 mlog_errno(ret);
4395 goto out;
4396 }
4397
1995 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 4398 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1996 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, 4399 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
1997 namelen, &bh, &blocks_wanted); 4400 namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2010 BUG_ON(bh); 4413 BUG_ON(bh);
2011 4414
2012 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, 4415 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
2013 &bh); 4416 lookup, &bh);
2014 if (ret) { 4417 if (ret) {
2015 if (ret != -ENOSPC) 4418 if (ret != -ENOSPC)
2016 mlog_errno(ret); 4419 mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2020 BUG_ON(!bh); 4423 BUG_ON(!bh);
2021 } 4424 }
2022 4425
2023 *ret_de_bh = bh; 4426 lookup->dl_leaf_bh = bh;
2024 bh = NULL; 4427 bh = NULL;
2025out: 4428out:
2026 brelse(bh); 4429 brelse(bh);
2027 return ret; 4430 return ret;
2028} 4431}
4432
4433static int ocfs2_dx_dir_remove_index(struct inode *dir,
4434 struct buffer_head *di_bh,
4435 struct buffer_head *dx_root_bh)
4436{
4437 int ret;
4438 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4439 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4440 struct ocfs2_dx_root_block *dx_root;
4441 struct inode *dx_alloc_inode = NULL;
4442 struct buffer_head *dx_alloc_bh = NULL;
4443 handle_t *handle;
4444 u64 blk;
4445 u16 bit;
4446 u64 bg_blkno;
4447
4448 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4449
4450 dx_alloc_inode = ocfs2_get_system_file_inode(osb,
4451 EXTENT_ALLOC_SYSTEM_INODE,
4452 le16_to_cpu(dx_root->dr_suballoc_slot));
4453 if (!dx_alloc_inode) {
4454 ret = -ENOMEM;
4455 mlog_errno(ret);
4456 goto out;
4457 }
4458 mutex_lock(&dx_alloc_inode->i_mutex);
4459
4460 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
4461 if (ret) {
4462 mlog_errno(ret);
4463 goto out_mutex;
4464 }
4465
4466 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
4467 if (IS_ERR(handle)) {
4468 ret = PTR_ERR(handle);
4469 mlog_errno(ret);
4470 goto out_unlock;
4471 }
4472
4473 ret = ocfs2_journal_access_di(handle, dir, di_bh,
4474 OCFS2_JOURNAL_ACCESS_WRITE);
4475 if (ret) {
4476 mlog_errno(ret);
4477 goto out_commit;
4478 }
4479
4480 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4481 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4482 di->i_dx_root = cpu_to_le64(0ULL);
4483
4484 ocfs2_journal_dirty(handle, di_bh);
4485
4486 blk = le64_to_cpu(dx_root->dr_blkno);
4487 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4488 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4489 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4490 bit, bg_blkno, 1);
4491 if (ret)
4492 mlog_errno(ret);
4493
4494out_commit:
4495 ocfs2_commit_trans(osb, handle);
4496
4497out_unlock:
4498 ocfs2_inode_unlock(dx_alloc_inode, 1);
4499
4500out_mutex:
4501 mutex_unlock(&dx_alloc_inode->i_mutex);
4502 brelse(dx_alloc_bh);
4503out:
4504 iput(dx_alloc_inode);
4505 return ret;
4506}
4507
4508int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4509{
4510 int ret;
4511 unsigned int uninitialized_var(clen);
4512 u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
4513 u64 uninitialized_var(blkno);
4514 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4515 struct buffer_head *dx_root_bh = NULL;
4516 struct ocfs2_dx_root_block *dx_root;
4517 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4518 struct ocfs2_cached_dealloc_ctxt dealloc;
4519 struct ocfs2_extent_tree et;
4520
4521 ocfs2_init_dealloc_ctxt(&dealloc);
4522
4523 if (!ocfs2_dir_indexed(dir))
4524 return 0;
4525
4526 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4527 if (ret) {
4528 mlog_errno(ret);
4529 goto out;
4530 }
4531 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4532
4533 if (ocfs2_dx_root_inline(dx_root))
4534 goto remove_index;
4535
4536 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4537
4538 /* XXX: What if dr_clusters is too large? */
4539 while (le32_to_cpu(dx_root->dr_clusters)) {
4540 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
4541 major_hash, &cpos, &blkno, &clen);
4542 if (ret) {
4543 mlog_errno(ret);
4544 goto out;
4545 }
4546
4547 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4548
4549 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
4550 &dealloc);
4551 if (ret) {
4552 mlog_errno(ret);
4553 goto out;
4554 }
4555
4556 if (cpos == 0)
4557 break;
4558
4559 major_hash = cpos - 1;
4560 }
4561
4562remove_index:
4563 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
4564 if (ret) {
4565 mlog_errno(ret);
4566 goto out;
4567 }
4568
4569 ocfs2_remove_from_cache(dir, dx_root_bh);
4570out:
4571 ocfs2_schedule_truncate_log_flush(osb, 1);
4572 ocfs2_run_deallocs(osb, &dealloc);
4573
4574 brelse(dx_root_bh);
4575 return ret;
4576}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
26#ifndef OCFS2_DIR_H 26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H 27#define OCFS2_DIR_H
28 28
29struct buffer_head *ocfs2_find_entry(const char *name, 29struct ocfs2_dx_hinfo {
30 int namelen, 30 u32 major_hash;
31 struct inode *dir, 31 u32 minor_hash;
32 struct ocfs2_dir_entry **res_dir); 32};
33
34struct ocfs2_dir_lookup_result {
35 struct buffer_head *dl_leaf_bh; /* Unindexed leaf
36 * block */
37 struct ocfs2_dir_entry *dl_entry; /* Target dirent in
38 * unindexed leaf */
39
40 struct buffer_head *dl_dx_root_bh; /* Root of indexed
41 * tree */
42
43 struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */
44 struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in
45 * indexed leaf */
46 struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */
47
48 struct buffer_head *dl_prev_leaf_bh;/* Previous entry in
49 * dir free space
50 * list. NULL if
51 * previous entry is
52 * dx root block. */
53};
54
55void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
56
57int ocfs2_find_entry(const char *name, int namelen,
58 struct inode *dir,
59 struct ocfs2_dir_lookup_result *lookup);
33int ocfs2_delete_entry(handle_t *handle, 60int ocfs2_delete_entry(handle_t *handle,
34 struct inode *dir, 61 struct inode *dir,
35 struct ocfs2_dir_entry *de_del, 62 struct ocfs2_dir_lookup_result *res);
36 struct buffer_head *bh);
37int __ocfs2_add_entry(handle_t *handle, 63int __ocfs2_add_entry(handle_t *handle,
38 struct inode *dir, 64 struct inode *dir,
39 const char *name, int namelen, 65 const char *name, int namelen,
40 struct inode *inode, u64 blkno, 66 struct inode *inode, u64 blkno,
41 struct buffer_head *parent_fe_bh, 67 struct buffer_head *parent_fe_bh,
42 struct buffer_head *insert_bh); 68 struct ocfs2_dir_lookup_result *lookup);
43static inline int ocfs2_add_entry(handle_t *handle, 69static inline int ocfs2_add_entry(handle_t *handle,
44 struct dentry *dentry, 70 struct dentry *dentry,
45 struct inode *inode, u64 blkno, 71 struct inode *inode, u64 blkno,
46 struct buffer_head *parent_fe_bh, 72 struct buffer_head *parent_fe_bh,
47 struct buffer_head *insert_bh) 73 struct ocfs2_dir_lookup_result *lookup)
48{ 74{
49 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, 75 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
50 dentry->d_name.name, dentry->d_name.len, 76 dentry->d_name.name, dentry->d_name.len,
51 inode, blkno, parent_fe_bh, insert_bh); 77 inode, blkno, parent_fe_bh, lookup);
52} 78}
53int ocfs2_update_entry(struct inode *dir, handle_t *handle, 79int ocfs2_update_entry(struct inode *dir, handle_t *handle,
54 struct buffer_head *de_bh, struct ocfs2_dir_entry *de, 80 struct ocfs2_dir_lookup_result *res,
55 struct inode *new_entry_inode); 81 struct inode *new_entry_inode);
56 82
57int ocfs2_check_dir_for_entry(struct inode *dir, 83int ocfs2_check_dir_for_entry(struct inode *dir,
58 const char *name, 84 const char *name,
59 int namelen); 85 int namelen);
60int ocfs2_empty_dir(struct inode *inode); 86int ocfs2_empty_dir(struct inode *inode);
87
61int ocfs2_find_files_on_disk(const char *name, 88int ocfs2_find_files_on_disk(const char *name,
62 int namelen, 89 int namelen,
63 u64 *blkno, 90 u64 *blkno,
64 struct inode *inode, 91 struct inode *inode,
65 struct buffer_head **dirent_bh, 92 struct ocfs2_dir_lookup_result *res);
66 struct ocfs2_dir_entry **dirent);
67int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, 93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
68 int namelen, u64 *blkno); 94 int namelen, u64 *blkno);
69int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); 95int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
74 struct buffer_head *parent_fe_bh, 100 struct buffer_head *parent_fe_bh,
75 const char *name, 101 const char *name,
76 int namelen, 102 int namelen,
77 struct buffer_head **ret_de_bh); 103 struct ocfs2_dir_lookup_result *lookup);
78struct ocfs2_alloc_context; 104struct ocfs2_alloc_context;
79int ocfs2_fill_new_dir(struct ocfs2_super *osb, 105int ocfs2_fill_new_dir(struct ocfs2_super *osb,
80 handle_t *handle, 106 handle_t *handle,
81 struct inode *parent, 107 struct inode *parent,
82 struct inode *inode, 108 struct inode *inode,
83 struct buffer_head *fe_bh, 109 struct buffer_head *fe_bh,
84 struct ocfs2_alloc_context *data_ac); 110 struct ocfs2_alloc_context *data_ac,
111 struct ocfs2_alloc_context *meta_ac);
112
113int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
85 114
86struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, 115struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
87 void *data); 116 void *data);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
/*
 * Kinds of master list entry (mle). The values double as indices into
 * the per-type mle counters in struct dlm_ctxt, so DLM_MLE_NUM_TYPES
 * must stay last.
 */
enum dlm_mle_type {
	DLM_MLE_BLOCK,		/* shown as "BLK" in the debug dump */
	DLM_MLE_MASTER,		/* shown as "MAS" in the debug dump */
	DLM_MLE_MIGRATION,	/* shown as "MIG" in the debug dump */
	DLM_MLE_NUM_TYPES	/* number of types above, not a type */
};
62 58
63struct dlm_master_list_entry { 59struct dlm_master_list_entry {
64 struct list_head list; 60 struct hlist_node master_hash_node;
65 struct list_head hb_events; 61 struct list_head hb_events;
66 struct dlm_ctxt *dlm; 62 struct dlm_ctxt *dlm;
67 spinlock_t spinlock; 63 spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
78 enum dlm_mle_type type; 74 enum dlm_mle_type type;
79 struct o2hb_callback_func mle_hb_up; 75 struct o2hb_callback_func mle_hb_up;
80 struct o2hb_callback_func mle_hb_down; 76 struct o2hb_callback_func mle_hb_down;
81 union { 77 struct dlm_lock_resource *mleres;
82 struct dlm_lock_resource *res; 78 unsigned char mname[DLM_LOCKID_NAME_MAX];
83 struct dlm_lock_name name; 79 unsigned int mnamelen;
84 } u; 80 unsigned int mnamehash;
85}; 81};
86 82
87enum dlm_ast_type { 83enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
151 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 147 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
152 struct dlm_recovery_ctxt reco; 148 struct dlm_recovery_ctxt reco;
153 spinlock_t master_lock; 149 spinlock_t master_lock;
154 struct list_head master_list; 150 struct hlist_head **master_hash;
155 struct list_head mle_hb_events; 151 struct list_head mle_hb_events;
156 152
157 /* these give a really vague idea of the system load */ 153 /* these give a really vague idea of the system load */
158 atomic_t local_resources; 154 atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
159 atomic_t remote_resources; 155 atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
160 atomic_t unknown_resources; 156 atomic_t res_tot_count;
157 atomic_t res_cur_count;
161 158
162 struct dlm_debug_ctxt *dlm_debug_ctxt; 159 struct dlm_debug_ctxt *dlm_debug_ctxt;
163 struct dentry *dlm_debugfs_subroot; 160 struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
195 return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); 192 return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
196} 193}
197 194
195static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
196 unsigned i)
197{
198 return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
199 (i % DLM_BUCKETS_PER_PAGE);
200}
201
198/* these keventd work queue items are for less-frequently 202/* these keventd work queue items are for less-frequently
199 * called functions that cannot be directly called from the 203 * called functions that cannot be directly called from the
200 * net message handlers for some reason, usually because 204 * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
848 unsigned int len); 852 unsigned int len);
849 853
850int dlm_is_host_down(int errno); 854int dlm_is_host_down(int errno);
851void dlm_change_lockres_owner(struct dlm_ctxt *dlm, 855
852 struct dlm_lock_resource *res,
853 u8 owner);
854struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, 856struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
855 const char *lockid, 857 const char *lockid,
856 int namelen, 858 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
1008 DLM_LOCK_RES_MIGRATING)); 1010 DLM_LOCK_RES_MIGRATING));
1009} 1011}
1010 1012
1013void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
1014void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
1015
1011/* create/destroy slab caches */ 1016/* create/destroy slab caches */
1012int dlm_init_master_caches(void); 1017int dlm_init_master_caches(void);
1013void dlm_destroy_master_caches(void); 1018void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
1110 return bit; 1115 return bit;
1111} 1116}
1112 1117
1118static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
1119 struct dlm_lock_resource *res,
1120 u8 owner)
1121{
1122 assert_spin_locked(&res->spinlock);
1123
1124 res->owner = owner;
1125}
1113 1126
1127static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
1128 struct dlm_lock_resource *res,
1129 u8 owner)
1130{
1131 assert_spin_locked(&res->spinlock);
1132
1133 if (owner != res->owner)
1134 dlm_set_lockres_owner(dlm, res, owner);
1135}
1114 1136
1115#endif /* DLMCOMMON_H */ 1137#endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
287static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) 287static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
288{ 288{
289 int out = 0; 289 int out = 0;
290 unsigned int namelen;
291 const char *name;
292 char *mle_type; 290 char *mle_type;
293 291
294 if (mle->type != DLM_MLE_MASTER) {
295 namelen = mle->u.name.len;
296 name = mle->u.name.name;
297 } else {
298 namelen = mle->u.res->lockname.len;
299 name = mle->u.res->lockname.name;
300 }
301
302 if (mle->type == DLM_MLE_BLOCK) 292 if (mle->type == DLM_MLE_BLOCK)
303 mle_type = "BLK"; 293 mle_type = "BLK";
304 else if (mle->type == DLM_MLE_MASTER) 294 else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
306 else 296 else
307 mle_type = "MIG"; 297 mle_type = "MIG";
308 298
309 out += stringify_lockname(name, namelen, buf + out, len - out); 299 out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
310 out += snprintf(buf + out, len - out, 300 out += snprintf(buf + out, len - out,
311 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", 301 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
312 mle_type, mle->master, mle->new_master, 302 mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
501static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 491static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
502{ 492{
503 struct dlm_master_list_entry *mle; 493 struct dlm_master_list_entry *mle;
504 int out = 0; 494 struct hlist_head *bucket;
505 unsigned long total = 0; 495 struct hlist_node *list;
496 int i, out = 0;
497 unsigned long total = 0, longest = 0, bktcnt;
506 498
507 out += snprintf(db->buf + out, db->len - out, 499 out += snprintf(db->buf + out, db->len - out,
508 "Dumping MLEs for Domain: %s\n", dlm->name); 500 "Dumping MLEs for Domain: %s\n", dlm->name);
509 501
510 spin_lock(&dlm->master_lock); 502 spin_lock(&dlm->master_lock);
511 list_for_each_entry(mle, &dlm->master_list, list) { 503 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
512 ++total; 504 bucket = dlm_master_hash(dlm, i);
513 if (db->len - out < 200) 505 hlist_for_each(list, bucket) {
514 continue; 506 mle = hlist_entry(list, struct dlm_master_list_entry,
515 out += dump_mle(mle, db->buf + out, db->len - out); 507 master_hash_node);
508 ++total;
509 ++bktcnt;
510 if (db->len - out < 200)
511 continue;
512 out += dump_mle(mle, db->buf + out, db->len - out);
513 }
514 longest = max(longest, bktcnt);
515 bktcnt = 0;
516 } 516 }
517 spin_unlock(&dlm->master_lock); 517 spin_unlock(&dlm->master_lock);
518 518
519 out += snprintf(db->buf + out, db->len - out, 519 out += snprintf(db->buf + out, db->len - out,
520 "Total on list: %ld\n", total); 520 "Total: %ld, Longest: %ld\n", total, longest);
521 return out; 521 return out;
522} 522}
523 523
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
756 int out = 0; 756 int out = 0;
757 struct dlm_reco_node_data *node; 757 struct dlm_reco_node_data *node;
758 char *state; 758 char *state;
759 int lres, rres, ures, tres; 759 int cur_mles = 0, tot_mles = 0;
760 760 int i;
761 lres = atomic_read(&dlm->local_resources);
762 rres = atomic_read(&dlm->remote_resources);
763 ures = atomic_read(&dlm->unknown_resources);
764 tres = lres + rres + ures;
765 761
766 spin_lock(&dlm->spinlock); 762 spin_lock(&dlm->spinlock);
767 763
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
804 db->buf + out, db->len - out); 800 db->buf + out, db->len - out);
805 out += snprintf(db->buf + out, db->len - out, "\n"); 801 out += snprintf(db->buf + out, db->len - out, "\n");
806 802
807 /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */ 803 /* Lock Resources: xxx (xxx) */
804 out += snprintf(db->buf + out, db->len - out,
805 "Lock Resources: %d (%d)\n",
806 atomic_read(&dlm->res_cur_count),
807 atomic_read(&dlm->res_tot_count));
808
809 for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
810 tot_mles += atomic_read(&dlm->mle_tot_count[i]);
811
812 for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
813 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
814
815 /* MLEs: xxx (xxx) */
816 out += snprintf(db->buf + out, db->len - out,
817 "MLEs: %d (%d)\n", cur_mles, tot_mles);
818
819 /* Blocking: xxx (xxx) */
820 out += snprintf(db->buf + out, db->len - out,
821 " Blocking: %d (%d)\n",
822 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
823 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
824
825 /* Mastery: xxx (xxx) */
826 out += snprintf(db->buf + out, db->len - out,
827 " Mastery: %d (%d)\n",
828 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
829 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
830
831 /* Migration: xxx (xxx) */
808 out += snprintf(db->buf + out, db->len - out, 832 out += snprintf(db->buf + out, db->len - out,
809 "Mastered Resources Total: %d Locally: %d " 833 " Migration: %d (%d)\n",
810 "Remotely: %d Unknown: %d\n", 834 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
811 tres, lres, rres, ures); 835 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
812 836
813 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 837 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
814 out += snprintf(db->buf + out, db->len - out, 838 out += snprintf(db->buf + out, db->len - out,
815 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 839 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
816 "PendingBASTs=%s Master=%s\n", 840 "PendingBASTs=%s\n",
817 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 841 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
818 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), 842 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
819 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), 843 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
820 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"), 844 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
821 (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
822 845
823 /* Purge Count: xxx Refs: xxx */ 846 /* Purge Count: xxx Refs: xxx */
824 out += snprintf(db->buf + out, db->len - out, 847 out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
304 if (dlm->lockres_hash) 304 if (dlm->lockres_hash)
305 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 305 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
306 306
307 if (dlm->master_hash)
308 dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
309
307 if (dlm->name) 310 if (dlm->name)
308 kfree(dlm->name); 311 kfree(dlm->name);
309 312
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1534 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1537 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1535 INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); 1538 INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
1536 1539
1540 dlm->master_hash = (struct hlist_head **)
1541 dlm_alloc_pagevec(DLM_HASH_PAGES);
1542 if (!dlm->master_hash) {
1543 mlog_errno(-ENOMEM);
1544 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
1545 kfree(dlm->name);
1546 kfree(dlm);
1547 dlm = NULL;
1548 goto leave;
1549 }
1550
1551 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1552 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1553
1537 strcpy(dlm->name, domain); 1554 strcpy(dlm->name, domain);
1538 dlm->key = key; 1555 dlm->key = key;
1539 dlm->node_num = o2nm_this_node(); 1556 dlm->node_num = o2nm_this_node();
1540 1557
1541 ret = dlm_create_debugfs_subroot(dlm); 1558 ret = dlm_create_debugfs_subroot(dlm);
1542 if (ret < 0) { 1559 if (ret < 0) {
1560 dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
1543 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 1561 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
1544 kfree(dlm->name); 1562 kfree(dlm->name);
1545 kfree(dlm); 1563 kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1579 init_waitqueue_head(&dlm->reco.event); 1597 init_waitqueue_head(&dlm->reco.event);
1580 init_waitqueue_head(&dlm->ast_wq); 1598 init_waitqueue_head(&dlm->ast_wq);
1581 init_waitqueue_head(&dlm->migration_wq); 1599 init_waitqueue_head(&dlm->migration_wq);
1582 INIT_LIST_HEAD(&dlm->master_list);
1583 INIT_LIST_HEAD(&dlm->mle_hb_events); 1600 INIT_LIST_HEAD(&dlm->mle_hb_events);
1584 1601
1585 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; 1602 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1587 1604
1588 dlm->reco.new_master = O2NM_INVALID_NODE_NUM; 1605 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
1589 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; 1606 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
1590 atomic_set(&dlm->local_resources, 0); 1607
1591 atomic_set(&dlm->remote_resources, 0); 1608 atomic_set(&dlm->res_tot_count, 0);
1592 atomic_set(&dlm->unknown_resources, 0); 1609 atomic_set(&dlm->res_cur_count, 0);
1610 for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
1611 atomic_set(&dlm->mle_tot_count[i], 0);
1612 atomic_set(&dlm->mle_cur_count[i], 0);
1613 }
1593 1614
1594 spin_lock_init(&dlm->work_lock); 1615 spin_lock_init(&dlm->work_lock);
1595 INIT_LIST_HEAD(&dlm->work_list); 1616 INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
73 const char *name, 73 const char *name,
74 unsigned int namelen) 74 unsigned int namelen)
75{ 75{
76 struct dlm_lock_resource *res;
77
78 if (dlm != mle->dlm) 76 if (dlm != mle->dlm)
79 return 0; 77 return 0;
80 78
81 if (mle->type == DLM_MLE_BLOCK || 79 if (namelen != mle->mnamelen ||
82 mle->type == DLM_MLE_MIGRATION) { 80 memcmp(name, mle->mname, namelen) != 0)
83 if (namelen != mle->u.name.len || 81 return 0;
84 memcmp(name, mle->u.name.name, namelen)!=0) 82
85 return 0;
86 } else {
87 res = mle->u.res;
88 if (namelen != res->lockname.len ||
89 memcmp(res->lockname.name, name, namelen) != 0)
90 return 0;
91 }
92 return 1; 83 return 1;
93} 84}
94 85
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
283 274
284 mle->dlm = dlm; 275 mle->dlm = dlm;
285 mle->type = type; 276 mle->type = type;
286 INIT_LIST_HEAD(&mle->list); 277 INIT_HLIST_NODE(&mle->master_hash_node);
287 INIT_LIST_HEAD(&mle->hb_events); 278 INIT_LIST_HEAD(&mle->hb_events);
288 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 279 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
289 spin_lock_init(&mle->spinlock); 280 spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
295 mle->new_master = O2NM_MAX_NODES; 286 mle->new_master = O2NM_MAX_NODES;
296 mle->inuse = 0; 287 mle->inuse = 0;
297 288
289 BUG_ON(mle->type != DLM_MLE_BLOCK &&
290 mle->type != DLM_MLE_MASTER &&
291 mle->type != DLM_MLE_MIGRATION);
292
298 if (mle->type == DLM_MLE_MASTER) { 293 if (mle->type == DLM_MLE_MASTER) {
299 BUG_ON(!res); 294 BUG_ON(!res);
300 mle->u.res = res; 295 mle->mleres = res;
301 } else if (mle->type == DLM_MLE_BLOCK) { 296 memcpy(mle->mname, res->lockname.name, res->lockname.len);
302 BUG_ON(!name); 297 mle->mnamelen = res->lockname.len;
303 memcpy(mle->u.name.name, name, namelen); 298 mle->mnamehash = res->lockname.hash;
304 mle->u.name.len = namelen; 299 } else {
305 } else /* DLM_MLE_MIGRATION */ {
306 BUG_ON(!name); 300 BUG_ON(!name);
307 memcpy(mle->u.name.name, name, namelen); 301 mle->mleres = NULL;
308 mle->u.name.len = namelen; 302 memcpy(mle->mname, name, namelen);
303 mle->mnamelen = namelen;
304 mle->mnamehash = dlm_lockid_hash(name, namelen);
309 } 305 }
310 306
307 atomic_inc(&dlm->mle_tot_count[mle->type]);
308 atomic_inc(&dlm->mle_cur_count[mle->type]);
309
311 /* copy off the node_map and register hb callbacks on our copy */ 310 /* copy off the node_map and register hb callbacks on our copy */
312 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 311 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
313 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 312 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
318 __dlm_mle_attach_hb_events(dlm, mle); 317 __dlm_mle_attach_hb_events(dlm, mle);
319} 318}
320 319
320void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
321{
322 assert_spin_locked(&dlm->spinlock);
323 assert_spin_locked(&dlm->master_lock);
324
325 if (!hlist_unhashed(&mle->master_hash_node))
326 hlist_del_init(&mle->master_hash_node);
327}
328
329void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
330{
331 struct hlist_head *bucket;
332
333 assert_spin_locked(&dlm->master_lock);
334
335 bucket = dlm_master_hash(dlm, mle->mnamehash);
336 hlist_add_head(&mle->master_hash_node, bucket);
337}
321 338
322/* returns 1 if found, 0 if not */ 339/* returns 1 if found, 0 if not */
323static int dlm_find_mle(struct dlm_ctxt *dlm, 340static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
325 char *name, unsigned int namelen) 342 char *name, unsigned int namelen)
326{ 343{
327 struct dlm_master_list_entry *tmpmle; 344 struct dlm_master_list_entry *tmpmle;
345 struct hlist_head *bucket;
346 struct hlist_node *list;
347 unsigned int hash;
328 348
329 assert_spin_locked(&dlm->master_lock); 349 assert_spin_locked(&dlm->master_lock);
330 350
331 list_for_each_entry(tmpmle, &dlm->master_list, list) { 351 hash = dlm_lockid_hash(name, namelen);
352 bucket = dlm_master_hash(dlm, hash);
353 hlist_for_each(list, bucket) {
354 tmpmle = hlist_entry(list, struct dlm_master_list_entry,
355 master_hash_node);
332 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 356 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
333 continue; 357 continue;
334 dlm_get_mle(tmpmle); 358 dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
408 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 432 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
409 dlm = mle->dlm; 433 dlm = mle->dlm;
410 434
411 if (mle->type != DLM_MLE_MASTER) {
412 mlog(0, "calling mle_release for %.*s, type %d\n",
413 mle->u.name.len, mle->u.name.name, mle->type);
414 } else {
415 mlog(0, "calling mle_release for %.*s, type %d\n",
416 mle->u.res->lockname.len,
417 mle->u.res->lockname.name, mle->type);
418 }
419 assert_spin_locked(&dlm->spinlock); 435 assert_spin_locked(&dlm->spinlock);
420 assert_spin_locked(&dlm->master_lock); 436 assert_spin_locked(&dlm->master_lock);
421 437
438 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
439 mle->type);
440
422 /* remove from list if not already */ 441 /* remove from list if not already */
423 if (!list_empty(&mle->list)) 442 __dlm_unlink_mle(dlm, mle);
424 list_del_init(&mle->list);
425 443
426 /* detach the mle from the domain node up/down events */ 444 /* detach the mle from the domain node up/down events */
427 __dlm_mle_detach_hb_events(dlm, mle); 445 __dlm_mle_detach_hb_events(dlm, mle);
428 446
447 atomic_dec(&dlm->mle_cur_count[mle->type]);
448
429 /* NOTE: kfree under spinlock here. 449 /* NOTE: kfree under spinlock here.
430 * if this is bad, we can move this to a freelist. */ 450 * if this is bad, we can move this to a freelist. */
431 kmem_cache_free(dlm_mle_cache, mle); 451 kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
465 kmem_cache_destroy(dlm_lockres_cache); 485 kmem_cache_destroy(dlm_lockres_cache);
466} 486}
467 487
468static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
469 struct dlm_lock_resource *res,
470 u8 owner)
471{
472 assert_spin_locked(&res->spinlock);
473
474 mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
475
476 if (owner == dlm->node_num)
477 atomic_inc(&dlm->local_resources);
478 else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
479 atomic_inc(&dlm->unknown_resources);
480 else
481 atomic_inc(&dlm->remote_resources);
482
483 res->owner = owner;
484}
485
486void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
487 struct dlm_lock_resource *res, u8 owner)
488{
489 assert_spin_locked(&res->spinlock);
490
491 if (owner == res->owner)
492 return;
493
494 if (res->owner == dlm->node_num)
495 atomic_dec(&dlm->local_resources);
496 else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
497 atomic_dec(&dlm->unknown_resources);
498 else
499 atomic_dec(&dlm->remote_resources);
500
501 dlm_set_lockres_owner(dlm, res, owner);
502}
503
504
505static void dlm_lockres_release(struct kref *kref) 488static void dlm_lockres_release(struct kref *kref)
506{ 489{
507 struct dlm_lock_resource *res; 490 struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
527 } 510 }
528 spin_unlock(&dlm->track_lock); 511 spin_unlock(&dlm->track_lock);
529 512
513 atomic_dec(&dlm->res_cur_count);
514
530 dlm_put(dlm); 515 dlm_put(dlm);
531 516
532 if (!hlist_unhashed(&res->hash_node) || 517 if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
607 592
608 kref_init(&res->refs); 593 kref_init(&res->refs);
609 594
595 atomic_inc(&dlm->res_tot_count);
596 atomic_inc(&dlm->res_cur_count);
597
610 /* just for consistency */ 598 /* just for consistency */
611 spin_lock(&res->spinlock); 599 spin_lock(&res->spinlock);
612 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 600 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
843 alloc_mle = NULL; 831 alloc_mle = NULL;
844 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); 832 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
845 set_bit(dlm->node_num, mle->maybe_map); 833 set_bit(dlm->node_num, mle->maybe_map);
846 list_add(&mle->list, &dlm->master_list); 834 __dlm_insert_mle(dlm, mle);
847 835
848 /* still holding the dlm spinlock, check the recovery map 836 /* still holding the dlm spinlock, check the recovery map
849 * to see if there are any nodes that still need to be 837 * to see if there are any nodes that still need to be
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1270 res->lockname.len, 1258 res->lockname.len,
1271 res->lockname.name); 1259 res->lockname.name);
1272 mle->type = DLM_MLE_MASTER; 1260 mle->type = DLM_MLE_MASTER;
1273 mle->u.res = res; 1261 mle->mleres = res;
1274 } 1262 }
1275 } 1263 }
1276 } 1264 }
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
1315 1303
1316 BUG_ON(mle->type == DLM_MLE_MIGRATION); 1304 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1317 1305
1318 if (mle->type != DLM_MLE_MASTER) { 1306 request.namelen = (u8)mle->mnamelen;
1319 request.namelen = mle->u.name.len; 1307 memcpy(request.name, mle->mname, request.namelen);
1320 memcpy(request.name, mle->u.name.name, request.namelen);
1321 } else {
1322 request.namelen = mle->u.res->lockname.len;
1323 memcpy(request.name, mle->u.res->lockname.name,
1324 request.namelen);
1325 }
1326 1308
1327again: 1309again:
1328 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 1310 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
1575 // "add the block.\n"); 1557 // "add the block.\n");
1576 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 1558 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1577 set_bit(request->node_idx, mle->maybe_map); 1559 set_bit(request->node_idx, mle->maybe_map);
1578 list_add(&mle->list, &dlm->master_list); 1560 __dlm_insert_mle(dlm, mle);
1579 response = DLM_MASTER_RESP_NO; 1561 response = DLM_MASTER_RESP_NO;
1580 } else { 1562 } else {
1581 // mlog(0, "mle was found\n"); 1563 // mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
1967 assert->node_idx, rr, extra_ref, mle->inuse); 1949 assert->node_idx, rr, extra_ref, mle->inuse);
1968 dlm_print_one_mle(mle); 1950 dlm_print_one_mle(mle);
1969 } 1951 }
1970 list_del_init(&mle->list); 1952 __dlm_unlink_mle(dlm, mle);
1971 __dlm_mle_detach_hb_events(dlm, mle); 1953 __dlm_mle_detach_hb_events(dlm, mle);
1972 __dlm_put_mle(mle); 1954 __dlm_put_mle(mle);
1973 if (extra_ref) { 1955 if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3159 tmp->master = master; 3141 tmp->master = master;
3160 atomic_set(&tmp->woken, 1); 3142 atomic_set(&tmp->woken, 1);
3161 wake_up(&tmp->wq); 3143 wake_up(&tmp->wq);
3162 /* remove it from the list so that only one 3144 /* remove it so that only one mle will be found */
3163 * mle will be found */ 3145 __dlm_unlink_mle(dlm, tmp);
3164 list_del_init(&tmp->list);
3165 /* this was obviously WRONG. mle is uninited here. should be tmp. */
3166 __dlm_mle_detach_hb_events(dlm, tmp); 3146 __dlm_mle_detach_hb_events(dlm, tmp);
3167 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 3147 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3168 mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 3148 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3181 mle->master = master; 3161 mle->master = master;
3182 /* do this for consistency with other mle types */ 3162 /* do this for consistency with other mle types */
3183 set_bit(new_master, mle->maybe_map); 3163 set_bit(new_master, mle->maybe_map);
3184 list_add(&mle->list, &dlm->master_list); 3164 __dlm_insert_mle(dlm, mle);
3185 3165
3186 return ret; 3166 return ret;
3187} 3167}
3188 3168
3189 3169/*
3190void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3170 * Sets the owner of the lockres, associated to the mle, to UNKNOWN
3171 */
3172static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3173 struct dlm_master_list_entry *mle)
3191{ 3174{
3192 struct dlm_master_list_entry *mle, *next;
3193 struct dlm_lock_resource *res; 3175 struct dlm_lock_resource *res;
3194 unsigned int hash;
3195 3176
3196 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 3177 /* Find the lockres associated to the mle and set its owner to UNK */
3197top: 3178 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3198 assert_spin_locked(&dlm->spinlock); 3179 mle->mnamehash);
3180 if (res) {
3181 spin_unlock(&dlm->master_lock);
3199 3182
3200 /* clean the master list */ 3183 /* move lockres onto recovery list */
3201 spin_lock(&dlm->master_lock); 3184 spin_lock(&res->spinlock);
3202 list_for_each_entry_safe(mle, next, &dlm->master_list, list) { 3185 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3203 BUG_ON(mle->type != DLM_MLE_BLOCK && 3186 dlm_move_lockres_to_recovery_list(dlm, res);
3204 mle->type != DLM_MLE_MASTER && 3187 spin_unlock(&res->spinlock);
3205 mle->type != DLM_MLE_MIGRATION); 3188 dlm_lockres_put(res);
3206
3207 /* MASTER mles are initiated locally. the waiting
3208 * process will notice the node map change
3209 * shortly. let that happen as normal. */
3210 if (mle->type == DLM_MLE_MASTER)
3211 continue;
3212 3189
3190 /* about to get rid of mle, detach from heartbeat */
3191 __dlm_mle_detach_hb_events(dlm, mle);
3213 3192
3214 /* BLOCK mles are initiated by other nodes. 3193 /* dump the mle */
3215 * need to clean up if the dead node would have 3194 spin_lock(&dlm->master_lock);
3216 * been the master. */ 3195 __dlm_put_mle(mle);
3217 if (mle->type == DLM_MLE_BLOCK) { 3196 spin_unlock(&dlm->master_lock);
3218 int bit; 3197 }
3219 3198
3220 spin_lock(&mle->spinlock); 3199 return res;
3221 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 3200}
3222 if (bit != dead_node) {
3223 mlog(0, "mle found, but dead node %u would "
3224 "not have been master\n", dead_node);
3225 spin_unlock(&mle->spinlock);
3226 } else {
3227 /* must drop the refcount by one since the
3228 * assert_master will never arrive. this
3229 * may result in the mle being unlinked and
3230 * freed, but there may still be a process
3231 * waiting in the dlmlock path which is fine. */
3232 mlog(0, "node %u was expected master\n",
3233 dead_node);
3234 atomic_set(&mle->woken, 1);
3235 spin_unlock(&mle->spinlock);
3236 wake_up(&mle->wq);
3237 /* do not need events any longer, so detach
3238 * from heartbeat */
3239 __dlm_mle_detach_hb_events(dlm, mle);
3240 __dlm_put_mle(mle);
3241 }
3242 continue;
3243 }
3244 3201
3245 /* everything else is a MIGRATION mle */ 3202static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3246 3203 struct dlm_master_list_entry *mle)
3247 /* the rule for MIGRATION mles is that the master 3204{
3248 * becomes UNKNOWN if *either* the original or 3205 __dlm_mle_detach_hb_events(dlm, mle);
3249 * the new master dies. all UNKNOWN lockreses
3250 * are sent to whichever node becomes the recovery
3251 * master. the new master is responsible for
3252 * determining if there is still a master for
3253 * this lockres, or if he needs to take over
3254 * mastery. either way, this node should expect
3255 * another message to resolve this. */
3256 if (mle->master != dead_node &&
3257 mle->new_master != dead_node)
3258 continue;
3259 3206
3260 /* if we have reached this point, this mle needs to 3207 spin_lock(&mle->spinlock);
3261 * be removed from the list and freed. */ 3208 __dlm_unlink_mle(dlm, mle);
3209 atomic_set(&mle->woken, 1);
3210 spin_unlock(&mle->spinlock);
3262 3211
3263 /* remove from the list early. NOTE: unlinking 3212 wake_up(&mle->wq);
3264 * list_head while in list_for_each_safe */ 3213}
3265 __dlm_mle_detach_hb_events(dlm, mle); 3214
3266 spin_lock(&mle->spinlock); 3215static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3267 list_del_init(&mle->list); 3216 struct dlm_master_list_entry *mle, u8 dead_node)
3217{
3218 int bit;
3219
3220 BUG_ON(mle->type != DLM_MLE_BLOCK);
3221
3222 spin_lock(&mle->spinlock);
3223 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3224 if (bit != dead_node) {
3225 mlog(0, "mle found, but dead node %u would not have been "
3226 "master\n", dead_node);
3227 spin_unlock(&mle->spinlock);
3228 } else {
3229 /* Must drop the refcount by one since the assert_master will
3230 * never arrive. This may result in the mle being unlinked and
3231 * freed, but there may still be a process waiting in the
3232 * dlmlock path which is fine. */
3233 mlog(0, "node %u was expected master\n", dead_node);
3268 atomic_set(&mle->woken, 1); 3234 atomic_set(&mle->woken, 1);
3269 spin_unlock(&mle->spinlock); 3235 spin_unlock(&mle->spinlock);
3270 wake_up(&mle->wq); 3236 wake_up(&mle->wq);
3271 3237
3272 mlog(0, "%s: node %u died during migration from " 3238 /* Do not need events any longer, so detach from heartbeat */
3273 "%u to %u!\n", dlm->name, dead_node, 3239 __dlm_mle_detach_hb_events(dlm, mle);
3274 mle->master, mle->new_master); 3240 __dlm_put_mle(mle);
3275 /* if there is a lockres associated with this 3241 }
3276 * mle, find it and set its owner to UNKNOWN */ 3242}
3277 hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
3278 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
3279 mle->u.name.len, hash);
3280 if (res) {
3281 /* unfortunately if we hit this rare case, our
3282 * lock ordering is messed. we need to drop
3283 * the master lock so that we can take the
3284 * lockres lock, meaning that we will have to
3285 * restart from the head of list. */
3286 spin_unlock(&dlm->master_lock);
3287 3243
3288 /* move lockres onto recovery list */ 3244void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3289 spin_lock(&res->spinlock); 3245{
3290 dlm_set_lockres_owner(dlm, res, 3246 struct dlm_master_list_entry *mle;
3291 DLM_LOCK_RES_OWNER_UNKNOWN); 3247 struct dlm_lock_resource *res;
3292 dlm_move_lockres_to_recovery_list(dlm, res); 3248 struct hlist_head *bucket;
3293 spin_unlock(&res->spinlock); 3249 struct hlist_node *list;
3294 dlm_lockres_put(res); 3250 unsigned int i;
3295 3251
3296 /* about to get rid of mle, detach from heartbeat */ 3252 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
3297 __dlm_mle_detach_hb_events(dlm, mle); 3253top:
3254 assert_spin_locked(&dlm->spinlock);
3298 3255
3299 /* dump the mle */ 3256 /* clean the master list */
3300 spin_lock(&dlm->master_lock); 3257 spin_lock(&dlm->master_lock);
3301 __dlm_put_mle(mle); 3258 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3302 spin_unlock(&dlm->master_lock); 3259 bucket = dlm_master_hash(dlm, i);
3260 hlist_for_each(list, bucket) {
3261 mle = hlist_entry(list, struct dlm_master_list_entry,
3262 master_hash_node);
3263
3264 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3265 mle->type != DLM_MLE_MASTER &&
3266 mle->type != DLM_MLE_MIGRATION);
3267
3268 /* MASTER mles are initiated locally. The waiting
3269 * process will notice the node map change shortly.
3270 * Let that happen as normal. */
3271 if (mle->type == DLM_MLE_MASTER)
3272 continue;
3273
3274 /* BLOCK mles are initiated by other nodes. Need to
3275 * clean up if the dead node would have been the
3276 * master. */
3277 if (mle->type == DLM_MLE_BLOCK) {
3278 dlm_clean_block_mle(dlm, mle, dead_node);
3279 continue;
3280 }
3303 3281
3304 /* restart */ 3282 /* Everything else is a MIGRATION mle */
3305 goto top; 3283
3306 } 3284 /* The rule for MIGRATION mles is that the master
3285 * becomes UNKNOWN if *either* the original or the new
3286 * master dies. All UNKNOWN lockres' are sent to
3287 * whichever node becomes the recovery master. The new
3288 * master is responsible for determining if there is
3289 * still a master for this lockres, or if he needs to
3290 * take over mastery. Either way, this node should
3291 * expect another message to resolve this. */
3292
3293 if (mle->master != dead_node &&
3294 mle->new_master != dead_node)
3295 continue;
3296
3297 /* If we have reached this point, this mle needs to be
3298 * removed from the list and freed. */
3299 dlm_clean_migration_mle(dlm, mle);
3300
3301 mlog(0, "%s: node %u died during migration from "
3302 "%u to %u!\n", dlm->name, dead_node, mle->master,
3303 mle->new_master);
3304
3305 /* If we find a lockres associated with the mle, we've
3306 * hit this rare case that messes up our lock ordering.
3307 * If so, we need to drop the master lock so that we can
3308 * take the lockres lock, meaning that we will have to
3309 * restart from the head of list. */
3310 res = dlm_reset_mleres_owner(dlm, mle);
3311 if (res)
3312 /* restart */
3313 goto top;
3307 3314
3308 /* this may be the last reference */ 3315 /* This may be the last reference */
3309 __dlm_put_mle(mle); 3316 __dlm_put_mle(mle);
3317 }
3310 } 3318 }
3311 spin_unlock(&dlm->master_lock); 3319 spin_unlock(&dlm->master_lock);
3312} 3320}
3313 3321
3314
3315int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 3322int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3316 u8 old_master) 3323 u8 old_master)
3317{ 3324{
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
162 162
163 spin_lock(&res->spinlock); 163 spin_lock(&res->spinlock);
164 if (!__dlm_lockres_unused(res)) { 164 if (!__dlm_lockres_unused(res)) {
165 spin_unlock(&res->spinlock);
166 mlog(0, "%s:%.*s: tried to purge but not unused\n", 165 mlog(0, "%s:%.*s: tried to purge but not unused\n",
167 dlm->name, res->lockname.len, res->lockname.name); 166 dlm->name, res->lockname.len, res->lockname.name);
168 return -ENOTEMPTY; 167 __dlm_print_one_lock_resource(res);
168 spin_unlock(&res->spinlock);
169 BUG();
169 } 170 }
171
172 if (res->state & DLM_LOCK_RES_MIGRATING) {
173 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
174 "being remastered\n", dlm->name, res->lockname.len,
175 res->lockname.name);
176 /* Re-add the lockres to the end of the purge list */
177 if (!list_empty(&res->purge)) {
178 list_del_init(&res->purge);
179 list_add_tail(&res->purge, &dlm->purge_list);
180 }
181 spin_unlock(&res->spinlock);
182 return 0;
183 }
184
170 master = (res->owner == dlm->node_num); 185 master = (res->owner == dlm->node_num);
186
171 if (!master) 187 if (!master)
172 res->state |= DLM_LOCK_RES_DROPPING_REF; 188 res->state |= DLM_LOCK_RES_DROPPING_REF;
173 spin_unlock(&res->spinlock); 189 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
244 .flags = 0, 244 .flags = 0,
245}; 245};
246 246
247static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
248 .flags = 0,
249};
250
247static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { 251static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
248 .get_osb = ocfs2_get_dentry_osb, 252 .get_osb = ocfs2_get_dentry_osb,
249 .post_unlock = ocfs2_dentry_post_unlock, 253 .post_unlock = ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
622 &ocfs2_rename_lops, osb); 626 &ocfs2_rename_lops, osb);
623} 627}
624 628
629static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
630 struct ocfs2_super *osb)
631{
632 /* nfs_sync lockres doesn't come from a slab so we call init
633 * once on it manually. */
634 ocfs2_lock_res_init_once(res);
635 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
636 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
637 &ocfs2_nfs_sync_lops, osb);
638}
639
625void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 640void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
626 struct ocfs2_file_private *fp) 641 struct ocfs2_file_private *fp)
627{ 642{
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
2417 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); 2432 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2418} 2433}
2419 2434
2435int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
2436{
2437 int status;
2438 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2439
2440 if (ocfs2_is_hard_readonly(osb))
2441 return -EROFS;
2442
2443 if (ocfs2_mount_local(osb))
2444 return 0;
2445
2446 status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
2447 0, 0);
2448 if (status < 0)
2449 mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
2450
2451 return status;
2452}
2453
2454void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
2455{
2456 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2457
2458 if (!ocfs2_mount_local(osb))
2459 ocfs2_cluster_unlock(osb, lockres,
2460 ex ? LKM_EXMODE : LKM_PRMODE);
2461}
2462
2420int ocfs2_dentry_lock(struct dentry *dentry, int ex) 2463int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2421{ 2464{
2422 int ret; 2465 int ret;
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2798local: 2841local:
2799 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2842 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2800 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2843 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2844 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
2801 2845
2802 osb->cconn = conn; 2846 osb->cconn = conn;
2803 2847
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2833 2877
2834 ocfs2_lock_res_free(&osb->osb_super_lockres); 2878 ocfs2_lock_res_free(&osb->osb_super_lockres);
2835 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2879 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2880 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
2836 2881
2837 ocfs2_cluster_disconnect(osb->cconn, hangup_pending); 2882 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2838 osb->cconn = NULL; 2883 osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
3015{ 3060{
3016 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); 3061 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
3017 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); 3062 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
3063 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
3018} 3064}
3019 3065
3020int ocfs2_drop_inode_locks(struct inode *inode) 3066int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
115 int ex); 115 int ex);
116int ocfs2_rename_lock(struct ocfs2_super *osb); 116int ocfs2_rename_lock(struct ocfs2_super *osb);
117void ocfs2_rename_unlock(struct ocfs2_super *osb); 117void ocfs2_rename_unlock(struct ocfs2_super *osb);
118int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
119void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
118int ocfs2_dentry_lock(struct dentry *dentry, int ex); 120int ocfs2_dentry_lock(struct dentry *dentry, int ex);
119void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 121void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
120int ocfs2_file_lock(struct file *file, int ex, int trylock); 122int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..de3da8eb558c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
31 31
32#include "ocfs2.h" 32#include "ocfs2.h"
33 33
34#include "alloc.h"
34#include "dir.h" 35#include "dir.h"
35#include "dlmglue.h" 36#include "dlmglue.h"
36#include "dcache.h" 37#include "dcache.h"
@@ -38,6 +39,7 @@
38#include "inode.h" 39#include "inode.h"
39 40
40#include "buffer_head_io.h" 41#include "buffer_head_io.h"
42#include "suballoc.h"
41 43
42struct ocfs2_inode_handle 44struct ocfs2_inode_handle
43{ 45{
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
49 struct ocfs2_inode_handle *handle) 51 struct ocfs2_inode_handle *handle)
50{ 52{
51 struct inode *inode; 53 struct inode *inode;
54 struct ocfs2_super *osb = OCFS2_SB(sb);
55 u64 blkno = handle->ih_blkno;
56 int status, set;
52 struct dentry *result; 57 struct dentry *result;
53 58
54 mlog_entry("(0x%p, 0x%p)\n", sb, handle); 59 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
55 60
56 if (handle->ih_blkno == 0) { 61 if (blkno == 0) {
57 mlog_errno(-ESTALE); 62 mlog(0, "nfs wants inode with blkno: 0\n");
58 return ERR_PTR(-ESTALE); 63 result = ERR_PTR(-ESTALE);
64 goto bail;
65 }
66
67 inode = ocfs2_ilookup(sb, blkno);
68 /*
69 * If the inode exists in memory, we only need to check it's
70 * generation number
71 */
72 if (inode)
73 goto check_gen;
74
75 /*
76 * This will synchronize us against ocfs2_delete_inode() on
77 * all nodes
78 */
79 status = ocfs2_nfs_sync_lock(osb, 1);
80 if (status < 0) {
81 mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
82 goto check_err;
83 }
84
85 status = ocfs2_test_inode_bit(osb, blkno, &set);
86 if (status < 0) {
87 if (status == -EINVAL) {
88 /*
89 * The blkno NFS gave us doesn't even show up
90 * as an inode, we return -ESTALE to be
91 * nice
92 */
93 mlog(0, "test inode bit failed %d\n", status);
94 status = -ESTALE;
95 } else {
96 mlog(ML_ERROR, "test inode bit failed %d\n", status);
97 }
98 goto unlock_nfs_sync;
99 }
100
101 /* If the inode allocator bit is clear, this inode must be stale */
102 if (!set) {
103 mlog(0, "inode %llu suballoc bit is clear\n", blkno);
104 status = -ESTALE;
105 goto unlock_nfs_sync;
59 } 106 }
60 107
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); 108 inode = ocfs2_iget(osb, blkno, 0, 0);
62 109
63 if (IS_ERR(inode)) 110unlock_nfs_sync:
64 return (void *)inode; 111 ocfs2_nfs_sync_unlock(osb, 1);
65 112
113check_err:
114 if (status < 0) {
115 if (status == -ESTALE) {
116 mlog(0, "stale inode ino: %llu generation: %u\n",
117 blkno, handle->ih_generation);
118 }
119 result = ERR_PTR(status);
120 goto bail;
121 }
122
123 if (IS_ERR(inode)) {
124 mlog_errno(PTR_ERR(inode));
125 result = (void *)inode;
126 goto bail;
127 }
128
129check_gen:
66 if (handle->ih_generation != inode->i_generation) { 130 if (handle->ih_generation != inode->i_generation) {
67 iput(inode); 131 iput(inode);
68 return ERR_PTR(-ESTALE); 132 mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
133 handle->ih_generation);
134 result = ERR_PTR(-ESTALE);
135 goto bail;
69 } 136 }
70 137
71 result = d_obtain_alias(inode); 138 result = d_obtain_alias(inode);
72 if (!IS_ERR(result)) 139 if (!IS_ERR(result))
73 result->d_op = &ocfs2_dentry_ops; 140 result->d_op = &ocfs2_dentry_ops;
141 else
142 mlog_errno(PTR_ERR(result));
74 143
144bail:
75 mlog_exit_ptr(result); 145 mlog_exit_ptr(result);
76 return result; 146 return result;
77} 147}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "alloc.h" 40#include "alloc.h"
41#include "dir.h"
41#include "blockcheck.h" 42#include "blockcheck.h"
42#include "dlmglue.h" 43#include "dlmglue.h"
43#include "extent_map.h" 44#include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
112 oi->ip_attr |= OCFS2_DIRSYNC_FL; 113 oi->ip_attr |= OCFS2_DIRSYNC_FL;
113} 114}
114 115
116struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
117{
118 struct ocfs2_find_inode_args args;
119
120 args.fi_blkno = blkno;
121 args.fi_flags = 0;
122 args.fi_ino = ino_from_blkno(sb, blkno);
123 args.fi_sysfile_type = 0;
124
125 return ilookup5(sb, blkno, ocfs2_find_actor, &args);
126}
115struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
116 int sysfile_type) 128 int sysfile_type)
117{ 129{
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
275 (unsigned long long)OCFS2_I(inode)->ip_blkno, 287 (unsigned long long)OCFS2_I(inode)->ip_blkno,
276 (unsigned long long)le64_to_cpu(fe->i_blkno)); 288 (unsigned long long)le64_to_cpu(fe->i_blkno));
277 289
278 inode->i_nlink = le16_to_cpu(fe->i_links_count); 290 inode->i_nlink = ocfs2_read_links_count(fe);
279 291
280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 292 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 293 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
351 363
352 ocfs2_set_inode_flags(inode); 364 ocfs2_set_inode_flags(inode);
353 365
366 OCFS2_I(inode)->ip_last_used_slot = 0;
367 OCFS2_I(inode)->ip_last_used_group = 0;
354 mlog_exit_void(); 368 mlog_exit_void();
355} 369}
356 370
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
606 } 620 }
607 621
608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 622 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb)); 623 ocfs2_quota_trans_credits(inode->i_sb));
610 if (IS_ERR(handle)) { 624 if (IS_ERR(handle)) {
611 status = PTR_ERR(handle); 625 status = PTR_ERR(handle);
612 mlog_errno(status); 626 mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
740 goto bail_unlock_dir; 754 goto bail_unlock_dir;
741 } 755 }
742 756
757 /* Remove any dir index tree */
758 if (S_ISDIR(inode->i_mode)) {
759 status = ocfs2_dx_dir_truncate(inode, di_bh);
760 if (status) {
761 mlog_errno(status);
762 goto bail_unlock_dir;
763 }
764 }
765
743 /*Free extended attribute resources associated with this inode.*/ 766 /*Free extended attribute resources associated with this inode.*/
744 status = ocfs2_xattr_remove(inode, di_bh); 767 status = ocfs2_xattr_remove(inode, di_bh);
745 if (status < 0) { 768 if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
949 goto bail; 972 goto bail;
950 } 973 }
951 974
975 /*
976 * Synchronize us against ocfs2_get_dentry. We take this in
977 * shared mode so that all nodes can still concurrently
978 * process deletes.
979 */
980 status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
981 if (status < 0) {
982 mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
983 ocfs2_cleanup_delete_inode(inode, 0);
984 goto bail_unblock;
985 }
952 /* Lock down the inode. This gives us an up to date view of 986 /* Lock down the inode. This gives us an up to date view of
953 * it's metadata (for verification), and allows us to 987 * it's metadata (for verification), and allows us to
954 * serialize delete_inode on multiple nodes. 988 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
962 if (status != -ENOENT) 996 if (status != -ENOENT)
963 mlog_errno(status); 997 mlog_errno(status);
964 ocfs2_cleanup_delete_inode(inode, 0); 998 ocfs2_cleanup_delete_inode(inode, 0);
965 goto bail_unblock; 999 goto bail_unlock_nfs_sync;
966 } 1000 }
967 1001
968 /* Query the cluster. This will be the final decision made 1002 /* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
1005bail_unlock_inode: 1039bail_unlock_inode:
1006 ocfs2_inode_unlock(inode, 1); 1040 ocfs2_inode_unlock(inode, 1);
1007 brelse(di_bh); 1041 brelse(di_bh);
1042
1043bail_unlock_nfs_sync:
1044 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1045
1008bail_unblock: 1046bail_unblock:
1009 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1047 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
1010 if (status < 0) 1048 if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1205 spin_unlock(&OCFS2_I(inode)->ip_lock); 1243 spin_unlock(&OCFS2_I(inode)->ip_lock);
1206 1244
1207 fe->i_size = cpu_to_le64(i_size_read(inode)); 1245 fe->i_size = cpu_to_le64(i_size_read(inode));
1208 fe->i_links_count = cpu_to_le16(inode->i_nlink); 1246 ocfs2_set_links_count(fe, inode->i_nlink);
1209 fe->i_uid = cpu_to_le32(inode->i_uid); 1247 fe->i_uid = cpu_to_le32(inode->i_uid);
1210 fe->i_gid = cpu_to_le32(inode->i_gid); 1248 fe->i_gid = cpu_to_le32(inode->i_gid);
1211 fe->i_mode = cpu_to_le16(inode->i_mode); 1249 fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1242 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1280 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1243 ocfs2_set_inode_flags(inode); 1281 ocfs2_set_inode_flags(inode);
1244 i_size_write(inode, le64_to_cpu(fe->i_size)); 1282 i_size_write(inode, le64_to_cpu(fe->i_size));
1245 inode->i_nlink = le16_to_cpu(fe->i_links_count); 1283 inode->i_nlink = ocfs2_read_links_count(fe);
1246 inode->i_uid = le32_to_cpu(fe->i_uid); 1284 inode->i_uid = le32_to_cpu(fe->i_uid);
1247 inode->i_gid = le32_to_cpu(fe->i_gid); 1285 inode->i_gid = le32_to_cpu(fe->i_gid);
1248 inode->i_mode = le16_to_cpu(fe->i_mode); 1286 inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
72 72
73 struct inode vfs_inode; 73 struct inode vfs_inode;
74 struct jbd2_inode ip_jinode; 74 struct jbd2_inode ip_jinode;
75
76 /* Only valid if the inode is the dir. */
77 u32 ip_last_used_slot;
78 u64 ip_last_used_group;
75}; 79};
76 80
77/* 81/*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
124/* Flags for ocfs2_iget() */ 128/* Flags for ocfs2_iget() */
125#define OCFS2_FI_FLAG_SYSFILE 0x1 129#define OCFS2_FI_FLAG_SYSFILE 0x1
126#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 130#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
131struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, 132struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 133 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 134int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
65static int ocfs2_recover_orphans(struct ocfs2_super *osb, 65static int ocfs2_recover_orphans(struct ocfs2_super *osb,
66 int slot); 66 int slot);
67static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
68static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
69 int slot_num,
70 struct ocfs2_dinode *la_dinode,
71 struct ocfs2_dinode *tl_dinode,
72 struct ocfs2_quota_recovery *qrec);
68 73
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) 74static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{ 75{
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
76 return __ocfs2_wait_on_mount(osb, 1); 81 return __ocfs2_wait_on_mount(osb, 1);
77} 82}
78 83
79
80
81/* 84/*
82 * The recovery_list is a simple linked list of node numbers to recover. 85 * This replay_map is to track online/offline slots, so we could recover
83 * It is protected by the recovery_lock. 86 * offline slots during recovery and mount
84 */ 87 */
85 88
86struct ocfs2_recovery_map { 89enum ocfs2_replay_state {
87 unsigned int rm_used; 90 REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */
88 unsigned int *rm_entries; 91 REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */
92 REPLAY_DONE /* Replay was already queued */
89}; 93};
90 94
95struct ocfs2_replay_map {
96 unsigned int rm_slots;
97 enum ocfs2_replay_state rm_state;
98 unsigned char rm_replay_slots[0];
99};
100
101void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
102{
103 if (!osb->replay_map)
104 return;
105
106 /* If we've already queued the replay, we don't have any more to do */
107 if (osb->replay_map->rm_state == REPLAY_DONE)
108 return;
109
110 osb->replay_map->rm_state = state;
111}
112
113int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
114{
115 struct ocfs2_replay_map *replay_map;
116 int i, node_num;
117
118 /* If replay map is already set, we don't do it again */
119 if (osb->replay_map)
120 return 0;
121
122 replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
123 (osb->max_slots * sizeof(char)), GFP_KERNEL);
124
125 if (!replay_map) {
126 mlog_errno(-ENOMEM);
127 return -ENOMEM;
128 }
129
130 spin_lock(&osb->osb_lock);
131
132 replay_map->rm_slots = osb->max_slots;
133 replay_map->rm_state = REPLAY_UNNEEDED;
134
135 /* set rm_replay_slots for offline slot(s) */
136 for (i = 0; i < replay_map->rm_slots; i++) {
137 if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
138 replay_map->rm_replay_slots[i] = 1;
139 }
140
141 osb->replay_map = replay_map;
142 spin_unlock(&osb->osb_lock);
143 return 0;
144}
145
146void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
147{
148 struct ocfs2_replay_map *replay_map = osb->replay_map;
149 int i;
150
151 if (!replay_map)
152 return;
153
154 if (replay_map->rm_state != REPLAY_NEEDED)
155 return;
156
157 for (i = 0; i < replay_map->rm_slots; i++)
158 if (replay_map->rm_replay_slots[i])
159 ocfs2_queue_recovery_completion(osb->journal, i, NULL,
160 NULL, NULL);
161 replay_map->rm_state = REPLAY_DONE;
162}
163
164void ocfs2_free_replay_slots(struct ocfs2_super *osb)
165{
166 struct ocfs2_replay_map *replay_map = osb->replay_map;
167
168 if (!osb->replay_map)
169 return;
170
171 kfree(replay_map);
172 osb->replay_map = NULL;
173}
174
91int ocfs2_recovery_init(struct ocfs2_super *osb) 175int ocfs2_recovery_init(struct ocfs2_super *osb)
92{ 176{
93 struct ocfs2_recovery_map *rm; 177 struct ocfs2_recovery_map *rm;
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
496 }, 580 },
497}; 581};
498 582
583static struct ocfs2_triggers dr_triggers = {
584 .ot_triggers = {
585 .t_commit = ocfs2_commit_trigger,
586 .t_abort = ocfs2_abort_trigger,
587 },
588 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
589};
590
591static struct ocfs2_triggers dl_triggers = {
592 .ot_triggers = {
593 .t_commit = ocfs2_commit_trigger,
594 .t_abort = ocfs2_abort_trigger,
595 },
596 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
597};
598
499static int __ocfs2_journal_access(handle_t *handle, 599static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode, 600 struct inode *inode,
501 struct buffer_head *bh, 601 struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
600 type); 700 type);
601} 701}
602 702
703int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
704 struct buffer_head *bh, int type)
705{
706 return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
707 type);
708}
709
710int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
711 struct buffer_head *bh, int type)
712{
713 return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
714 type);
715}
716
603int ocfs2_journal_access(handle_t *handle, struct inode *inode, 717int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type) 718 struct buffer_head *bh, int type)
605{ 719{
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1176} 1290}
1177 1291
1178/* Called by the mount code to queue recovery the last part of 1292/* Called by the mount code to queue recovery the last part of
1179 * recovery for it's own slot. */ 1293 * recovery for it's own and offline slot(s). */
1180void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) 1294void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1181{ 1295{
1182 struct ocfs2_journal *journal = osb->journal; 1296 struct ocfs2_journal *journal = osb->journal;
1183 1297
1184 if (osb->dirty) { 1298 /* No need to queue up our truncate_log as regular cleanup will catch
1185 /* No need to queue up our truncate_log as regular 1299 * that */
1186 * cleanup will catch that. */ 1300 ocfs2_queue_recovery_completion(journal, osb->slot_num,
1187 ocfs2_queue_recovery_completion(journal, 1301 osb->local_alloc_copy, NULL, NULL);
1188 osb->slot_num, 1302 ocfs2_schedule_truncate_log_flush(osb, 0);
1189 osb->local_alloc_copy,
1190 NULL,
1191 NULL);
1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1193 1303
1194 osb->local_alloc_copy = NULL; 1304 osb->local_alloc_copy = NULL;
1195 osb->dirty = 0; 1305 osb->dirty = 0;
1196 } 1306
1307 /* queue to recover orphan slots for all offline slots */
1308 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1309 ocfs2_queue_replay_slots(osb);
1310 ocfs2_free_replay_slots(osb);
1197} 1311}
1198 1312
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) 1313void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
1236 goto bail; 1350 goto bail;
1237 } 1351 }
1238 1352
1353 status = ocfs2_compute_replay_slots(osb);
1354 if (status < 0)
1355 mlog_errno(status);
1356
1357 /* queue recovery for our own slot */
1358 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1359 NULL, NULL);
1360
1239 spin_lock(&osb->osb_lock); 1361 spin_lock(&osb->osb_lock);
1240 while (rm->rm_used) { 1362 while (rm->rm_used) {
1241 /* It's always safe to remove entry zero, as we won't 1363 /* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
1301 1423
1302 ocfs2_super_unlock(osb, 1); 1424 ocfs2_super_unlock(osb, 1);
1303 1425
1304 /* We always run recovery on our own orphan dir - the dead 1426 /* queue recovery for offline slots */
1305 * node(s) may have disallowd a previos inode delete. Re-processing 1427 ocfs2_queue_replay_slots(osb);
1306 * is therefore required. */
1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1308 NULL, NULL);
1309 1428
1310bail: 1429bail:
1311 mutex_lock(&osb->recovery_lock); 1430 mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
1314 goto restart; 1433 goto restart;
1315 } 1434 }
1316 1435
1436 ocfs2_free_replay_slots(osb);
1317 osb->recovery_thread_task = NULL; 1437 osb->recovery_thread_task = NULL;
1318 mb(); /* sync with ocfs2_recovery_thread_running */ 1438 mb(); /* sync with ocfs2_recovery_thread_running */
1319 wake_up(&osb->recovery_event); 1439 wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1465 goto done; 1585 goto done;
1466 } 1586 }
1467 1587
1588 /* we need to run complete recovery for offline orphan slots */
1589 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1590
1468 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1591 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1469 node_num, slot_num, 1592 node_num, slot_num,
1470 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1593 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..619dd7f6c053 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
38struct ocfs2_super; 38struct ocfs2_super;
39struct ocfs2_dinode; 39struct ocfs2_dinode;
40 40
41/*
42 * The recovery_list is a simple linked list of node numbers to recover.
43 * It is protected by the recovery_lock.
44 */
45
46struct ocfs2_recovery_map {
47 unsigned int rm_used;
48 unsigned int *rm_entries;
49};
50
51
41struct ocfs2_journal { 52struct ocfs2_journal {
42 enum ocfs2_journal_state j_state; /* Journals current state */ 53 enum ocfs2_journal_state j_state; /* Journals current state */
43 54
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
139int ocfs2_recovery_init(struct ocfs2_super *osb); 150int ocfs2_recovery_init(struct ocfs2_super *osb);
140void ocfs2_recovery_exit(struct ocfs2_super *osb); 151void ocfs2_recovery_exit(struct ocfs2_super *osb);
141 152
153int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
142/* 154/*
143 * Journal Control: 155 * Journal Control:
144 * Initialize, Load, Shutdown, Wipe a journal. 156 * Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
266/* dirblock */ 278/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 279int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type); 280 struct buffer_head *bh, int type);
281/* ocfs2_dx_root_block */
282int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
283 struct buffer_head *bh, int type);
284/* ocfs2_dx_leaf */
285int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
286 struct buffer_head *bh, int type);
269/* Anything that has no ecc */ 287/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode, 288int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type); 289 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
368} 386}
369 387
370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 388/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
371 * bitmap block for the new bit) */ 389 * bitmap block for the new bit) dx_root update for free list */
372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 390#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
391
392static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
393{
394 /* 1 block for index, 2 allocs (data, metadata), 1 clusters
395 * worth of blocks for initial extent. */
396 return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
397 ocfs2_clusters_to_blocks(sb, 1);
398}
373 399
374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 400/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
375 * group descriptor + mkdir/symlink blocks + quota update */ 401 * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
376static inline int ocfs2_mknod_credits(struct super_block *sb) 402 * blocks + quota update */
403static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
404 int xattr_credits)
377{ 405{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + 406 int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
407
408 if (is_dir)
409 dir_credits += ocfs2_add_dir_index_credits(sb);
410
411 return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
379 ocfs2_quota_trans_credits(sb); 412 ocfs2_quota_trans_credits(sb);
380} 413}
381 414
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 421#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
389 422
390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 423/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
391 * update on dir */ 424 * update on dir + index leaf + dx root update for free list */
392static inline int ocfs2_link_credits(struct super_block *sb) 425static inline int ocfs2_link_credits(struct super_block *sb)
393{ 426{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + 427 return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
395 ocfs2_quota_trans_credits(sb); 428 ocfs2_quota_trans_credits(sb);
396} 429}
397 430
398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 431/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
399 * dir inode link */ 432 * dir inode link + dir inode index leaf + dir index root */
400static inline int ocfs2_unlink_credits(struct super_block *sb) 433static inline int ocfs2_unlink_credits(struct super_block *sb)
401{ 434{
402 /* The quota update from ocfs2_link_credits is unused here... */ 435 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); 436 return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
404} 437}
405 438
406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 439/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
407 * inode alloc group descriptor */ 440 * inode alloc group descriptor + orphan dir index leaf */
408#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) 441#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
409 442
410/* dinode update, old dir dinode update, new dir dinode update, old 443/* dinode update, old dir dinode update, new dir dinode update, old
411 * dir dir entry, new dir dir entry, dir entry update for renaming 444 * dir dir entry, new dir dir entry, dir entry update for renaming
412 * directory + target unlink */ 445 * directory + target unlink + 3 x dir index leaves */
413static inline int ocfs2_rename_credits(struct super_block *sb) 446static inline int ocfs2_rename_credits(struct super_block *sb)
414{ 447{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); 448 return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
416} 449}
417 450
418/* global bitmap dinode, group desc., relinked group, 451/* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
422 + OCFS2_INODE_UPDATE_CREDITS \ 455 + OCFS2_INODE_UPDATE_CREDITS \
423 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) 456 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
424 457
458/* inode update, removal of dx root block from allocator */
459#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
460 OCFS2_SUBALLOC_FREE)
461
462static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
463{
464 int credits = 1 + OCFS2_SUBALLOC_ALLOC;
465
466 credits += ocfs2_clusters_to_blocks(sb, 1);
467 credits += ocfs2_quota_trans_credits(sb);
468
469 return credits;
470}
471
425/* 472/*
426 * Please note that the caller must make sure that root_el is the root 473 * Please note that the caller must make sure that root_el is the root
427 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise 474 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
457 504
458static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 505static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
459{ 506{
460 int blocks = ocfs2_mknod_credits(sb); 507 int blocks = ocfs2_mknod_credits(sb, 0, 0);
461 508
462 /* links can be longer than one block so we may update many 509 /* links can be longer than one block so we may update many
463 * within our single allocated extent. */ 510 * within our single allocated extent. */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/debugfs.h>
32 31
33#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 75 struct inode *local_alloc_inode);
77 76
78#ifdef CONFIG_OCFS2_FS_STATS
79
80static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
81{
82 file->private_data = inode->i_private;
83 return 0;
84}
85
86#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
87#define LA_DEBUG_VER 1
88static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
89 size_t count, loff_t *ppos)
90{
91 static DEFINE_MUTEX(la_debug_mutex);
92 struct ocfs2_super *osb = file->private_data;
93 int written, ret;
94 char *buf = osb->local_alloc_debug_buf;
95
96 mutex_lock(&la_debug_mutex);
97 memset(buf, 0, LA_DEBUG_BUF_SZ);
98
99 written = snprintf(buf, LA_DEBUG_BUF_SZ,
100 "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
101 LA_DEBUG_VER,
102 (unsigned long long)osb->la_last_gd,
103 osb->local_alloc_default_bits,
104 osb->local_alloc_bits, osb->local_alloc_state);
105
106 ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
107
108 mutex_unlock(&la_debug_mutex);
109 return ret;
110}
111
112static const struct file_operations ocfs2_la_debug_fops = {
113 .open = ocfs2_la_debug_open,
114 .read = ocfs2_la_debug_read,
115};
116
117static void ocfs2_init_la_debug(struct ocfs2_super *osb)
118{
119 osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
120 if (!osb->local_alloc_debug_buf)
121 return;
122
123 osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
124 S_IFREG|S_IRUSR,
125 osb->osb_debug_root,
126 osb,
127 &ocfs2_la_debug_fops);
128 if (!osb->local_alloc_debug) {
129 kfree(osb->local_alloc_debug_buf);
130 osb->local_alloc_debug_buf = NULL;
131 }
132}
133
134static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
135{
136 if (osb->local_alloc_debug)
137 debugfs_remove(osb->local_alloc_debug);
138
139 if (osb->local_alloc_debug_buf)
140 kfree(osb->local_alloc_debug_buf);
141
142 osb->local_alloc_debug_buf = NULL;
143 osb->local_alloc_debug = NULL;
144}
145#else /* CONFIG_OCFS2_FS_STATS */
146static void ocfs2_init_la_debug(struct ocfs2_super *osb)
147{
148 return;
149}
150static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
151{
152 return;
153}
154#endif
155
156static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
157{ 78{
158 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
226 147
227 mlog_entry_void(); 148 mlog_entry_void();
228 149
229 ocfs2_init_la_debug(osb);
230
231 if (osb->local_alloc_bits == 0) 150 if (osb->local_alloc_bits == 0)
232 goto bail; 151 goto bail;
233 152
@@ -299,9 +218,6 @@ bail:
299 if (inode) 218 if (inode)
300 iput(inode); 219 iput(inode);
301 220
302 if (status < 0)
303 ocfs2_shutdown_la_debug(osb);
304
305 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); 221 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
306 222
307 mlog_exit(status); 223 mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
331 cancel_delayed_work(&osb->la_enable_wq); 247 cancel_delayed_work(&osb->la_enable_wq);
332 flush_workqueue(ocfs2_wq); 248 flush_workqueue(ocfs2_wq);
333 249
334 ocfs2_shutdown_la_debug(osb);
335
336 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 250 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
337 goto out; 251 goto out;
338 252
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713ea..b606496b72ec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
154 return ret; 154 return ret;
155} 155}
156 156
157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) 157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158{ 158{
159 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 160 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 161 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 162 sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 197 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
197 if (ret2 < 0) 198 if (ret2 < 0)
198 mlog_errno(ret2); 199 mlog_errno(ret2);
199 200 if (ret)
201 ret = VM_FAULT_SIGBUS;
200 return ret; 202 return ret;
201} 203}
202 204
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4b11762f249e..2220f93f668b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
80 struct inode **ret_orphan_dir, 80 struct inode **ret_orphan_dir,
81 struct inode *inode, 81 struct inode *inode,
82 char *name, 82 char *name,
83 struct buffer_head **de_bh); 83 struct ocfs2_dir_lookup_result *lookup);
84 84
85static int ocfs2_orphan_add(struct ocfs2_super *osb, 85static int ocfs2_orphan_add(struct ocfs2_super *osb,
86 handle_t *handle, 86 handle_t *handle,
87 struct inode *inode, 87 struct inode *inode,
88 struct ocfs2_dinode *fe, 88 struct ocfs2_dinode *fe,
89 char *name, 89 char *name,
90 struct buffer_head *de_bh, 90 struct ocfs2_dir_lookup_result *lookup,
91 struct inode *orphan_dir_inode); 91 struct inode *orphan_dir_inode);
92 92
93static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 93static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
228 struct ocfs2_super *osb; 228 struct ocfs2_super *osb;
229 struct ocfs2_dinode *dirfe; 229 struct ocfs2_dinode *dirfe;
230 struct buffer_head *new_fe_bh = NULL; 230 struct buffer_head *new_fe_bh = NULL;
231 struct buffer_head *de_bh = NULL;
232 struct inode *inode = NULL; 231 struct inode *inode = NULL;
233 struct ocfs2_alloc_context *inode_ac = NULL; 232 struct ocfs2_alloc_context *inode_ac = NULL;
234 struct ocfs2_alloc_context *data_ac = NULL; 233 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL; 234 struct ocfs2_alloc_context *meta_ac = NULL;
236 int want_clusters = 0; 235 int want_clusters = 0;
236 int want_meta = 0;
237 int xattr_credits = 0; 237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = { 238 struct ocfs2_security_xattr_info si = {
239 .enable = 1, 239 .enable = 1,
240 }; 240 };
241 int did_quota_inode = 0; 241 int did_quota_inode = 0;
242 struct ocfs2_dir_lookup_result lookup = { NULL, };
242 243
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 244 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 245 (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
254 return status; 255 return status;
255 } 256 }
256 257
257 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { 258 if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
258 status = -EMLINK; 259 status = -EMLINK;
259 goto leave; 260 goto leave;
260 } 261 }
261 262
262 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 263 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
263 if (!dirfe->i_links_count) { 264 if (!ocfs2_read_links_count(dirfe)) {
264 /* can't make a file in a deleted directory. */ 265 /* can't make a file in a deleted directory. */
265 status = -ENOENT; 266 status = -ENOENT;
266 goto leave; 267 goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
274 /* get a spot inside the dir. */ 275 /* get a spot inside the dir. */
275 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 276 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
276 dentry->d_name.name, 277 dentry->d_name.name,
277 dentry->d_name.len, &de_bh); 278 dentry->d_name.len, &lookup);
278 if (status < 0) { 279 if (status < 0) {
279 mlog_errno(status); 280 mlog_errno(status);
280 goto leave; 281 goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
308 309
309 /* calculate meta data/clusters for setting security and acl xattr */ 310 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, 311 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters, 312 &si, &want_clusters,
312 &xattr_credits, &xattr_ac); 313 &xattr_credits, &want_meta);
313 if (status < 0) { 314 if (status < 0) {
314 mlog_errno(status); 315 mlog_errno(status);
315 goto leave; 316 goto leave;
316 } 317 }
317 318
318 /* Reserve a cluster if creating an extent based directory. */ 319 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) 320 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
320 want_clusters += 1; 321 want_clusters += 1;
321 322
323 /* Dir indexing requires extra space as well */
324 if (ocfs2_supports_indexed_dirs(osb))
325 want_meta++;
326 }
327
328 status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
329 if (status < 0) {
330 if (status != -ENOSPC)
331 mlog_errno(status);
332 goto leave;
333 }
334
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); 335 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) { 336 if (status < 0) {
324 if (status != -ENOSPC) 337 if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
326 goto leave; 339 goto leave;
327 } 340 }
328 341
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + 342 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
330 xattr_credits); 343 S_ISDIR(mode),
344 xattr_credits));
331 if (IS_ERR(handle)) { 345 if (IS_ERR(handle)) {
332 status = PTR_ERR(handle); 346 status = PTR_ERR(handle);
333 handle = NULL; 347 handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
355 369
356 if (S_ISDIR(mode)) { 370 if (S_ISDIR(mode)) {
357 status = ocfs2_fill_new_dir(osb, handle, dir, inode, 371 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
358 new_fe_bh, data_ac); 372 new_fe_bh, data_ac, meta_ac);
359 if (status < 0) { 373 if (status < 0) {
360 mlog_errno(status); 374 mlog_errno(status);
361 goto leave; 375 goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
367 mlog_errno(status); 381 mlog_errno(status);
368 goto leave; 382 goto leave;
369 } 383 }
370 le16_add_cpu(&dirfe->i_links_count, 1); 384 ocfs2_add_links_count(dirfe, 1);
371 status = ocfs2_journal_dirty(handle, parent_fe_bh); 385 status = ocfs2_journal_dirty(handle, parent_fe_bh);
372 if (status < 0) { 386 if (status < 0) {
373 mlog_errno(status); 387 mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
377 } 391 }
378 392
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, 393 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac); 394 meta_ac, data_ac);
381 if (status < 0) { 395 if (status < 0) {
382 mlog_errno(status); 396 mlog_errno(status);
383 goto leave; 397 goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
385 399
386 if (si.enable) { 400 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, 401 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac); 402 meta_ac, data_ac);
389 if (status < 0) { 403 if (status < 0) {
390 mlog_errno(status); 404 mlog_errno(status);
391 goto leave; 405 goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
394 408
395 status = ocfs2_add_entry(handle, dentry, inode, 409 status = ocfs2_add_entry(handle, dentry, inode,
396 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 410 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
397 de_bh); 411 &lookup);
398 if (status < 0) { 412 if (status < 0) {
399 mlog_errno(status); 413 mlog_errno(status);
400 goto leave; 414 goto leave;
@@ -423,11 +437,12 @@ leave:
423 mlog(0, "Disk is full\n"); 437 mlog(0, "Disk is full\n");
424 438
425 brelse(new_fe_bh); 439 brelse(new_fe_bh);
426 brelse(de_bh);
427 brelse(parent_fe_bh); 440 brelse(parent_fe_bh);
428 kfree(si.name); 441 kfree(si.name);
429 kfree(si.value); 442 kfree(si.value);
430 443
444 ocfs2_free_dir_lookup_result(&lookup);
445
431 if ((status < 0) && inode) { 446 if ((status < 0) && inode) {
432 clear_nlink(inode); 447 clear_nlink(inode);
433 iput(inode); 448 iput(inode);
@@ -439,8 +454,8 @@ leave:
439 if (data_ac) 454 if (data_ac)
440 ocfs2_free_alloc_context(data_ac); 455 ocfs2_free_alloc_context(data_ac);
441 456
442 if (xattr_ac) 457 if (meta_ac)
443 ocfs2_free_alloc_context(xattr_ac); 458 ocfs2_free_alloc_context(meta_ac);
444 459
445 mlog_exit(status); 460 mlog_exit(status);
446 461
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
462 struct ocfs2_extent_list *fel; 477 struct ocfs2_extent_list *fel;
463 u64 fe_blkno = 0; 478 u64 fe_blkno = 0;
464 u16 suballoc_bit; 479 u16 suballoc_bit;
480 u16 feat;
465 481
466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 482 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
467 inode->i_mode, (unsigned long)dev, dentry->d_name.len, 483 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
469 485
470 *new_fe_bh = NULL; 486 *new_fe_bh = NULL;
471 487
472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 488 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
473 &fe_blkno); 489 inode_ac, &suballoc_bit, &fe_blkno);
474 if (status < 0) { 490 if (status < 0) {
475 mlog_errno(status); 491 mlog_errno(status);
476 goto leave; 492 goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
513 fe->i_mode = cpu_to_le16(inode->i_mode); 529 fe->i_mode = cpu_to_le16(inode->i_mode);
514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 530 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 531 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
516 fe->i_links_count = cpu_to_le16(inode->i_nlink); 532
533 ocfs2_set_links_count(fe, inode->i_nlink);
517 534
518 fe->i_last_eb_blk = 0; 535 fe->i_last_eb_blk = 0;
519 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 536 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
525 fe->i_dtime = 0; 542 fe->i_dtime = 0;
526 543
527 /* 544 /*
528 * If supported, directories start with inline data. 545 * If supported, directories start with inline data. If inline
546 * isn't supported, but indexing is, we start them as indexed.
529 */ 547 */
548 feat = le16_to_cpu(fe->i_dyn_features);
530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { 549 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
531 u16 feat = le16_to_cpu(fe->i_dyn_features);
532
533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 550 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
534 551
535 fe->id2.i_data.id_count = cpu_to_le16( 552 fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
608 int err; 625 int err;
609 struct buffer_head *fe_bh = NULL; 626 struct buffer_head *fe_bh = NULL;
610 struct buffer_head *parent_fe_bh = NULL; 627 struct buffer_head *parent_fe_bh = NULL;
611 struct buffer_head *de_bh = NULL;
612 struct ocfs2_dinode *fe = NULL; 628 struct ocfs2_dinode *fe = NULL;
613 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 629 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
630 struct ocfs2_dir_lookup_result lookup = { NULL, };
614 631
615 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 632 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
616 old_dentry->d_name.len, old_dentry->d_name.name, 633 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
638 655
639 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 656 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
640 dentry->d_name.name, 657 dentry->d_name.name,
641 dentry->d_name.len, &de_bh); 658 dentry->d_name.len, &lookup);
642 if (err < 0) { 659 if (err < 0) {
643 mlog_errno(err); 660 mlog_errno(err);
644 goto out; 661 goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
652 } 669 }
653 670
654 fe = (struct ocfs2_dinode *) fe_bh->b_data; 671 fe = (struct ocfs2_dinode *) fe_bh->b_data;
655 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { 672 if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
656 err = -EMLINK; 673 err = -EMLINK;
657 goto out_unlock_inode; 674 goto out_unlock_inode;
658 } 675 }
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
674 691
675 inc_nlink(inode); 692 inc_nlink(inode);
676 inode->i_ctime = CURRENT_TIME; 693 inode->i_ctime = CURRENT_TIME;
677 fe->i_links_count = cpu_to_le16(inode->i_nlink); 694 ocfs2_set_links_count(fe, inode->i_nlink);
678 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
679 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
680 697
681 err = ocfs2_journal_dirty(handle, fe_bh); 698 err = ocfs2_journal_dirty(handle, fe_bh);
682 if (err < 0) { 699 if (err < 0) {
683 le16_add_cpu(&fe->i_links_count, -1); 700 ocfs2_add_links_count(fe, -1);
684 drop_nlink(inode); 701 drop_nlink(inode);
685 mlog_errno(err); 702 mlog_errno(err);
686 goto out_commit; 703 goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
688 705
689 err = ocfs2_add_entry(handle, dentry, inode, 706 err = ocfs2_add_entry(handle, dentry, inode,
690 OCFS2_I(inode)->ip_blkno, 707 OCFS2_I(inode)->ip_blkno,
691 parent_fe_bh, de_bh); 708 parent_fe_bh, &lookup);
692 if (err) { 709 if (err) {
693 le16_add_cpu(&fe->i_links_count, -1); 710 ocfs2_add_links_count(fe, -1);
694 drop_nlink(inode); 711 drop_nlink(inode);
695 mlog_errno(err); 712 mlog_errno(err);
696 goto out_commit; 713 goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
714out: 731out:
715 ocfs2_inode_unlock(dir, 1); 732 ocfs2_inode_unlock(dir, 1);
716 733
717 brelse(de_bh);
718 brelse(fe_bh); 734 brelse(fe_bh);
719 brelse(parent_fe_bh); 735 brelse(parent_fe_bh);
720 736
737 ocfs2_free_dir_lookup_result(&lookup);
738
721 mlog_exit(err); 739 mlog_exit(err);
722 740
723 return err; 741 return err;
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
766 struct buffer_head *fe_bh = NULL; 784 struct buffer_head *fe_bh = NULL;
767 struct buffer_head *parent_node_bh = NULL; 785 struct buffer_head *parent_node_bh = NULL;
768 handle_t *handle = NULL; 786 handle_t *handle = NULL;
769 struct ocfs2_dir_entry *dirent = NULL;
770 struct buffer_head *dirent_bh = NULL;
771 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 787 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
772 struct buffer_head *orphan_entry_bh = NULL; 788 struct ocfs2_dir_lookup_result lookup = { NULL, };
789 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
773 790
774 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
775 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
791 } 808 }
792 809
793 status = ocfs2_find_files_on_disk(dentry->d_name.name, 810 status = ocfs2_find_files_on_disk(dentry->d_name.name,
794 dentry->d_name.len, &blkno, 811 dentry->d_name.len, &blkno, dir,
795 dir, &dirent_bh, &dirent); 812 &lookup);
796 if (status < 0) { 813 if (status < 0) {
797 if (status != -ENOENT) 814 if (status != -ENOENT)
798 mlog_errno(status); 815 mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
817 child_locked = 1; 834 child_locked = 1;
818 835
819 if (S_ISDIR(inode->i_mode)) { 836 if (S_ISDIR(inode->i_mode)) {
820 if (!ocfs2_empty_dir(inode)) { 837 if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
821 status = -ENOTEMPTY;
822 goto leave;
823 } else if (inode->i_nlink != 2) {
824 status = -ENOTEMPTY; 838 status = -ENOTEMPTY;
825 goto leave; 839 goto leave;
826 } 840 }
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
836 850
837 if (inode_is_unlinkable(inode)) { 851 if (inode_is_unlinkable(inode)) {
838 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 852 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
839 orphan_name, 853 orphan_name, &orphan_insert);
840 &orphan_entry_bh);
841 if (status < 0) { 854 if (status < 0) {
842 mlog_errno(status); 855 mlog_errno(status);
843 goto leave; 856 goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
863 876
864 if (inode_is_unlinkable(inode)) { 877 if (inode_is_unlinkable(inode)) {
865 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 878 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
866 orphan_entry_bh, orphan_dir); 879 &orphan_insert, orphan_dir);
867 if (status < 0) { 880 if (status < 0) {
868 mlog_errno(status); 881 mlog_errno(status);
869 goto leave; 882 goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
871 } 884 }
872 885
873 /* delete the name from the parent dir */ 886 /* delete the name from the parent dir */
874 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); 887 status = ocfs2_delete_entry(handle, dir, &lookup);
875 if (status < 0) { 888 if (status < 0) {
876 mlog_errno(status); 889 mlog_errno(status);
877 goto leave; 890 goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
880 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
881 drop_nlink(inode); 894 drop_nlink(inode);
882 drop_nlink(inode); 895 drop_nlink(inode);
883 fe->i_links_count = cpu_to_le16(inode->i_nlink); 896 ocfs2_set_links_count(fe, inode->i_nlink);
884 897
885 status = ocfs2_journal_dirty(handle, fe_bh); 898 status = ocfs2_journal_dirty(handle, fe_bh);
886 if (status < 0) { 899 if (status < 0) {
@@ -916,9 +929,10 @@ leave:
916 } 929 }
917 930
918 brelse(fe_bh); 931 brelse(fe_bh);
919 brelse(dirent_bh);
920 brelse(parent_node_bh); 932 brelse(parent_node_bh);
921 brelse(orphan_entry_bh); 933
934 ocfs2_free_dir_lookup_result(&orphan_insert);
935 ocfs2_free_dir_lookup_result(&lookup);
922 936
923 mlog_exit(status); 937 mlog_exit(status);
924 938
@@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
1004 struct inode *new_dir, 1018 struct inode *new_dir,
1005 struct dentry *new_dentry) 1019 struct dentry *new_dentry)
1006{ 1020{
1007 int status = 0, rename_lock = 0, parents_locked = 0; 1021 int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
1008 int old_child_locked = 0, new_child_locked = 0; 1022 int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
1009 struct inode *old_inode = old_dentry->d_inode; 1023 struct inode *old_inode = old_dentry->d_inode;
1010 struct inode *new_inode = new_dentry->d_inode; 1024 struct inode *new_inode = new_dentry->d_inode;
1011 struct inode *orphan_dir = NULL; 1025 struct inode *orphan_dir = NULL;
@@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
1020 handle_t *handle = NULL; 1034 handle_t *handle = NULL;
1021 struct buffer_head *old_dir_bh = NULL; 1035 struct buffer_head *old_dir_bh = NULL;
1022 struct buffer_head *new_dir_bh = NULL; 1036 struct buffer_head *new_dir_bh = NULL;
1023 struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
1024 *new_de = NULL;
1025 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1026 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1027 // this is the 1st dirent bh
1028 nlink_t old_dir_nlink = old_dir->i_nlink; 1037 nlink_t old_dir_nlink = old_dir->i_nlink;
1029 struct ocfs2_dinode *old_di; 1038 struct ocfs2_dinode *old_di;
1039 struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
1040 struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
1041 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
1042 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
1043 struct ocfs2_dir_lookup_result target_insert = { NULL, };
1030 1044
1031 /* At some point it might be nice to break this function up a 1045 /* At some point it might be nice to break this function up a
1032 * bit. */ 1046 * bit. */
@@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
1108 if (S_ISDIR(old_inode->i_mode)) { 1122 if (S_ISDIR(old_inode->i_mode)) {
1109 u64 old_inode_parent; 1123 u64 old_inode_parent;
1110 1124
1125 update_dot_dot = 1;
1111 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, 1126 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
1112 old_inode, &old_inode_de_bh, 1127 old_inode,
1113 &old_inode_dot_dot_de); 1128 &old_inode_dot_dot_res);
1114 if (status) { 1129 if (status) {
1115 status = -EIO; 1130 status = -EIO;
1116 goto bail; 1131 goto bail;
@@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
1122 } 1137 }
1123 1138
1124 if (!new_inode && new_dir != old_dir && 1139 if (!new_inode && new_dir != old_dir &&
1125 new_dir->i_nlink >= OCFS2_LINK_MAX) { 1140 new_dir->i_nlink >= ocfs2_link_max(osb)) {
1126 status = -EMLINK; 1141 status = -EMLINK;
1127 goto bail; 1142 goto bail;
1128 } 1143 }
@@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
1151 * to delete it */ 1166 * to delete it */
1152 status = ocfs2_find_files_on_disk(new_dentry->d_name.name, 1167 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1153 new_dentry->d_name.len, 1168 new_dentry->d_name.len,
1154 &newfe_blkno, new_dir, &new_de_bh, 1169 &newfe_blkno, new_dir,
1155 &new_de); 1170 &target_lookup_res);
1156 /* The only error we allow here is -ENOENT because the new 1171 /* The only error we allow here is -ENOENT because the new
1157 * file not existing is perfectly valid. */ 1172 * file not existing is perfectly valid. */
1158 if ((status < 0) && (status != -ENOENT)) { 1173 if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
1161 mlog_errno(status); 1176 mlog_errno(status);
1162 goto bail; 1177 goto bail;
1163 } 1178 }
1179 if (status == 0)
1180 target_exists = 1;
1164 1181
1165 if (!new_de && new_inode) { 1182 if (!target_exists && new_inode) {
1166 /* 1183 /*
1167 * Target was unlinked by another node while we were 1184 * Target was unlinked by another node while we were
1168 * waiting to get to ocfs2_rename(). There isn't 1185 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
1175 1192
1176 /* In case we need to overwrite an existing file, we blow it 1193 /* In case we need to overwrite an existing file, we blow it
1177 * away first */ 1194 * away first */
1178 if (new_de) { 1195 if (target_exists) {
1179 /* VFS didn't think there existed an inode here, but 1196 /* VFS didn't think there existed an inode here, but
1180 * someone else in the cluster must have raced our 1197 * someone else in the cluster must have raced our
1181 * rename to create one. Today we error cleanly, in 1198 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
1216 1233
1217 newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 1234 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1218 1235
1219 mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu " 1236 mlog(0, "aha rename over existing... new_blkno=%llu "
1220 "newfebh=%p bhblocknr=%llu\n", new_de, 1237 "newfebh=%p bhblocknr=%llu\n",
1221 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 1238 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
1222 (unsigned long long)newfe_bh->b_blocknr : 0ULL); 1239 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1223 1240
@@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
1225 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1242 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1226 new_inode, 1243 new_inode,
1227 orphan_name, 1244 orphan_name,
1228 &orphan_entry_bh); 1245 &orphan_insert);
1229 if (status < 0) { 1246 if (status < 0) {
1230 mlog_errno(status); 1247 mlog_errno(status);
1231 goto bail; 1248 goto bail;
@@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
1243 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, 1260 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1244 new_dentry->d_name.name, 1261 new_dentry->d_name.name,
1245 new_dentry->d_name.len, 1262 new_dentry->d_name.len,
1246 &insert_entry_bh); 1263 &target_insert);
1247 if (status < 0) { 1264 if (status < 0) {
1248 mlog_errno(status); 1265 mlog_errno(status);
1249 goto bail; 1266 goto bail;
@@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
1258 goto bail; 1275 goto bail;
1259 } 1276 }
1260 1277
1261 if (new_de) { 1278 if (target_exists) {
1262 if (S_ISDIR(new_inode->i_mode)) { 1279 if (S_ISDIR(new_inode->i_mode)) {
1263 if (!ocfs2_empty_dir(new_inode) || 1280 if (new_inode->i_nlink != 2 ||
1264 new_inode->i_nlink != 2) { 1281 !ocfs2_empty_dir(new_inode)) {
1265 status = -ENOTEMPTY; 1282 status = -ENOTEMPTY;
1266 goto bail; 1283 goto bail;
1267 } 1284 }
@@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
1274 } 1291 }
1275 1292
1276 if (S_ISDIR(new_inode->i_mode) || 1293 if (S_ISDIR(new_inode->i_mode) ||
1277 (newfe->i_links_count == cpu_to_le16(1))){ 1294 (ocfs2_read_links_count(newfe) == 1)) {
1278 status = ocfs2_orphan_add(osb, handle, new_inode, 1295 status = ocfs2_orphan_add(osb, handle, new_inode,
1279 newfe, orphan_name, 1296 newfe, orphan_name,
1280 orphan_entry_bh, orphan_dir); 1297 &orphan_insert, orphan_dir);
1281 if (status < 0) { 1298 if (status < 0) {
1282 mlog_errno(status); 1299 mlog_errno(status);
1283 goto bail; 1300 goto bail;
@@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
1285 } 1302 }
1286 1303
1287 /* change the dirent to point to the correct inode */ 1304 /* change the dirent to point to the correct inode */
1288 status = ocfs2_update_entry(new_dir, handle, new_de_bh, 1305 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
1289 new_de, old_inode); 1306 old_inode);
1290 if (status < 0) { 1307 if (status < 0) {
1291 mlog_errno(status); 1308 mlog_errno(status);
1292 goto bail; 1309 goto bail;
@@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
1294 new_dir->i_version++; 1311 new_dir->i_version++;
1295 1312
1296 if (S_ISDIR(new_inode->i_mode)) 1313 if (S_ISDIR(new_inode->i_mode))
1297 newfe->i_links_count = 0; 1314 ocfs2_set_links_count(newfe, 0);
1298 else 1315 else
1299 le16_add_cpu(&newfe->i_links_count, -1); 1316 ocfs2_add_links_count(newfe, -1);
1300 1317
1301 status = ocfs2_journal_dirty(handle, newfe_bh); 1318 status = ocfs2_journal_dirty(handle, newfe_bh);
1302 if (status < 0) { 1319 if (status < 0) {
@@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
1307 /* if the name was not found in new_dir, add it now */ 1324 /* if the name was not found in new_dir, add it now */
1308 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1325 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1309 OCFS2_I(old_inode)->ip_blkno, 1326 OCFS2_I(old_inode)->ip_blkno,
1310 new_dir_bh, insert_entry_bh); 1327 new_dir_bh, &target_insert);
1311 } 1328 }
1312 1329
1313 old_inode->i_ctime = CURRENT_TIME; 1330 old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
1334 * because the insert might have changed the type of directory 1351 * because the insert might have changed the type of directory
1335 * we're dealing with. 1352 * we're dealing with.
1336 */ 1353 */
1337 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, 1354 status = ocfs2_find_entry(old_dentry->d_name.name,
1338 old_dentry->d_name.len, 1355 old_dentry->d_name.len, old_dir,
1339 old_dir, &old_de); 1356 &old_entry_lookup);
1340 if (!old_de_bh) { 1357 if (status)
1341 status = -EIO;
1342 goto bail; 1358 goto bail;
1343 }
1344 1359
1345 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); 1360 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
1346 if (status < 0) { 1361 if (status < 0) {
1347 mlog_errno(status); 1362 mlog_errno(status);
1348 goto bail; 1363 goto bail;
@@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
1353 new_inode->i_ctime = CURRENT_TIME; 1368 new_inode->i_ctime = CURRENT_TIME;
1354 } 1369 }
1355 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1370 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1356 if (old_inode_de_bh) { 1371
1357 status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh, 1372 if (update_dot_dot) {
1358 old_inode_dot_dot_de, new_dir); 1373 status = ocfs2_update_entry(old_inode, handle,
1374 &old_inode_dot_dot_res, new_dir);
1359 old_dir->i_nlink--; 1375 old_dir->i_nlink--;
1360 if (new_inode) { 1376 if (new_inode) {
1361 new_inode->i_nlink--; 1377 new_inode->i_nlink--;
@@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
1391 } else { 1407 } else {
1392 struct ocfs2_dinode *fe; 1408 struct ocfs2_dinode *fe;
1393 status = ocfs2_journal_access_di(handle, old_dir, 1409 status = ocfs2_journal_access_di(handle, old_dir,
1394 old_dir_bh, 1410 old_dir_bh,
1395 OCFS2_JOURNAL_ACCESS_WRITE); 1411 OCFS2_JOURNAL_ACCESS_WRITE);
1396 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1412 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1397 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1398 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 status = ocfs2_journal_dirty(handle, old_dir_bh);
1399 } 1415 }
1400 } 1416 }
1401
1402 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1403 status = 0; 1418 status = 0;
1404bail: 1419bail:
@@ -1429,13 +1444,17 @@ bail:
1429 1444
1430 if (new_inode) 1445 if (new_inode)
1431 iput(new_inode); 1446 iput(new_inode);
1447
1448 ocfs2_free_dir_lookup_result(&target_lookup_res);
1449 ocfs2_free_dir_lookup_result(&old_entry_lookup);
1450 ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
1451 ocfs2_free_dir_lookup_result(&orphan_insert);
1452 ocfs2_free_dir_lookup_result(&target_insert);
1453
1432 brelse(newfe_bh); 1454 brelse(newfe_bh);
1433 brelse(old_inode_bh); 1455 brelse(old_inode_bh);
1434 brelse(old_dir_bh); 1456 brelse(old_dir_bh);
1435 brelse(new_dir_bh); 1457 brelse(new_dir_bh);
1436 brelse(new_de_bh);
1437 brelse(old_de_bh);
1438 brelse(old_inode_de_bh);
1439 brelse(orphan_entry_bh); 1458 brelse(orphan_entry_bh);
1440 brelse(insert_entry_bh); 1459 brelse(insert_entry_bh);
1441 1460
@@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
1558 struct inode *inode = NULL; 1577 struct inode *inode = NULL;
1559 struct super_block *sb; 1578 struct super_block *sb;
1560 struct buffer_head *new_fe_bh = NULL; 1579 struct buffer_head *new_fe_bh = NULL;
1561 struct buffer_head *de_bh = NULL;
1562 struct buffer_head *parent_fe_bh = NULL; 1580 struct buffer_head *parent_fe_bh = NULL;
1563 struct ocfs2_dinode *fe = NULL; 1581 struct ocfs2_dinode *fe = NULL;
1564 struct ocfs2_dinode *dirfe; 1582 struct ocfs2_dinode *dirfe;
@@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
1572 .enable = 1, 1590 .enable = 1,
1573 }; 1591 };
1574 int did_quota = 0, did_quota_inode = 0; 1592 int did_quota = 0, did_quota_inode = 0;
1593 struct ocfs2_dir_lookup_result lookup = { NULL, };
1575 1594
1576 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1595 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1577 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1596 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
1592 } 1611 }
1593 1612
1594 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1613 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1595 if (!dirfe->i_links_count) { 1614 if (!ocfs2_read_links_count(dirfe)) {
1596 /* can't make a file in a deleted directory. */ 1615 /* can't make a file in a deleted directory. */
1597 status = -ENOENT; 1616 status = -ENOENT;
1598 goto bail; 1617 goto bail;
@@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
1605 1624
1606 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1625 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1607 dentry->d_name.name, 1626 dentry->d_name.name,
1608 dentry->d_name.len, &de_bh); 1627 dentry->d_name.len, &lookup);
1609 if (status < 0) { 1628 if (status < 0) {
1610 mlog_errno(status); 1629 mlog_errno(status);
1611 goto bail; 1630 goto bail;
@@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
1744 1763
1745 status = ocfs2_add_entry(handle, dentry, inode, 1764 status = ocfs2_add_entry(handle, dentry, inode,
1746 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1765 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1747 de_bh); 1766 &lookup);
1748 if (status < 0) { 1767 if (status < 0) {
1749 mlog_errno(status); 1768 mlog_errno(status);
1750 goto bail; 1769 goto bail;
@@ -1772,9 +1791,9 @@ bail:
1772 1791
1773 brelse(new_fe_bh); 1792 brelse(new_fe_bh);
1774 brelse(parent_fe_bh); 1793 brelse(parent_fe_bh);
1775 brelse(de_bh);
1776 kfree(si.name); 1794 kfree(si.name);
1777 kfree(si.value); 1795 kfree(si.value);
1796 ocfs2_free_dir_lookup_result(&lookup);
1778 if (inode_ac) 1797 if (inode_ac)
1779 ocfs2_free_alloc_context(inode_ac); 1798 ocfs2_free_alloc_context(inode_ac);
1780 if (data_ac) 1799 if (data_ac)
@@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1826 struct inode **ret_orphan_dir, 1845 struct inode **ret_orphan_dir,
1827 struct inode *inode, 1846 struct inode *inode,
1828 char *name, 1847 char *name,
1829 struct buffer_head **de_bh) 1848 struct ocfs2_dir_lookup_result *lookup)
1830{ 1849{
1831 struct inode *orphan_dir_inode; 1850 struct inode *orphan_dir_inode;
1832 struct buffer_head *orphan_dir_bh = NULL; 1851 struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1857 1876
1858 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1877 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1859 orphan_dir_bh, name, 1878 orphan_dir_bh, name,
1860 OCFS2_ORPHAN_NAMELEN, de_bh); 1879 OCFS2_ORPHAN_NAMELEN, lookup);
1861 if (status < 0) { 1880 if (status < 0) {
1862 ocfs2_inode_unlock(orphan_dir_inode, 1); 1881 ocfs2_inode_unlock(orphan_dir_inode, 1);
1863 1882
@@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1884 struct inode *inode, 1903 struct inode *inode,
1885 struct ocfs2_dinode *fe, 1904 struct ocfs2_dinode *fe,
1886 char *name, 1905 char *name,
1887 struct buffer_head *de_bh, 1906 struct ocfs2_dir_lookup_result *lookup,
1888 struct inode *orphan_dir_inode) 1907 struct inode *orphan_dir_inode)
1889{ 1908{
1890 struct buffer_head *orphan_dir_bh = NULL; 1909 struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 * underneath us... */ 1929 * underneath us... */
1911 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 1930 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1912 if (S_ISDIR(inode->i_mode)) 1931 if (S_ISDIR(inode->i_mode))
1913 le16_add_cpu(&orphan_fe->i_links_count, 1); 1932 ocfs2_add_links_count(orphan_fe, 1);
1914 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 1933 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1915 1934
1916 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1935 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1917 if (status < 0) { 1936 if (status < 0) {
@@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1922 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1941 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1923 OCFS2_ORPHAN_NAMELEN, inode, 1942 OCFS2_ORPHAN_NAMELEN, inode,
1924 OCFS2_I(inode)->ip_blkno, 1943 OCFS2_I(inode)->ip_blkno,
1925 orphan_dir_bh, de_bh); 1944 orphan_dir_bh, lookup);
1926 if (status < 0) { 1945 if (status < 0) {
1927 mlog_errno(status); 1946 mlog_errno(status);
1928 goto leave; 1947 goto leave;
@@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1955 char name[OCFS2_ORPHAN_NAMELEN + 1]; 1974 char name[OCFS2_ORPHAN_NAMELEN + 1];
1956 struct ocfs2_dinode *orphan_fe; 1975 struct ocfs2_dinode *orphan_fe;
1957 int status = 0; 1976 int status = 0;
1958 struct buffer_head *target_de_bh = NULL; 1977 struct ocfs2_dir_lookup_result lookup = { NULL, };
1959 struct ocfs2_dir_entry *target_de = NULL;
1960 1978
1961 mlog_entry_void(); 1979 mlog_entry_void();
1962 1980
@@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1971 OCFS2_ORPHAN_NAMELEN); 1989 OCFS2_ORPHAN_NAMELEN);
1972 1990
1973 /* find its spot in the orphan directory */ 1991 /* find its spot in the orphan directory */
1974 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 1992 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
1975 orphan_dir_inode, &target_de); 1993 &lookup);
1976 if (!target_de_bh) { 1994 if (status) {
1977 status = -ENOENT;
1978 mlog_errno(status); 1995 mlog_errno(status);
1979 goto leave; 1996 goto leave;
1980 } 1997 }
1981 1998
1982 /* remove it from the orphan directory */ 1999 /* remove it from the orphan directory */
1983 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 2000 status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
1984 target_de_bh);
1985 if (status < 0) { 2001 if (status < 0) {
1986 mlog_errno(status); 2002 mlog_errno(status);
1987 goto leave; 2003 goto leave;
@@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1997 /* do the i_nlink dance! :) */ 2013 /* do the i_nlink dance! :) */
1998 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2014 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1999 if (S_ISDIR(inode->i_mode)) 2015 if (S_ISDIR(inode->i_mode))
2000 le16_add_cpu(&orphan_fe->i_links_count, -1); 2016 ocfs2_add_links_count(orphan_fe, -1);
2001 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2017 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2002 2018
2003 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2019 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2004 if (status < 0) { 2020 if (status < 0) {
@@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2007 } 2023 }
2008 2024
2009leave: 2025leave:
2010 brelse(target_de_bh); 2026 ocfs2_free_dir_lookup_result(&lookup);
2011 2027
2012 mlog_exit(status); 2028 mlog_exit(status);
2013 return status; 2029 return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
209struct ocfs2_journal; 209struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_replay_map;
212struct ocfs2_quota_recovery; 213struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock; 214struct ocfs2_dentry_lock;
214struct ocfs2_super 215struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
264 atomic_t vol_state; 265 atomic_t vol_state;
265 struct mutex recovery_lock; 266 struct mutex recovery_lock;
266 struct ocfs2_recovery_map *recovery_map; 267 struct ocfs2_recovery_map *recovery_map;
268 struct ocfs2_replay_map *replay_map;
267 struct task_struct *recovery_thread_task; 269 struct task_struct *recovery_thread_task;
268 int disable_recovery; 270 int disable_recovery;
269 wait_queue_head_t checkpoint_event; 271 wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
287 289
288 u64 la_last_gd; 290 u64 la_last_gd;
289 291
290#ifdef CONFIG_OCFS2_FS_STATS
291 struct dentry *local_alloc_debug;
292 char *local_alloc_debug_buf;
293#endif
294
295 /* Next three fields are for local node slot recovery during 292 /* Next three fields are for local node slot recovery during
296 * mount. */ 293 * mount. */
297 int dirty; 294 int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
305 struct ocfs2_cluster_connection *cconn; 302 struct ocfs2_cluster_connection *cconn;
306 struct ocfs2_lock_res osb_super_lockres; 303 struct ocfs2_lock_res osb_super_lockres;
307 struct ocfs2_lock_res osb_rename_lockres; 304 struct ocfs2_lock_res osb_rename_lockres;
305 struct ocfs2_lock_res osb_nfs_sync_lockres;
308 struct ocfs2_dlm_debug *osb_dlm_debug; 306 struct ocfs2_dlm_debug *osb_dlm_debug;
309 307
310 struct dentry *osb_debug_root; 308 struct dentry *osb_debug_root;
309 struct dentry *osb_ctxt;
311 310
312 wait_queue_head_t recovery_event; 311 wait_queue_head_t recovery_event;
313 312
@@ -344,6 +343,12 @@ struct ocfs2_super
344 343
345 /* used to protect metaecc calculation check of xattr. */ 344 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock; 345 spinlock_t osb_xattr_lock;
346
347 unsigned int osb_dx_mask;
348 u32 osb_dx_seed[4];
349
350 /* the group we used to allocate inodes. */
351 u64 osb_inode_alloc_group;
347}; 352};
348 353
349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 354#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
402 return 0; 407 return 0;
403} 408}
404 409
410static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
411{
412 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
413 return 1;
414 return 0;
415}
416
417static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
418{
419 if (ocfs2_supports_indexed_dirs(osb))
420 return OCFS2_DX_LINK_MAX;
421 return OCFS2_LINK_MAX;
422}
423
424static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
425{
426 u32 nlink = le16_to_cpu(di->i_links_count);
427 u32 hi = le16_to_cpu(di->i_links_count_hi);
428
429 if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
430 nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
431
432 return nlink;
433}
434
435static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
436{
437 u16 lo, hi;
438
439 lo = nlink;
440 hi = nlink >> OCFS2_LINKS_HI_SHIFT;
441
442 di->i_links_count = cpu_to_le16(lo);
443 di->i_links_count_hi = cpu_to_le16(hi);
444}
445
446static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
447{
448 u32 links = ocfs2_read_links_count(di);
449
450 links += n;
451
452 ocfs2_set_links_count(di, links);
453}
454
405/* set / clear functions because cluster events can make these happen 455/* set / clear functions because cluster events can make these happen
406 * in parallel so we want the transitions to be atomic. this also 456 * in parallel so we want the transitions to be atomic. this also
407 * means that any future flags osb_flags must be protected by spinlock 457 * means that any future flags osb_flags must be protected by spinlock
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
482#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 532#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
483 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 533 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
484 534
535#define OCFS2_IS_VALID_DX_ROOT(ptr) \
536 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
537
538#define OCFS2_IS_VALID_DX_LEAF(ptr) \
539 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
540
485static inline unsigned long ino_from_blkno(struct super_block *sb, 541static inline unsigned long ino_from_blkno(struct super_block *sb,
486 u64 blkno) 542 u64 blkno)
487{ 543{
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
532 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; 588 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
533} 589}
534 590
591static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
592 u64 blocks)
593{
594 int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
595 unsigned int clusters;
596
597 clusters = ocfs2_blocks_to_clusters(sb, blocks);
598 return (u64)clusters << bits;
599}
600
535static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, 601static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
536 u64 bytes) 602 u64 bytes)
537{ 603{
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -95,7 +97,8 @@
95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 97 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
97 | OCFS2_FEATURE_INCOMPAT_XATTR \ 99 | OCFS2_FEATURE_INCOMPAT_XATTR \
98 | OCFS2_FEATURE_INCOMPAT_META_ECC) 100 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
151/* Support for extended attributes */ 154/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 155#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 156
157/* Support for indexed directories */
158#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
159
154/* Metadata checksum and error correction */ 160/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 162
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
411#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
412 OCFS2_DIR_ROUND) & \ 418 OCFS2_DIR_ROUND) & \
413 ~OCFS2_DIR_ROUND) 419 ~OCFS2_DIR_ROUND)
420#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
414 421
415#define OCFS2_LINK_MAX 32000 422#define OCFS2_LINK_MAX 32000
423#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
424#define OCFS2_LINKS_HI_SHIFT 16
425#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
416 426
417#define S_SHIFT 12 427#define S_SHIFT 12
418static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 428static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 638/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 639 for this fs*/
630 __le16 s_reserved0; 640 __le16 s_reserved0;
631 __le32 s_reserved1; 641 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 642 * s_uuid_hash serves as seed[3]. */
643/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 644/*140*/
634 645
635 /* 646 /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
679 belongs to */ 690 belongs to */
680 __le16 i_suballoc_bit; /* Bit offset in suballocator 691 __le16 i_suballoc_bit; /* Bit offset in suballocator
681 block group */ 692 block group */
682/*10*/ __le16 i_reserved0; 693/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
683 __le16 i_xattr_inline_size; 694 __le16 i_xattr_inline_size;
684 __le32 i_clusters; /* Cluster count */ 695 __le32 i_clusters; /* Cluster count */
685 __le32 i_uid; /* Owner UID */ 696 __le32 i_uid; /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 716 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 717 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 718/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5];
709/*B8*/ union { 721/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 722 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 723 64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 793/*40*/
782}; 794};
783 795
796 /*
797 * A directory entry in the indexed tree. We don't store the full name here,
798 * but instead provide a pointer to the full dirent in the unindexed tree.
799 *
800 * We also store name_len here so as to reduce the number of leaf blocks we
801 * need to search in case of collisions.
802 */
803struct ocfs2_dx_entry {
804 __le32 dx_major_hash; /* Used to find logical
805 * cluster in index */
806 __le32 dx_minor_hash; /* Lower bits used to find
807 * block in cluster */
808 __le64 dx_dirent_blk; /* Physical block in unindexed
809 * tree holding this dirent. */
810};
811
812struct ocfs2_dx_entry_list {
813 __le32 de_reserved;
814 __le16 de_count; /* Maximum number of entries
815 * possible in de_entries */
816 __le16 de_num_used; /* Current number of
817 * de_entries entries */
818 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
819 * in a packed array of
820 * length de_num_used */
821};
822
823#define OCFS2_DX_FLAG_INLINE 0x01
824
825/*
826 * A directory indexing block. Each indexed directory has one of these,
827 * pointed to by ocfs2_dinode.
828 *
829 * This block stores an indexed btree root, and a set of free space
830 * start-of-list pointers.
831 */
832struct ocfs2_dx_root_block {
833 __u8 dr_signature[8]; /* Signature for verification */
834 struct ocfs2_block_check dr_check; /* Error checking */
835 __le16 dr_suballoc_slot; /* Slot suballocator this
836 * block belongs to. */
837 __le16 dr_suballoc_bit; /* Bit offset in suballocator
838 * block group */
839 __le32 dr_fs_generation; /* Must match super block */
840 __le64 dr_blkno; /* Offset on disk, in blocks */
841 __le64 dr_last_eb_blk; /* Pointer to last
842 * extent block */
843 __le32 dr_clusters; /* Clusters allocated
844 * to the indexed tree. */
845 __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
846 __u8 dr_reserved0;
847 __le16 dr_reserved1;
848 __le64 dr_dir_blkno; /* Pointer to parent inode */
849 __le32 dr_num_entries; /* Total number of
850 * names stored in
851 * this directory.*/
852 __le32 dr_reserved2;
853 __le64 dr_free_blk; /* Pointer to head of free
854 * unindexed block list. */
855 __le64 dr_reserved3[15];
856 union {
857 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
858 * bits for maximum space
859 * efficiency. */
860 struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
861 * entries. We grow out
862 * to extents if this
863 * gets too big. */
864 };
865};
866
867/*
868 * The header of a leaf block in the indexed tree.
869 */
870struct ocfs2_dx_leaf {
871 __u8 dl_signature[8];/* Signature for verification */
872 struct ocfs2_block_check dl_check; /* Error checking */
873 __le64 dl_blkno; /* Offset on disk, in blocks */
874 __le32 dl_fs_generation;/* Must match super block */
875 __le32 dl_reserved0;
876 __le64 dl_reserved1;
877 struct ocfs2_dx_entry_list dl_list;
878};
879
784/* 880/*
785 * On disk allocator group structure for OCFS2 881 * On disk allocator group structure for OCFS2
786 */ 882 */
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1112 return size / sizeof(struct ocfs2_extent_rec); 1208 return size / sizeof(struct ocfs2_extent_rec);
1113} 1209}
1114 1210
1211static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1212{
1213 int size;
1214
1215 size = sb->s_blocksize -
1216 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1217
1218 return size / sizeof(struct ocfs2_extent_rec);
1219}
1220
1115static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1221static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1116{ 1222{
1117 int size; 1223 int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1132 return size / sizeof(struct ocfs2_extent_rec); 1238 return size / sizeof(struct ocfs2_extent_rec);
1133} 1239}
1134 1240
1241static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1242{
1243 int size;
1244
1245 size = sb->s_blocksize -
1246 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1247
1248 return size / sizeof(struct ocfs2_dx_entry);
1249}
1250
1251static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
1252{
1253 int size;
1254
1255 size = sb->s_blocksize -
1256 offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
1257
1258 return size / sizeof(struct ocfs2_dx_entry);
1259}
1260
1135static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1261static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1136{ 1262{
1137 u16 size; 1263 u16 size;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC,
50 OCFS2_NUM_LOCK_TYPES 51 OCFS2_NUM_LOCK_TYPES
51}; 52};
52 53
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
81 case OCFS2_LOCK_TYPE_QINFO: 82 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q'; 83 c = 'Q';
83 break; 84 break;
85 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y';
87 break;
84 default: 88 default:
85 c = '\0'; 89 c = '\0';
86 } 90 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..b4ca5911caaf 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50#define NOT_ALLOC_NEW_GROUP 0 50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
52 53
53#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_INODES_TO_STEAL 1024
54 55
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
64static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 struct inode *alloc_inode, 66 struct inode *alloc_inode,
66 struct buffer_head *bh, 67 struct buffer_head *bh,
67 u64 max_block); 68 u64 max_block,
69 u64 *last_alloc_group,
70 int flags);
68 71
69static int ocfs2_cluster_group_search(struct inode *inode, 72static int ocfs2_cluster_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 73 struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
116 u16 *bg_bit_off); 119 u16 *bg_bit_off);
117static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 u32 bits_wanted, u64 max_block, 121 u32 bits_wanted, u64 max_block,
122 int flags,
119 struct ocfs2_alloc_context **ac); 123 struct ocfs2_alloc_context **ac);
120 124
121void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 struct inode *alloc_inode, 408 struct inode *alloc_inode,
405 struct buffer_head *bh, 409 struct buffer_head *bh,
406 u64 max_block) 410 u64 max_block,
411 u64 *last_alloc_group,
412 int flags)
407{ 413{
408 int status, credits; 414 int status, credits;
409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
423 cl = &fe->id2.i_chain; 429 cl = &fe->id2.i_chain;
424 status = ocfs2_reserve_clusters_with_limit(osb, 430 status = ocfs2_reserve_clusters_with_limit(osb,
425 le16_to_cpu(cl->cl_cpg), 431 le16_to_cpu(cl->cl_cpg),
426 max_block, &ac); 432 max_block, flags, &ac);
427 if (status < 0) { 433 if (status < 0) {
428 if (status != -ENOSPC) 434 if (status != -ENOSPC)
429 mlog_errno(status); 435 mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
440 goto bail; 446 goto bail;
441 } 447 }
442 448
449 if (last_alloc_group && *last_alloc_group != 0) {
450 mlog(0, "use old allocation group %llu for block group alloc\n",
451 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group;
453 }
443 status = ocfs2_claim_clusters(osb, 454 status = ocfs2_claim_clusters(osb,
444 handle, 455 handle,
445 ac, 456 ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
514 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 526
516 status = 0; 527 status = 0;
528
529 /* save the new last alloc group so that the caller can cache it. */
530 if (last_alloc_group)
531 *last_alloc_group = ac->ac_last_group;
532
517bail: 533bail:
518 if (handle) 534 if (handle)
519 ocfs2_commit_trans(osb, handle); 535 ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 struct ocfs2_alloc_context *ac, 547 struct ocfs2_alloc_context *ac,
532 int type, 548 int type,
533 u32 slot, 549 u32 slot,
534 int alloc_new_group) 550 u64 *last_alloc_group,
551 int flags)
535{ 552{
536 int status; 553 int status;
537 u32 bits_wanted = ac->ac_bits_wanted; 554 u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
587 goto bail; 604 goto bail;
588 } 605 }
589 606
590 if (alloc_new_group != ALLOC_NEW_GROUP) { 607 if (!(flags & ALLOC_NEW_GROUP)) {
591 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 "and we don't alloc a new group for it.\n", 609 "and we don't alloc a new group for it.\n",
593 slot, bits_wanted, free_bits); 610 slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
596 } 613 }
597 614
598 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 ac->ac_max_block); 616 ac->ac_max_block,
617 last_alloc_group, flags);
600 if (status < 0) { 618 if (status < 0) {
601 if (status != -ENOSPC) 619 if (status != -ENOSPC)
602 mlog_errno(status); 620 mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
640 658
641 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 659 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 EXTENT_ALLOC_SYSTEM_INODE, 660 EXTENT_ALLOC_SYSTEM_INODE,
643 slot, ALLOC_NEW_GROUP); 661 slot, NULL, ALLOC_NEW_GROUP);
644 if (status < 0) { 662 if (status < 0) {
645 if (status != -ENOSPC) 663 if (status != -ENOSPC)
646 mlog_errno(status); 664 mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
686 704
687 status = ocfs2_reserve_suballoc_bits(osb, ac, 705 status = ocfs2_reserve_suballoc_bits(osb, ac,
688 INODE_ALLOC_SYSTEM_INODE, 706 INODE_ALLOC_SYSTEM_INODE,
689 slot, NOT_ALLOC_NEW_GROUP); 707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
690 if (status >= 0) { 709 if (status >= 0) {
691 ocfs2_set_inode_steal_slot(osb, slot); 710 ocfs2_set_inode_steal_slot(osb, slot);
692 break; 711 break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
703{ 722{
704 int status; 723 int status;
705 s16 slot = ocfs2_get_inode_steal_slot(osb); 724 s16 slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group;
706 726
707 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 if (!(*ac)) { 728 if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
738 goto inode_steal; 758 goto inode_steal;
739 759
740 atomic_set(&osb->s_num_inodes_stolen, 0); 760 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group;
741 status = ocfs2_reserve_suballoc_bits(osb, *ac, 762 status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 INODE_ALLOC_SYSTEM_INODE, 763 INODE_ALLOC_SYSTEM_INODE,
743 osb->slot_num, ALLOC_NEW_GROUP); 764 osb->slot_num,
765 &alloc_group,
766 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL);
744 if (status >= 0) { 768 if (status >= 0) {
745 status = 0; 769 status = 0;
746 770
771 spin_lock(&osb->osb_lock);
772 osb->osb_inode_alloc_group = alloc_group;
773 spin_unlock(&osb->osb_lock);
774 mlog(0, "after reservation, new allocation group is "
775 "%llu\n", (unsigned long long)alloc_group);
776
747 /* 777 /*
748 * Some inodes must be freed by us, so try to allocate 778 * Some inodes must be freed by us, so try to allocate
749 * from our own next time. 779 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
790 820
791 status = ocfs2_reserve_suballoc_bits(osb, ac, 821 status = ocfs2_reserve_suballoc_bits(osb, ac,
792 GLOBAL_BITMAP_SYSTEM_INODE, 822 GLOBAL_BITMAP_SYSTEM_INODE,
793 OCFS2_INVALID_SLOT, 823 OCFS2_INVALID_SLOT, NULL,
794 ALLOC_NEW_GROUP); 824 ALLOC_NEW_GROUP);
795 if (status < 0 && status != -ENOSPC) { 825 if (status < 0 && status != -ENOSPC) {
796 mlog_errno(status); 826 mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
806 * things a bit. */ 836 * things a bit. */
807static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 u32 bits_wanted, u64 max_block, 838 u32 bits_wanted, u64 max_block,
839 int flags,
809 struct ocfs2_alloc_context **ac) 840 struct ocfs2_alloc_context **ac)
810{ 841{
811 int status; 842 int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
823 (*ac)->ac_max_block = max_block; 854 (*ac)->ac_max_block = max_block;
824 855
825 status = -ENOSPC; 856 status = -ENOSPC;
826 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 857 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
858 ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 status = ocfs2_reserve_local_alloc_bits(osb, 859 status = ocfs2_reserve_local_alloc_bits(osb,
828 bits_wanted, 860 bits_wanted,
829 *ac); 861 *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 u32 bits_wanted, 893 u32 bits_wanted,
862 struct ocfs2_alloc_context **ac) 894 struct ocfs2_alloc_context **ac)
863{ 895{
864 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); 896 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
897 ALLOC_NEW_GROUP, ac);
865} 898}
866 899
867/* 900/*
@@ -1618,8 +1651,41 @@ bail:
1618 return status; 1651 return status;
1619} 1652}
1620 1653
1654static void ocfs2_init_inode_ac_group(struct inode *dir,
1655 struct buffer_head *parent_fe_bh,
1656 struct ocfs2_alloc_context *ac)
1657{
1658 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1659 /*
1660 * Try to allocate inodes from some specific group.
1661 *
1662 * If the parent dir has recorded the last group used in allocation,
1663 * cool, use it. Otherwise if we try to allocate new inode from the
1664 * same slot the parent dir belongs to, use the same chunk.
1665 *
1666 * We are very careful here to avoid the mistake of setting
1667 * ac_last_group to a group descriptor from a different (unlocked) slot.
1668 */
1669 if (OCFS2_I(dir)->ip_last_used_group &&
1670 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1671 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1672 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1673 ac->ac_last_group = ocfs2_which_suballoc_group(
1674 le64_to_cpu(fe->i_blkno),
1675 le16_to_cpu(fe->i_suballoc_bit));
1676}
1677
1678static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1679 struct ocfs2_alloc_context *ac)
1680{
1681 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1682 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1683}
1684
1621int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1685int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 handle_t *handle, 1686 handle_t *handle,
1687 struct inode *dir,
1688 struct buffer_head *parent_fe_bh,
1623 struct ocfs2_alloc_context *ac, 1689 struct ocfs2_alloc_context *ac,
1624 u16 *suballoc_bit, 1690 u16 *suballoc_bit,
1625 u64 *fe_blkno) 1691 u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1635 BUG_ON(ac->ac_bits_wanted != 1); 1701 BUG_ON(ac->ac_bits_wanted != 1);
1636 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1702 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 1703
1704 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1705
1638 status = ocfs2_claim_suballoc_bits(osb, 1706 status = ocfs2_claim_suballoc_bits(osb,
1639 ac, 1707 ac,
1640 handle, 1708 handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1653 1721
1654 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1722 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 ac->ac_bits_given++; 1723 ac->ac_bits_given++;
1724 ocfs2_save_inode_ac_group(dir, ac);
1656 status = 0; 1725 status = 0;
1657bail: 1726bail:
1658 mlog_exit(status); 1727 mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
2116 2185
2117 return ret; 2186 return ret;
2118} 2187}
2188
2189/*
2190 * Read the inode specified by blkno to get suballoc_slot and
2191 * suballoc_bit.
2192 */
2193static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2194 u16 *suballoc_slot, u16 *suballoc_bit)
2195{
2196 int status;
2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe;
2199
2200 mlog_entry("blkno: %llu\n", blkno);
2201
2202 /* dirty read disk */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
2206 goto bail;
2207 }
2208
2209 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2210 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2211 mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
2212 status = -EINVAL;
2213 goto bail;
2214 }
2215
2216 if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
2217 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2218 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2219 blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2220 status = -EINVAL;
2221 goto bail;
2222 }
2223
2224 if (suballoc_slot)
2225 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2226 if (suballoc_bit)
2227 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2228
2229bail:
2230 brelse(inode_bh);
2231
2232 mlog_exit(status);
2233 return status;
2234}
2235
2236/*
 2237 * Test whether bit is SET in allocator bitmap or not. On success, 0
 2238 * is returned and *res is 1 for SET; 0 otherwise. On failure, a
 2239 * negative errno is returned and *res is meaningless. Call this after
 2240 * you have cluster locked against suballoc, or you may get a result
 2241 * based on out-of-date contents.
2242 */
2243static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2244 struct inode *suballoc,
2245 struct buffer_head *alloc_bh, u64 blkno,
2246 u16 bit, int *res)
2247{
2248 struct ocfs2_dinode *alloc_fe;
2249 struct ocfs2_group_desc *group;
2250 struct buffer_head *group_bh = NULL;
2251 u64 bg_blkno;
2252 int status;
2253
2254 mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
2255
2256 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2257 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2258 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2259 (unsigned int)bit,
2260 ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2261 status = -EINVAL;
2262 goto bail;
2263 }
2264
2265 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2266 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2267 &group_bh);
2268 if (status < 0) {
2269 mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
2270 goto bail;
2271 }
2272
2273 group = (struct ocfs2_group_desc *) group_bh->b_data;
2274 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2275
2276bail:
2277 brelse(group_bh);
2278
2279 mlog_exit(status);
2280 return status;
2281}
2282
2283/*
2284 * Test if the bit representing this inode (blkno) is set in the
2285 * suballocator.
2286 *
2287 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2288 *
2289 * In the event of failure, a negative value is returned and *res is
2290 * meaningless.
2291 *
2292 * Callers must make sure to hold nfs_sync_lock to prevent
2293 * ocfs2_delete_inode() on another node from accessing the same
2294 * suballocator concurrently.
2295 */
2296int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2297{
2298 int status;
2299 u16 suballoc_bit = 0, suballoc_slot = 0;
2300 struct inode *inode_alloc_inode;
2301 struct buffer_head *alloc_bh = NULL;
2302
2303 mlog_entry("blkno: %llu", blkno);
2304
2305 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2306 &suballoc_bit);
2307 if (status < 0) {
2308 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2309 goto bail;
2310 }
2311
2312 inode_alloc_inode =
2313 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2314 suballoc_slot);
2315 if (!inode_alloc_inode) {
2316 /* the error code could be inaccurate, but we are not able to
2317 * get the correct one. */
2318 status = -EINVAL;
2319 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2320 (u32)suballoc_slot);
2321 goto bail;
2322 }
2323
2324 mutex_lock(&inode_alloc_inode->i_mutex);
2325 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2326 if (status < 0) {
2327 mutex_unlock(&inode_alloc_inode->i_mutex);
2328 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2329 (u32)suballoc_slot, status);
2330 goto bail;
2331 }
2332
2333 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2334 blkno, suballoc_bit, res);
2335 if (status < 0)
2336 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2337
2338 ocfs2_inode_unlock(inode_alloc_inode, 0);
2339 mutex_unlock(&inode_alloc_inode->i_mutex);
2340
2341 iput(inode_alloc_inode);
2342 brelse(alloc_bh);
2343bail:
2344 mlog_exit(status);
2345 return status;
2346}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
88 u64 *blkno_start); 88 u64 *blkno_start);
89int ocfs2_claim_new_inode(struct ocfs2_super *osb, 89int ocfs2_claim_new_inode(struct ocfs2_super *osb,
90 handle_t *handle, 90 handle_t *handle,
91 struct inode *dir,
92 struct buffer_head *parent_fe_bh,
91 struct ocfs2_alloc_context *ac, 93 struct ocfs2_alloc_context *ac,
92 u16 *suballoc_bit, 94 u16 *suballoc_bit,
93 u64 *fe_blkno); 95 u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
186 u32 clusters_to_add, u32 extents_to_split, 188 u32 clusters_to_add, u32 extents_to_split,
187 struct ocfs2_alloc_context **data_ac, 189 struct ocfs2_alloc_context **data_ac,
188 struct ocfs2_alloc_context **meta_ac); 190 struct ocfs2_alloc_context **meta_ac);
191
192int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
189#endif /* _CHAINALLOC_H_ */ 193#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
201 {Opt_err, NULL} 201 {Opt_err, NULL}
202}; 202};
203 203
204#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map;
211
212 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
214 "Device", osb->dev_str, osb->uuid_str,
215 osb->fs_generation, osb->vol_label);
216
217 out += snprintf(buf + out, len - out,
218 "%10s => State: %d Flags: 0x%lX\n", "Volume",
219 atomic_read(&osb->vol_state), osb->osb_flags);
220
221 out += snprintf(buf + out, len - out,
222 "%10s => Block: %lu Cluster: %d\n", "Sizes",
223 osb->sb->s_blocksize, osb->s_clustersize);
224
225 out += snprintf(buf + out, len - out,
226 "%10s => Compat: 0x%X Incompat: 0x%X "
227 "ROcompat: 0x%X\n",
228 "Features", osb->s_feature_compat,
229 osb->s_feature_incompat, osb->s_feature_ro_compat);
230
231 out += snprintf(buf + out, len - out,
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum);
234
235 out += snprintf(buf + out, len - out,
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n",
237 "Cluster",
238 (*osb->osb_cluster_stack == '\0' ?
239 "o2cb" : osb->osb_cluster_stack),
240 cconn->cc_namelen, cconn->cc_name,
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
242
243 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count,
248 osb->dc_wake_sequence, osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock);
250
251 spin_lock(&osb->osb_lock);
252 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
253 "Recovery",
254 (osb->recovery_thread_task ?
255 task_pid_nr(osb->recovery_thread_task) : -1));
256 if (rm->rm_used == 0)
257 out += snprintf(buf + out, len - out, " None\n");
258 else {
259 for (i = 0; i < rm->rm_used; i++)
260 out += snprintf(buf + out, len - out, " %d",
261 rm->rm_entries[i]);
262 out += snprintf(buf + out, len - out, "\n");
263 }
264 spin_unlock(&osb->osb_lock);
265
266 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint));
270
271 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n",
273 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans),
275 osb->journal->j_trans_id);
276
277 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d "
279 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
280 "Stats",
281 atomic_read(&osb->alloc_stats.bitmap_data),
282 atomic_read(&osb->alloc_stats.local_data),
283 atomic_read(&osb->alloc_stats.bg_allocs),
284 atomic_read(&osb->alloc_stats.moves),
285 atomic_read(&osb->alloc_stats.bg_extends));
286
287 out += snprintf(buf + out, len - out,
288 "%10s => State: %u Descriptor: %llu Size: %u bits "
289 "Default: %u bits\n",
290 "LocalAlloc", osb->local_alloc_state,
291 (unsigned long long)osb->la_last_gd,
292 osb->local_alloc_bits, osb->local_alloc_default_bits);
293
294 spin_lock(&osb->osb_lock);
295 out += snprintf(buf + out, len - out,
296 "%10s => Slot: %d NumStolen: %d\n", "Steal",
297 osb->s_inode_steal_slot,
298 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock);
300
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n",
307 " ",
308 (i == osb->slot_num ? '*' : ' '),
309 i, osb->slot_recovery_generations[i]);
310 }
311
312 return out;
313}
314
315static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
316{
317 struct ocfs2_super *osb = inode->i_private;
318 char *buf = NULL;
319
320 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
321 if (!buf)
322 goto bail;
323
324 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
325
326 file->private_data = buf;
327
328 return 0;
329bail:
330 return -ENOMEM;
331}
332
333static int ocfs2_debug_release(struct inode *inode, struct file *file)
334{
335 kfree(file->private_data);
336 return 0;
337}
338
339static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
340 size_t nbytes, loff_t *ppos)
341{
342 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
343 i_size_read(file->f_mapping->host));
344}
345#else
346static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
347{
348 return 0;
349}
350static int ocfs2_debug_release(struct inode *inode, struct file *file)
351{
352 return 0;
353}
354static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
355 size_t nbytes, loff_t *ppos)
356{
357 return 0;
358}
359#endif /* CONFIG_DEBUG_FS */
360
361static struct file_operations ocfs2_osb_debug_fops = {
362 .open = ocfs2_osb_debug_open,
363 .release = ocfs2_debug_release,
364 .read = ocfs2_debug_read,
365 .llseek = generic_file_llseek,
366};
367
204/* 368/*
205 * write_super and sync_fs ripped right out of ext3. 369 * write_super and sync_fs ripped right out of ext3.
206 */ 370 */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
926 goto read_super_error; 1090 goto read_super_error;
927 } 1091 }
928 1092
1093 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1094 osb->osb_debug_root,
1095 osb,
1096 &ocfs2_osb_debug_fops);
1097 if (!osb->osb_ctxt) {
1098 status = -EINVAL;
1099 mlog_errno(status);
1100 goto read_super_error;
1101 }
1102
929 status = ocfs2_mount_volume(sb); 1103 status = ocfs2_mount_volume(sb);
930 if (osb->root_inode) 1104 if (osb->root_inode)
931 inode = igrab(osb->root_inode); 1105 inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1620 osb = OCFS2_SB(sb); 1794 osb = OCFS2_SB(sb);
1621 BUG_ON(!osb); 1795 BUG_ON(!osb);
1622 1796
1797 debugfs_remove(osb->osb_ctxt);
1798
1623 ocfs2_disable_quotas(osb); 1799 ocfs2_disable_quotas(osb);
1624 1800
1625 ocfs2_shutdown_local_alloc(osb); 1801 ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1742 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1743 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1744 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1745 osb->sb = sb; 1927 osb->sb = sb;
1746 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1747 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2130 * lock, and it's marked as dirty, set the bit in the recover 2312 * lock, and it's marked as dirty, set the bit in the recover
2131 * map and launch a recovery thread for it. */ 2313 * map and launch a recovery thread for it. */
2132 status = ocfs2_mark_dead_nodes(osb); 2314 status = ocfs2_mark_dead_nodes(osb);
2315 if (status < 0) {
2316 mlog_errno(status);
2317 goto finally;
2318 }
2319
2320 status = ocfs2_compute_replay_slots(osb);
2133 if (status < 0) 2321 if (status < 0)
2134 mlog_errno(status); 2322 mlog_errno(status);
2135 2323
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || 555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
558 if (ret) {
559 mlog_errno(ret);
560 return ret;
561 }
562 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
563 } 559 }
564 560
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,