aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@g5.osdl.org>2006-02-03 18:21:40 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-02-03 18:21:40 -0500
commitd1ffa5669cd834f901141756e63195f48c1bfbf9 (patch)
treef0bed266c1f3fef528bbced56b48aac63e0a26b1 /fs
parentd6c8f6aaa1d7f68c1e6471ab0839d9047cdd159f (diff)
parent6eff5790d57a5c9c01489c95946881808a4b2a2c (diff)
Merge branch 'upstream-linus' of git://oss.oracle.com/home/sourcebo/git/ocfs2
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/configfs/configfs_internal.h11
-rw-r--r--fs/configfs/dir.c36
-rw-r--r--fs/configfs/file.c19
-rw-r--r--fs/configfs/inode.c120
-rw-r--r--fs/configfs/mount.c28
-rw-r--r--fs/configfs/symlink.c4
-rw-r--r--fs/ocfs2/buffer_head_io.c10
-rw-r--r--fs/ocfs2/cluster/heartbeat.c5
-rw-r--r--fs/ocfs2/cluster/tcp.c16
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h1
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c18
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c24
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c250
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c13
-rw-r--r--fs/ocfs2/dlm/userdlm.c2
-rw-r--r--fs/ocfs2/extent_map.c12
-rw-r--r--fs/ocfs2/file.c10
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/inode.h4
-rw-r--r--fs/ocfs2/journal.c32
-rw-r--r--fs/ocfs2/ocfs2.h3
-rw-r--r--fs/ocfs2/super.c11
-rw-r--r--fs/ocfs2/sysfile.c6
-rw-r--r--fs/ocfs2/uptodate.c12
-rw-r--r--fs/ocfs2/uptodate.h2
26 files changed, 501 insertions, 156 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 93b5dc4082ff..e9749b0eecd8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -883,8 +883,6 @@ config CONFIGFS_FS
883 Both sysfs and configfs can and should exist together on the 883 Both sysfs and configfs can and should exist together on the
884 same system. One is not a replacement for the other. 884 same system. One is not a replacement for the other.
885 885
886 If unsure, say N.
887
888endmenu 886endmenu
889 887
890menu "Miscellaneous filesystems" 888menu "Miscellaneous filesystems"
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 8899d9c5f6bf..f70e46951b37 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -36,6 +36,7 @@ struct configfs_dirent {
36 int s_type; 36 int s_type;
37 umode_t s_mode; 37 umode_t s_mode;
38 struct dentry * s_dentry; 38 struct dentry * s_dentry;
39 struct iattr * s_iattr;
39}; 40};
40 41
41#define CONFIGFS_ROOT 0x0001 42#define CONFIGFS_ROOT 0x0001
@@ -48,10 +49,11 @@ struct configfs_dirent {
48#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) 49#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
49 50
50extern struct vfsmount * configfs_mount; 51extern struct vfsmount * configfs_mount;
52extern kmem_cache_t *configfs_dir_cachep;
51 53
52extern int configfs_is_root(struct config_item *item); 54extern int configfs_is_root(struct config_item *item);
53 55
54extern struct inode * configfs_new_inode(mode_t mode); 56extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
55extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); 57extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
56 58
57extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); 59extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
@@ -63,6 +65,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
63 65
64extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); 66extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
65extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); 67extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
68extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
66 69
67extern int configfs_pin_fs(void); 70extern int configfs_pin_fs(void);
68extern void configfs_release_fs(void); 71extern void configfs_release_fs(void);
@@ -120,8 +123,10 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
120 123
121static inline void release_configfs_dirent(struct configfs_dirent * sd) 124static inline void release_configfs_dirent(struct configfs_dirent * sd)
122{ 125{
123 if (!(sd->s_type & CONFIGFS_ROOT)) 126 if (!(sd->s_type & CONFIGFS_ROOT)) {
124 kfree(sd); 127 kfree(sd->s_iattr);
128 kmem_cache_free(configfs_dir_cachep, sd);
129 }
125} 130}
126 131
127static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) 132static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b668ec61527e..ca60e3abef45 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
72{ 72{
73 struct configfs_dirent * sd; 73 struct configfs_dirent * sd;
74 74
75 sd = kmalloc(sizeof(*sd), GFP_KERNEL); 75 sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL);
76 if (!sd) 76 if (!sd)
77 return NULL; 77 return NULL;
78 78
@@ -136,13 +136,19 @@ static int create_dir(struct config_item * k, struct dentry * p,
136 int error; 136 int error;
137 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 137 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
138 138
139 error = configfs_create(d, mode, init_dir); 139 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
140 CONFIGFS_DIR);
140 if (!error) { 141 if (!error) {
141 error = configfs_make_dirent(p->d_fsdata, d, k, mode, 142 error = configfs_create(d, mode, init_dir);
142 CONFIGFS_DIR);
143 if (!error) { 143 if (!error) {
144 p->d_inode->i_nlink++; 144 p->d_inode->i_nlink++;
145 (d)->d_op = &configfs_dentry_ops; 145 (d)->d_op = &configfs_dentry_ops;
146 } else {
147 struct configfs_dirent *sd = d->d_fsdata;
148 if (sd) {
149 list_del_init(&sd->s_sibling);
150 configfs_put(sd);
151 }
146 } 152 }
147 } 153 }
148 return error; 154 return error;
@@ -182,12 +188,19 @@ int configfs_create_link(struct configfs_symlink *sl,
182 int err = 0; 188 int err = 0;
183 umode_t mode = S_IFLNK | S_IRWXUGO; 189 umode_t mode = S_IFLNK | S_IRWXUGO;
184 190
185 err = configfs_create(dentry, mode, init_symlink); 191 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
192 CONFIGFS_ITEM_LINK);
186 if (!err) { 193 if (!err) {
187 err = configfs_make_dirent(parent->d_fsdata, dentry, sl, 194 err = configfs_create(dentry, mode, init_symlink);
188 mode, CONFIGFS_ITEM_LINK);
189 if (!err) 195 if (!err)
190 dentry->d_op = &configfs_dentry_ops; 196 dentry->d_op = &configfs_dentry_ops;
197 else {
198 struct configfs_dirent *sd = dentry->d_fsdata;
199 if (sd) {
200 list_del_init(&sd->s_sibling);
201 configfs_put(sd);
202 }
203 }
191 } 204 }
192 return err; 205 return err;
193} 206}
@@ -241,13 +254,15 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
241 struct configfs_attribute * attr = sd->s_element; 254 struct configfs_attribute * attr = sd->s_element;
242 int error; 255 int error;
243 256
257 dentry->d_fsdata = configfs_get(sd);
258 sd->s_dentry = dentry;
244 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); 259 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
245 if (error) 260 if (error) {
261 configfs_put(sd);
246 return error; 262 return error;
263 }
247 264
248 dentry->d_op = &configfs_dentry_ops; 265 dentry->d_op = &configfs_dentry_ops;
249 dentry->d_fsdata = configfs_get(sd);
250 sd->s_dentry = dentry;
251 d_rehash(dentry); 266 d_rehash(dentry);
252 267
253 return 0; 268 return 0;
@@ -839,6 +854,7 @@ struct inode_operations configfs_dir_inode_operations = {
839 .symlink = configfs_symlink, 854 .symlink = configfs_symlink,
840 .unlink = configfs_unlink, 855 .unlink = configfs_unlink,
841 .lookup = configfs_lookup, 856 .lookup = configfs_lookup,
857 .setattr = configfs_setattr,
842}; 858};
843 859
844#if 0 860#if 0
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index c26cd61f13af..3921920d8716 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/dnotify.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <asm/uaccess.h> 30#include <asm/uaccess.h>
32#include <asm/semaphore.h> 31#include <asm/semaphore.h>
@@ -150,7 +149,7 @@ out:
150/** 149/**
151 * fill_write_buffer - copy buffer from userspace. 150 * fill_write_buffer - copy buffer from userspace.
152 * @buffer: data buffer for file. 151 * @buffer: data buffer for file.
153 * @userbuf: data from user. 152 * @buf: data from user.
154 * @count: number of bytes in @userbuf. 153 * @count: number of bytes in @userbuf.
155 * 154 *
156 * Allocate @buffer->page if it hasn't been already, then 155 * Allocate @buffer->page if it hasn't been already, then
@@ -177,8 +176,9 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
177 176
178/** 177/**
179 * flush_write_buffer - push buffer to config_item. 178 * flush_write_buffer - push buffer to config_item.
180 * @file: file pointer. 179 * @dentry: dentry to the attribute
181 * @buffer: data buffer for file. 180 * @buffer: data buffer for file.
181 * @count: number of bytes
182 * 182 *
183 * Get the correct pointers for the config_item and the attribute we're 183 * Get the correct pointers for the config_item and the attribute we're
184 * dealing with, then call the store() method for the attribute, 184 * dealing with, then call the store() method for the attribute,
@@ -217,15 +217,16 @@ static ssize_t
217configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 217configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
218{ 218{
219 struct configfs_buffer * buffer = file->private_data; 219 struct configfs_buffer * buffer = file->private_data;
220 ssize_t len;
220 221
221 down(&buffer->sem); 222 down(&buffer->sem);
222 count = fill_write_buffer(buffer,buf,count); 223 len = fill_write_buffer(buffer, buf, count);
223 if (count > 0) 224 if (len > 0)
224 count = flush_write_buffer(file->f_dentry,buffer,count); 225 len = flush_write_buffer(file->f_dentry, buffer, count);
225 if (count > 0) 226 if (len > 0)
226 *ppos += count; 227 *ppos += len;
227 up(&buffer->sem); 228 up(&buffer->sem);
228 return count; 229 return len;
229} 230}
230 231
231static int check_perm(struct inode * inode, struct file * file) 232static int check_perm(struct inode * inode, struct file * file)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 6577c588de9d..c153bd9534cb 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/namei.h> 32#include <linux/namei.h>
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/capability.h>
34 35
35#include <linux/configfs.h> 36#include <linux/configfs.h>
36#include "configfs_internal.h" 37#include "configfs_internal.h"
@@ -48,18 +49,107 @@ static struct backing_dev_info configfs_backing_dev_info = {
48 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 49 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
49}; 50};
50 51
51struct inode * configfs_new_inode(mode_t mode) 52static struct inode_operations configfs_inode_operations ={
53 .setattr = configfs_setattr,
54};
55
56int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
57{
58 struct inode * inode = dentry->d_inode;
59 struct configfs_dirent * sd = dentry->d_fsdata;
60 struct iattr * sd_iattr;
61 unsigned int ia_valid = iattr->ia_valid;
62 int error;
63
64 if (!sd)
65 return -EINVAL;
66
67 sd_iattr = sd->s_iattr;
68
69 error = inode_change_ok(inode, iattr);
70 if (error)
71 return error;
72
73 error = inode_setattr(inode, iattr);
74 if (error)
75 return error;
76
77 if (!sd_iattr) {
78 /* setting attributes for the first time, allocate now */
79 sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL);
80 if (!sd_iattr)
81 return -ENOMEM;
82 /* assign default attributes */
83 memset(sd_iattr, 0, sizeof(struct iattr));
84 sd_iattr->ia_mode = sd->s_mode;
85 sd_iattr->ia_uid = 0;
86 sd_iattr->ia_gid = 0;
87 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
88 sd->s_iattr = sd_iattr;
89 }
90
91 /* attributes were changed atleast once in past */
92
93 if (ia_valid & ATTR_UID)
94 sd_iattr->ia_uid = iattr->ia_uid;
95 if (ia_valid & ATTR_GID)
96 sd_iattr->ia_gid = iattr->ia_gid;
97 if (ia_valid & ATTR_ATIME)
98 sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
99 inode->i_sb->s_time_gran);
100 if (ia_valid & ATTR_MTIME)
101 sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
102 inode->i_sb->s_time_gran);
103 if (ia_valid & ATTR_CTIME)
104 sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
105 inode->i_sb->s_time_gran);
106 if (ia_valid & ATTR_MODE) {
107 umode_t mode = iattr->ia_mode;
108
109 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
110 mode &= ~S_ISGID;
111 sd_iattr->ia_mode = sd->s_mode = mode;
112 }
113
114 return error;
115}
116
117static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
118{
119 inode->i_mode = mode;
120 inode->i_uid = 0;
121 inode->i_gid = 0;
122 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
123}
124
125static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
126{
127 inode->i_mode = iattr->ia_mode;
128 inode->i_uid = iattr->ia_uid;
129 inode->i_gid = iattr->ia_gid;
130 inode->i_atime = iattr->ia_atime;
131 inode->i_mtime = iattr->ia_mtime;
132 inode->i_ctime = iattr->ia_ctime;
133}
134
135struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
52{ 136{
53 struct inode * inode = new_inode(configfs_sb); 137 struct inode * inode = new_inode(configfs_sb);
54 if (inode) { 138 if (inode) {
55 inode->i_mode = mode;
56 inode->i_uid = 0;
57 inode->i_gid = 0;
58 inode->i_blksize = PAGE_CACHE_SIZE; 139 inode->i_blksize = PAGE_CACHE_SIZE;
59 inode->i_blocks = 0; 140 inode->i_blocks = 0;
60 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
61 inode->i_mapping->a_ops = &configfs_aops; 141 inode->i_mapping->a_ops = &configfs_aops;
62 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 142 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
143 inode->i_op = &configfs_inode_operations;
144
145 if (sd->s_iattr) {
146 /* sysfs_dirent has non-default attributes
147 * get them for the new inode from persistent copy
148 * in sysfs_dirent
149 */
150 set_inode_attr(inode, sd->s_iattr);
151 } else
152 set_default_inode_attr(inode, mode);
63 } 153 }
64 return inode; 154 return inode;
65} 155}
@@ -70,7 +160,8 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
70 struct inode * inode = NULL; 160 struct inode * inode = NULL;
71 if (dentry) { 161 if (dentry) {
72 if (!dentry->d_inode) { 162 if (!dentry->d_inode) {
73 if ((inode = configfs_new_inode(mode))) { 163 struct configfs_dirent *sd = dentry->d_fsdata;
164 if ((inode = configfs_new_inode(mode, sd))) {
74 if (dentry->d_parent && dentry->d_parent->d_inode) { 165 if (dentry->d_parent && dentry->d_parent->d_inode) {
75 struct inode *p_inode = dentry->d_parent->d_inode; 166 struct inode *p_inode = dentry->d_parent->d_inode;
76 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; 167 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
@@ -103,10 +194,9 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
103 */ 194 */
104const unsigned char * configfs_get_name(struct configfs_dirent *sd) 195const unsigned char * configfs_get_name(struct configfs_dirent *sd)
105{ 196{
106 struct attribute * attr; 197 struct configfs_attribute *attr;
107 198
108 if (!sd || !sd->s_element) 199 BUG_ON(!sd || !sd->s_element);
109 BUG();
110 200
111 /* These always have a dentry, so use that */ 201 /* These always have a dentry, so use that */
112 if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) 202 if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
@@ -114,7 +204,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
114 204
115 if (sd->s_type & CONFIGFS_ITEM_ATTR) { 205 if (sd->s_type & CONFIGFS_ITEM_ATTR) {
116 attr = sd->s_element; 206 attr = sd->s_element;
117 return attr->name; 207 return attr->ca_name;
118 } 208 }
119 return NULL; 209 return NULL;
120} 210}
@@ -130,13 +220,17 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
130 220
131 if (dentry) { 221 if (dentry) {
132 spin_lock(&dcache_lock); 222 spin_lock(&dcache_lock);
223 spin_lock(&dentry->d_lock);
133 if (!(d_unhashed(dentry) && dentry->d_inode)) { 224 if (!(d_unhashed(dentry) && dentry->d_inode)) {
134 dget_locked(dentry); 225 dget_locked(dentry);
135 __d_drop(dentry); 226 __d_drop(dentry);
227 spin_unlock(&dentry->d_lock);
136 spin_unlock(&dcache_lock); 228 spin_unlock(&dcache_lock);
137 simple_unlink(parent->d_inode, dentry); 229 simple_unlink(parent->d_inode, dentry);
138 } else 230 } else {
231 spin_unlock(&dentry->d_lock);
139 spin_unlock(&dcache_lock); 232 spin_unlock(&dcache_lock);
233 }
140 } 234 }
141} 235}
142 236
@@ -145,6 +239,10 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
145 struct configfs_dirent * sd; 239 struct configfs_dirent * sd;
146 struct configfs_dirent * parent_sd = dir->d_fsdata; 240 struct configfs_dirent * parent_sd = dir->d_fsdata;
147 241
242 if (dir->d_inode == NULL)
243 /* no inode means this hasn't been made visible yet */
244 return;
245
148 mutex_lock(&dir->d_inode->i_mutex); 246 mutex_lock(&dir->d_inode->i_mutex);
149 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 247 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
150 if (!sd->s_element) 248 if (!sd->s_element)
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 1a2f6f6a4d91..f920d30478e5 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -38,6 +38,7 @@
38 38
39struct vfsmount * configfs_mount = NULL; 39struct vfsmount * configfs_mount = NULL;
40struct super_block * configfs_sb = NULL; 40struct super_block * configfs_sb = NULL;
41kmem_cache_t *configfs_dir_cachep;
41static int configfs_mnt_count = 0; 42static int configfs_mnt_count = 0;
42 43
43static struct super_operations configfs_ops = { 44static struct super_operations configfs_ops = {
@@ -62,6 +63,7 @@ static struct configfs_dirent configfs_root = {
62 .s_children = LIST_HEAD_INIT(configfs_root.s_children), 63 .s_children = LIST_HEAD_INIT(configfs_root.s_children),
63 .s_element = &configfs_root_group.cg_item, 64 .s_element = &configfs_root_group.cg_item,
64 .s_type = CONFIGFS_ROOT, 65 .s_type = CONFIGFS_ROOT,
66 .s_iattr = NULL,
65}; 67};
66 68
67static int configfs_fill_super(struct super_block *sb, void *data, int silent) 69static int configfs_fill_super(struct super_block *sb, void *data, int silent)
@@ -73,9 +75,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
73 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 75 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
74 sb->s_magic = CONFIGFS_MAGIC; 76 sb->s_magic = CONFIGFS_MAGIC;
75 sb->s_op = &configfs_ops; 77 sb->s_op = &configfs_ops;
78 sb->s_time_gran = 1;
76 configfs_sb = sb; 79 configfs_sb = sb;
77 80
78 inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); 81 inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
82 &configfs_root);
79 if (inode) { 83 if (inode) {
80 inode->i_op = &configfs_dir_inode_operations; 84 inode->i_op = &configfs_dir_inode_operations;
81 inode->i_fop = &configfs_dir_operations; 85 inode->i_fop = &configfs_dir_operations;
@@ -128,19 +132,31 @@ static decl_subsys(config, NULL, NULL);
128 132
129static int __init configfs_init(void) 133static int __init configfs_init(void)
130{ 134{
131 int err; 135 int err = -ENOMEM;
136
137 configfs_dir_cachep = kmem_cache_create("configfs_dir_cache",
138 sizeof(struct configfs_dirent),
139 0, 0, NULL, NULL);
140 if (!configfs_dir_cachep)
141 goto out;
132 142
133 kset_set_kset_s(&config_subsys, kernel_subsys); 143 kset_set_kset_s(&config_subsys, kernel_subsys);
134 err = subsystem_register(&config_subsys); 144 err = subsystem_register(&config_subsys);
135 if (err) 145 if (err) {
136 return err; 146 kmem_cache_destroy(configfs_dir_cachep);
147 configfs_dir_cachep = NULL;
148 goto out;
149 }
137 150
138 err = register_filesystem(&configfs_fs_type); 151 err = register_filesystem(&configfs_fs_type);
139 if (err) { 152 if (err) {
140 printk(KERN_ERR "configfs: Unable to register filesystem!\n"); 153 printk(KERN_ERR "configfs: Unable to register filesystem!\n");
141 subsystem_unregister(&config_subsys); 154 subsystem_unregister(&config_subsys);
155 kmem_cache_destroy(configfs_dir_cachep);
156 configfs_dir_cachep = NULL;
142 } 157 }
143 158
159out:
144 return err; 160 return err;
145} 161}
146 162
@@ -148,11 +164,13 @@ static void __exit configfs_exit(void)
148{ 164{
149 unregister_filesystem(&configfs_fs_type); 165 unregister_filesystem(&configfs_fs_type);
150 subsystem_unregister(&config_subsys); 166 subsystem_unregister(&config_subsys);
167 kmem_cache_destroy(configfs_dir_cachep);
168 configfs_dir_cachep = NULL;
151} 169}
152 170
153MODULE_AUTHOR("Oracle"); 171MODULE_AUTHOR("Oracle");
154MODULE_LICENSE("GPL"); 172MODULE_LICENSE("GPL");
155MODULE_VERSION("0.0.1"); 173MODULE_VERSION("0.0.2");
156MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); 174MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
157 175
158module_init(configfs_init); 176module_init(configfs_init);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 50f5840521a9..e5512e295cf2 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -162,8 +162,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
162 if (!(sd->s_type & CONFIGFS_ITEM_LINK)) 162 if (!(sd->s_type & CONFIGFS_ITEM_LINK))
163 goto out; 163 goto out;
164 164
165 if (dentry->d_parent == configfs_sb->s_root) 165 BUG_ON(dentry->d_parent == configfs_sb->s_root);
166 BUG();
167 166
168 sl = sd->s_element; 167 sl = sd->s_element;
169 168
@@ -277,5 +276,6 @@ struct inode_operations configfs_symlink_inode_operations = {
277 .follow_link = configfs_follow_link, 276 .follow_link = configfs_follow_link,
278 .readlink = generic_readlink, 277 .readlink = generic_readlink,
279 .put_link = configfs_put_link, 278 .put_link = configfs_put_link,
279 .setattr = configfs_setattr,
280}; 280};
281 281
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d424041b38e9..bae3d7548bea 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -58,7 +58,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
58 goto out; 58 goto out;
59 } 59 }
60 60
61 down(&OCFS2_I(inode)->ip_io_sem); 61 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
62 62
63 lock_buffer(bh); 63 lock_buffer(bh);
64 set_buffer_uptodate(bh); 64 set_buffer_uptodate(bh);
@@ -82,7 +82,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
82 brelse(bh); 82 brelse(bh);
83 } 83 }
84 84
85 up(&OCFS2_I(inode)->ip_io_sem); 85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
86out: 86out:
87 mlog_exit(ret); 87 mlog_exit(ret);
88 return ret; 88 return ret;
@@ -125,13 +125,13 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
125 flags &= ~OCFS2_BH_CACHED; 125 flags &= ~OCFS2_BH_CACHED;
126 126
127 if (inode) 127 if (inode)
128 down(&OCFS2_I(inode)->ip_io_sem); 128 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
129 for (i = 0 ; i < nr ; i++) { 129 for (i = 0 ; i < nr ; i++) {
130 if (bhs[i] == NULL) { 130 if (bhs[i] == NULL) {
131 bhs[i] = sb_getblk(sb, block++); 131 bhs[i] = sb_getblk(sb, block++);
132 if (bhs[i] == NULL) { 132 if (bhs[i] == NULL) {
133 if (inode) 133 if (inode)
134 up(&OCFS2_I(inode)->ip_io_sem); 134 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
135 status = -EIO; 135 status = -EIO;
136 mlog_errno(status); 136 mlog_errno(status);
137 goto bail; 137 goto bail;
@@ -220,7 +220,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
220 ocfs2_set_buffer_uptodate(inode, bh); 220 ocfs2_set_buffer_uptodate(inode, bh);
221 } 221 }
222 if (inode) 222 if (inode)
223 up(&OCFS2_I(inode)->ip_io_sem); 223 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
224 224
225 mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, 225 mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
226 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); 226 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 7307ba528913..d08971d29b63 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -917,8 +917,9 @@ static int o2hb_thread(void *data)
917 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 917 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
918 918
919 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", 919 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
920 before_hb.tv_sec, before_hb.tv_usec, 920 before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
921 after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); 921 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
922 elapsed_msec);
922 923
923 if (elapsed_msec < reg->hr_timeout_ms) { 924 if (elapsed_msec < reg->hr_timeout_ms) {
924 /* the kthread api has blocked signals for us so no 925 /* the kthread api has blocked signals for us so no
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 35d92c01a972..d22d4cf08db1 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1285,14 +1285,16 @@ static void o2net_idle_timer(unsigned long data)
1285 mlog(ML_NOTICE, "here are some times that might help debug the " 1285 mlog(ML_NOTICE, "here are some times that might help debug the "
1286 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1286 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1287 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1287 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1288 sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, 1288 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
1289 now.tv_sec, now.tv_usec, 1289 now.tv_sec, (long) now.tv_usec,
1290 sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, 1290 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
1291 sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, 1291 sc->sc_tv_advance_start.tv_sec,
1292 sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, 1292 (long) sc->sc_tv_advance_start.tv_usec,
1293 sc->sc_tv_advance_stop.tv_sec,
1294 (long) sc->sc_tv_advance_stop.tv_usec,
1293 sc->sc_msg_key, sc->sc_msg_type, 1295 sc->sc_msg_key, sc->sc_msg_type,
1294 sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, 1296 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
1295 sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); 1297 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
1296 1298
1297 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 1299 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1298} 1300}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 3fecba0a6023..42eb53b5293b 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -657,6 +657,7 @@ void dlm_complete_thread(struct dlm_ctxt *dlm);
657int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); 657int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
658void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); 658void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
659void dlm_wait_for_recovery(struct dlm_ctxt *dlm); 659void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
660int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
660 661
661void dlm_put(struct dlm_ctxt *dlm); 662void dlm_put(struct dlm_ctxt *dlm);
662struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 663struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index da3c22045f89..6ee30837389c 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -573,8 +573,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
573 spin_lock(&dlm_domain_lock); 573 spin_lock(&dlm_domain_lock);
574 dlm = __dlm_lookup_domain_full(query->domain, query->name_len); 574 dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
575 /* Once the dlm ctxt is marked as leaving then we don't want 575 /* Once the dlm ctxt is marked as leaving then we don't want
576 * to be put in someone's domain map. */ 576 * to be put in someone's domain map.
577 * Also, explicitly disallow joining at certain troublesome
578 * times (ie. during recovery). */
577 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { 579 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
580 int bit = query->node_idx;
578 spin_lock(&dlm->spinlock); 581 spin_lock(&dlm->spinlock);
579 582
580 if (dlm->dlm_state == DLM_CTXT_NEW && 583 if (dlm->dlm_state == DLM_CTXT_NEW &&
@@ -586,6 +589,19 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
586 } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { 589 } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
587 /* Disallow parallel joins. */ 590 /* Disallow parallel joins. */
588 response = JOIN_DISALLOW; 591 response = JOIN_DISALLOW;
592 } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
593 mlog(ML_NOTICE, "node %u trying to join, but recovery "
594 "is ongoing.\n", bit);
595 response = JOIN_DISALLOW;
596 } else if (test_bit(bit, dlm->recovery_map)) {
597 mlog(ML_NOTICE, "node %u trying to join, but it "
598 "still needs recovery.\n", bit);
599 response = JOIN_DISALLOW;
600 } else if (test_bit(bit, dlm->domain_map)) {
601 mlog(ML_NOTICE, "node %u trying to join, but it "
602 "is still in the domain! needs recovery?\n",
603 bit);
604 response = JOIN_DISALLOW;
589 } else { 605 } else {
590 /* Alright we're fully a part of this domain 606 /* Alright we're fully a part of this domain
591 * so we keep some state as to who's joining 607 * so we keep some state as to who's joining
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 27e984f7e4cd..a3194fe173d9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1050,17 +1050,10 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1050 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1050 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1051 while (node >= 0) { 1051 while (node >= 0) {
1052 if (sc == NODE_UP) { 1052 if (sc == NODE_UP) {
1053 /* a node came up. easy. might not even need 1053 /* a node came up. clear any old vote from
1054 * to talk to it if its node number is higher 1054 * the response map and set it in the vote map
1055 * or if we are already blocked. */ 1055 * then restart the mastery. */
1056 mlog(0, "node up! %d\n", node); 1056 mlog(ML_NOTICE, "node %d up while restarting\n", node);
1057 if (blocked)
1058 goto next;
1059
1060 if (node > dlm->node_num) {
1061 mlog(0, "node > this node. skipping.\n");
1062 goto next;
1063 }
1064 1057
1065 /* redo the master request, but only for the new node */ 1058 /* redo the master request, but only for the new node */
1066 mlog(0, "sending request to new node\n"); 1059 mlog(0, "sending request to new node\n");
@@ -2005,6 +1998,15 @@ fail:
2005 break; 1998 break;
2006 1999
2007 mlog(0, "timed out during migration\n"); 2000 mlog(0, "timed out during migration\n");
2001 /* avoid hang during shutdown when migrating lockres
2002 * to a node which also goes down */
2003 if (dlm_is_node_dead(dlm, target)) {
2004 mlog(0, "%s:%.*s: expected migration target %u "
2005 "is no longer up. restarting.\n",
2006 dlm->name, res->lockname.len,
2007 res->lockname.name, target);
2008 ret = -ERESTARTSYS;
2009 }
2008 } 2010 }
2009 if (ret == -ERESTARTSYS) { 2011 if (ret == -ERESTARTSYS) {
2010 /* migration failed, detach and clean up mle */ 2012 /* migration failed, detach and clean up mle */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0c8eb1093f00..186e9a76aa58 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -39,6 +39,7 @@
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/timer.h> 40#include <linux/timer.h>
41#include <linux/kthread.h> 41#include <linux/kthread.h>
42#include <linux/delay.h>
42 43
43 44
44#include "cluster/heartbeat.h" 45#include "cluster/heartbeat.h"
@@ -256,6 +257,27 @@ static int dlm_recovery_thread(void *data)
256 return 0; 257 return 0;
257} 258}
258 259
260/* returns true when the recovery master has contacted us */
261static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
262{
263 int ready;
264 spin_lock(&dlm->spinlock);
265 ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
266 spin_unlock(&dlm->spinlock);
267 return ready;
268}
269
270/* returns true if node is no longer in the domain
271 * could be dead or just not joined */
272int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
273{
274 int dead;
275 spin_lock(&dlm->spinlock);
276 dead = test_bit(node, dlm->domain_map);
277 spin_unlock(&dlm->spinlock);
278 return dead;
279}
280
259/* callers of the top-level api calls (dlmlock/dlmunlock) should 281/* callers of the top-level api calls (dlmlock/dlmunlock) should
260 * block on the dlm->reco.event when recovery is in progress. 282 * block on the dlm->reco.event when recovery is in progress.
261 * the dlm recovery thread will set this state when it begins 283 * the dlm recovery thread will set this state when it begins
@@ -297,6 +319,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
297static int dlm_do_recovery(struct dlm_ctxt *dlm) 319static int dlm_do_recovery(struct dlm_ctxt *dlm)
298{ 320{
299 int status = 0; 321 int status = 0;
322 int ret;
300 323
301 spin_lock(&dlm->spinlock); 324 spin_lock(&dlm->spinlock);
302 325
@@ -343,10 +366,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
343 goto master_here; 366 goto master_here;
344 367
345 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { 368 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
346 /* choose a new master */ 369 /* choose a new master, returns 0 if this node
347 if (!dlm_pick_recovery_master(dlm)) { 370 * is the master, -EEXIST if it's another node.
371 * this does not return until a new master is chosen
372 * or recovery completes entirely. */
373 ret = dlm_pick_recovery_master(dlm);
374 if (!ret) {
348 /* already notified everyone. go. */ 375 /* already notified everyone. go. */
349 dlm->reco.new_master = dlm->node_num;
350 goto master_here; 376 goto master_here;
351 } 377 }
352 mlog(0, "another node will master this recovery session.\n"); 378 mlog(0, "another node will master this recovery session.\n");
@@ -371,8 +397,13 @@ master_here:
371 if (status < 0) { 397 if (status < 0) {
372 mlog(ML_ERROR, "error %d remastering locks for node %u, " 398 mlog(ML_ERROR, "error %d remastering locks for node %u, "
373 "retrying.\n", status, dlm->reco.dead_node); 399 "retrying.\n", status, dlm->reco.dead_node);
400 /* yield a bit to allow any final network messages
401 * to get handled on remaining nodes */
402 msleep(100);
374 } else { 403 } else {
375 /* success! see if any other nodes need recovery */ 404 /* success! see if any other nodes need recovery */
405 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
406 dlm->name, dlm->reco.dead_node, dlm->node_num);
376 dlm_reset_recovery(dlm); 407 dlm_reset_recovery(dlm);
377 } 408 }
378 dlm_end_recovery(dlm); 409 dlm_end_recovery(dlm);
@@ -477,7 +508,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
477 BUG(); 508 BUG();
478 break; 509 break;
479 case DLM_RECO_NODE_DATA_DEAD: 510 case DLM_RECO_NODE_DATA_DEAD:
480 mlog(0, "node %u died after " 511 mlog(ML_NOTICE, "node %u died after "
481 "requesting recovery info for " 512 "requesting recovery info for "
482 "node %u\n", ndata->node_num, 513 "node %u\n", ndata->node_num,
483 dead_node); 514 dead_node);
@@ -485,6 +516,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
485 // start all over 516 // start all over
486 destroy = 1; 517 destroy = 1;
487 status = -EAGAIN; 518 status = -EAGAIN;
519 /* instead of spinning like crazy here,
520 * wait for the domain map to catch up
521 * with the network state. otherwise this
522 * can be hit hundreds of times before
523 * the node is really seen as dead. */
524 wait_event_timeout(dlm->dlm_reco_thread_wq,
525 dlm_is_node_dead(dlm,
526 ndata->node_num),
527 msecs_to_jiffies(1000));
528 mlog(0, "waited 1 sec for %u, "
529 "dead? %s\n", ndata->node_num,
530 dlm_is_node_dead(dlm, ndata->node_num) ?
531 "yes" : "no");
488 goto leave; 532 goto leave;
489 case DLM_RECO_NODE_DATA_RECEIVING: 533 case DLM_RECO_NODE_DATA_RECEIVING:
490 case DLM_RECO_NODE_DATA_REQUESTED: 534 case DLM_RECO_NODE_DATA_REQUESTED:
@@ -678,11 +722,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
678 dlm = item->dlm; 722 dlm = item->dlm;
679 dead_node = item->u.ral.dead_node; 723 dead_node = item->u.ral.dead_node;
680 reco_master = item->u.ral.reco_master; 724 reco_master = item->u.ral.reco_master;
725 mres = (struct dlm_migratable_lockres *)data;
726
727 if (dead_node != dlm->reco.dead_node ||
728 reco_master != dlm->reco.new_master) {
729 /* show extra debug info if the recovery state is messed */
730 mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
731 "request(dead=%u, master=%u)\n",
732 dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
733 dead_node, reco_master);
734 mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
735 "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
736 dlm->name, mres->lockname_len, mres->lockname, mres->master,
737 mres->num_locks, mres->total_locks, mres->flags,
738 mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags,
739 mres->ml[0].type, mres->ml[0].convert_type,
740 mres->ml[0].highest_blocked, mres->ml[0].node);
741 BUG();
742 }
681 BUG_ON(dead_node != dlm->reco.dead_node); 743 BUG_ON(dead_node != dlm->reco.dead_node);
682 BUG_ON(reco_master != dlm->reco.new_master); 744 BUG_ON(reco_master != dlm->reco.new_master);
683 745
684 mres = (struct dlm_migratable_lockres *)data;
685
686 /* lock resources should have already been moved to the 746 /* lock resources should have already been moved to the
687 * dlm->reco.resources list. now move items from that list 747 * dlm->reco.resources list. now move items from that list
688 * to a temp list if the dead owner matches. note that the 748 * to a temp list if the dead owner matches. note that the
@@ -757,15 +817,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
757 continue; 817 continue;
758 818
759 switch (ndata->state) { 819 switch (ndata->state) {
820 /* should have moved beyond INIT but not to FINALIZE yet */
760 case DLM_RECO_NODE_DATA_INIT: 821 case DLM_RECO_NODE_DATA_INIT:
761 case DLM_RECO_NODE_DATA_DEAD: 822 case DLM_RECO_NODE_DATA_DEAD:
762 case DLM_RECO_NODE_DATA_DONE:
763 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 823 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
764 mlog(ML_ERROR, "bad ndata state for node %u:" 824 mlog(ML_ERROR, "bad ndata state for node %u:"
765 " state=%d\n", ndata->node_num, 825 " state=%d\n", ndata->node_num,
766 ndata->state); 826 ndata->state);
767 BUG(); 827 BUG();
768 break; 828 break;
829 /* these states are possible at this point, anywhere along
830 * the line of recovery */
831 case DLM_RECO_NODE_DATA_DONE:
769 case DLM_RECO_NODE_DATA_RECEIVING: 832 case DLM_RECO_NODE_DATA_RECEIVING:
770 case DLM_RECO_NODE_DATA_REQUESTED: 833 case DLM_RECO_NODE_DATA_REQUESTED:
771 case DLM_RECO_NODE_DATA_REQUESTING: 834 case DLM_RECO_NODE_DATA_REQUESTING:
@@ -799,13 +862,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
799{ 862{
800 struct dlm_lock_resource *res; 863 struct dlm_lock_resource *res;
801 struct list_head *iter, *iter2; 864 struct list_head *iter, *iter2;
865 struct dlm_lock *lock;
802 866
803 spin_lock(&dlm->spinlock); 867 spin_lock(&dlm->spinlock);
804 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 868 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
805 res = list_entry (iter, struct dlm_lock_resource, recovering); 869 res = list_entry (iter, struct dlm_lock_resource, recovering);
870 /* always prune any $RECOVERY entries for dead nodes,
871 * otherwise hangs can occur during later recovery */
806 if (dlm_is_recovery_lock(res->lockname.name, 872 if (dlm_is_recovery_lock(res->lockname.name,
807 res->lockname.len)) 873 res->lockname.len)) {
874 spin_lock(&res->spinlock);
875 list_for_each_entry(lock, &res->granted, list) {
876 if (lock->ml.node == dead_node) {
877 mlog(0, "AHA! there was "
878 "a $RECOVERY lock for dead "
879 "node %u (%s)!\n",
880 dead_node, dlm->name);
881 list_del_init(&lock->list);
882 dlm_lock_put(lock);
883 break;
884 }
885 }
886 spin_unlock(&res->spinlock);
808 continue; 887 continue;
888 }
889
809 if (res->owner == dead_node) { 890 if (res->owner == dead_node) {
810 mlog(0, "found lockres owned by dead node while " 891 mlog(0, "found lockres owned by dead node while "
811 "doing recovery for node %u. sending it.\n", 892 "doing recovery for node %u. sending it.\n",
@@ -1179,7 +1260,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
1179again: 1260again:
1180 ret = dlm_lockres_master_requery(dlm, res, &real_master); 1261 ret = dlm_lockres_master_requery(dlm, res, &real_master);
1181 if (ret < 0) { 1262 if (ret < 0) {
1182 mlog(0, "dlm_lockres_master_requery failure: %d\n", 1263 mlog(0, "dlm_lockres_master_requery ret=%d\n",
1183 ret); 1264 ret);
1184 goto again; 1265 goto again;
1185 } 1266 }
@@ -1757,6 +1838,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1757 struct dlm_lock_resource *res; 1838 struct dlm_lock_resource *res;
1758 int i; 1839 int i;
1759 struct list_head *bucket; 1840 struct list_head *bucket;
1841 struct dlm_lock *lock;
1760 1842
1761 1843
1762 /* purge any stale mles */ 1844 /* purge any stale mles */
@@ -1780,10 +1862,25 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1780 bucket = &(dlm->resources[i]); 1862 bucket = &(dlm->resources[i]);
1781 list_for_each(iter, bucket) { 1863 list_for_each(iter, bucket) {
1782 res = list_entry (iter, struct dlm_lock_resource, list); 1864 res = list_entry (iter, struct dlm_lock_resource, list);
1865 /* always prune any $RECOVERY entries for dead nodes,
1866 * otherwise hangs can occur during later recovery */
1783 if (dlm_is_recovery_lock(res->lockname.name, 1867 if (dlm_is_recovery_lock(res->lockname.name,
1784 res->lockname.len)) 1868 res->lockname.len)) {
1869 spin_lock(&res->spinlock);
1870 list_for_each_entry(lock, &res->granted, list) {
1871 if (lock->ml.node == dead_node) {
1872 mlog(0, "AHA! there was "
1873 "a $RECOVERY lock for dead "
1874 "node %u (%s)!\n",
1875 dead_node, dlm->name);
1876 list_del_init(&lock->list);
1877 dlm_lock_put(lock);
1878 break;
1879 }
1880 }
1881 spin_unlock(&res->spinlock);
1785 continue; 1882 continue;
1786 1883 }
1787 spin_lock(&res->spinlock); 1884 spin_lock(&res->spinlock);
1788 /* zero the lvb if necessary */ 1885 /* zero the lvb if necessary */
1789 dlm_revalidate_lvb(dlm, res, dead_node); 1886 dlm_revalidate_lvb(dlm, res, dead_node);
@@ -1869,12 +1966,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
1869 return; 1966 return;
1870 1967
1871 spin_lock(&dlm->spinlock); 1968 spin_lock(&dlm->spinlock);
1872
1873 set_bit(idx, dlm->live_nodes_map); 1969 set_bit(idx, dlm->live_nodes_map);
1874 1970 /* do NOT notify mle attached to the heartbeat events.
1875 /* notify any mles attached to the heartbeat events */ 1971 * new nodes are not interesting in mastery until joined. */
1876 dlm_hb_event_notify_attached(dlm, idx, 1);
1877
1878 spin_unlock(&dlm->spinlock); 1972 spin_unlock(&dlm->spinlock);
1879 1973
1880 dlm_put(dlm); 1974 dlm_put(dlm);
@@ -1897,7 +1991,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
1897 mlog(0, "unlockast for recovery lock fired!\n"); 1991 mlog(0, "unlockast for recovery lock fired!\n");
1898} 1992}
1899 1993
1900 1994/*
1995 * dlm_pick_recovery_master will continually attempt to use
1996 * dlmlock() on the special "$RECOVERY" lockres with the
1997 * LKM_NOQUEUE flag to get an EX. every thread that enters
1998 * this function on each node racing to become the recovery
1999 * master will not stop attempting this until either:
2000 * a) this node gets the EX (and becomes the recovery master),
2001 * or b) dlm->reco.new_master gets set to some nodenum
2002 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2003 * so each time a recovery master is needed, the entire cluster
2004 * will sync at this point. if the new master dies, that will
2005 * be detected in dlm_do_recovery */
1901static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) 2006static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1902{ 2007{
1903 enum dlm_status ret; 2008 enum dlm_status ret;
@@ -1906,23 +2011,45 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1906 2011
1907 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", 2012 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
1908 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); 2013 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
1909retry: 2014again:
1910 memset(&lksb, 0, sizeof(lksb)); 2015 memset(&lksb, 0, sizeof(lksb));
1911 2016
1912 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2017 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
1913 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); 2018 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
1914 2019
2020 mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
2021 dlm->name, ret, lksb.status);
2022
1915 if (ret == DLM_NORMAL) { 2023 if (ret == DLM_NORMAL) {
1916 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", 2024 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
1917 dlm->name, dlm->node_num); 2025 dlm->name, dlm->node_num);
1918 /* I am master, send message to all nodes saying 2026
1919 * that I am beginning a recovery session */ 2027 /* got the EX lock. check to see if another node
1920 status = dlm_send_begin_reco_message(dlm, 2028 * just became the reco master */
1921 dlm->reco.dead_node); 2029 if (dlm_reco_master_ready(dlm)) {
2030 mlog(0, "%s: got reco EX lock, but %u will "
2031 "do the recovery\n", dlm->name,
2032 dlm->reco.new_master);
2033 status = -EEXIST;
2034 } else {
2035 status = dlm_send_begin_reco_message(dlm,
2036 dlm->reco.dead_node);
2037 /* this always succeeds */
2038 BUG_ON(status);
2039
2040 /* set the new_master to this node */
2041 spin_lock(&dlm->spinlock);
2042 dlm->reco.new_master = dlm->node_num;
2043 spin_unlock(&dlm->spinlock);
2044 }
1922 2045
1923 /* recovery lock is a special case. ast will not get fired, 2046 /* recovery lock is a special case. ast will not get fired,
1924 * so just go ahead and unlock it. */ 2047 * so just go ahead and unlock it. */
1925 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); 2048 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
2049 if (ret == DLM_DENIED) {
2050 mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
2051 ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
2052 }
1926 if (ret != DLM_NORMAL) { 2053 if (ret != DLM_NORMAL) {
1927 /* this would really suck. this could only happen 2054 /* this would really suck. this could only happen
1928 * if there was a network error during the unlock 2055 * if there was a network error during the unlock
@@ -1930,20 +2057,42 @@ retry:
1930 * is actually "done" and the lock structure is 2057 * is actually "done" and the lock structure is
1931 * even freed. we can continue, but only 2058 * even freed. we can continue, but only
1932 * because this specific lock name is special. */ 2059 * because this specific lock name is special. */
1933 mlog(0, "dlmunlock returned %d\n", ret); 2060 mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
1934 }
1935
1936 if (status < 0) {
1937 mlog(0, "failed to send recovery message. "
1938 "must retry with new node map.\n");
1939 goto retry;
1940 } 2061 }
1941 } else if (ret == DLM_NOTQUEUED) { 2062 } else if (ret == DLM_NOTQUEUED) {
1942 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", 2063 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
1943 dlm->name, dlm->node_num); 2064 dlm->name, dlm->node_num);
1944 /* another node is master. wait on 2065 /* another node is master. wait on
1945 * reco.new_master != O2NM_INVALID_NODE_NUM */ 2066 * reco.new_master != O2NM_INVALID_NODE_NUM
2067 * for at most one second */
2068 wait_event_timeout(dlm->dlm_reco_thread_wq,
2069 dlm_reco_master_ready(dlm),
2070 msecs_to_jiffies(1000));
2071 if (!dlm_reco_master_ready(dlm)) {
2072 mlog(0, "%s: reco master taking awhile\n",
2073 dlm->name);
2074 goto again;
2075 }
2076 /* another node has informed this one that it is reco master */
2077 mlog(0, "%s: reco master %u is ready to recover %u\n",
2078 dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
1946 status = -EEXIST; 2079 status = -EEXIST;
2080 } else {
2081 struct dlm_lock_resource *res;
2082
2083 /* dlmlock returned something other than NOTQUEUED or NORMAL */
2084 mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
2085 "lksb.status=%s\n", dlm->name, dlm_errname(ret),
2086 dlm_errname(lksb.status));
2087 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2088 DLM_RECOVERY_LOCK_NAME_LEN);
2089 if (res) {
2090 dlm_print_one_lock_resource(res);
2091 dlm_lockres_put(res);
2092 } else {
2093 mlog(ML_ERROR, "recovery lock not found\n");
2094 }
2095 BUG();
1947 } 2096 }
1948 2097
1949 return status; 2098 return status;
@@ -1982,7 +2131,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1982 mlog(0, "not sending begin reco to self\n"); 2131 mlog(0, "not sending begin reco to self\n");
1983 continue; 2132 continue;
1984 } 2133 }
1985 2134retry:
1986 ret = -EINVAL; 2135 ret = -EINVAL;
1987 mlog(0, "attempting to send begin reco msg to %d\n", 2136 mlog(0, "attempting to send begin reco msg to %d\n",
1988 nodenum); 2137 nodenum);
@@ -1991,8 +2140,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1991 /* negative status is handled ok by caller here */ 2140 /* negative status is handled ok by caller here */
1992 if (ret >= 0) 2141 if (ret >= 0)
1993 ret = status; 2142 ret = status;
2143 if (dlm_is_host_down(ret)) {
2144 /* node is down. not involved in recovery
2145 * so just keep going */
2146 mlog(0, "%s: node %u was down when sending "
2147 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2148 ret = 0;
2149 }
1994 if (ret < 0) { 2150 if (ret < 0) {
1995 struct dlm_lock_resource *res; 2151 struct dlm_lock_resource *res;
2152 /* this is now a serious problem, possibly ENOMEM
2153 * in the network stack. must retry */
1996 mlog_errno(ret); 2154 mlog_errno(ret);
1997 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2155 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
1998 " returned %d\n", dlm->name, nodenum, ret); 2156 " returned %d\n", dlm->name, nodenum, ret);
@@ -2004,7 +2162,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
2004 } else { 2162 } else {
2005 mlog(ML_ERROR, "recovery lock not found\n"); 2163 mlog(ML_ERROR, "recovery lock not found\n");
2006 } 2164 }
2007 break; 2165 /* sleep for a bit in hopes that we can avoid
2166 * another ENOMEM */
2167 msleep(100);
2168 goto retry;
2008 } 2169 }
2009 } 2170 }
2010 2171
@@ -2027,19 +2188,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2027 2188
2028 spin_lock(&dlm->spinlock); 2189 spin_lock(&dlm->spinlock);
2029 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { 2190 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2030 mlog(0, "new_master already set to %u!\n", 2191 if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
2031 dlm->reco.new_master); 2192 mlog(0, "%s: new_master %u died, changing "
2193 "to %u\n", dlm->name, dlm->reco.new_master,
2194 br->node_idx);
2195 } else {
2196 mlog(0, "%s: new_master %u NOT DEAD, changing "
2197 "to %u\n", dlm->name, dlm->reco.new_master,
2198 br->node_idx);
2199 /* may not have seen the new master as dead yet */
2200 }
2032 } 2201 }
2033 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { 2202 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2034 mlog(0, "dead_node already set to %u!\n", 2203 mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2035 dlm->reco.dead_node); 2204 "node %u changing it to %u\n", dlm->name,
2205 dlm->reco.dead_node, br->node_idx, br->dead_node);
2036 } 2206 }
2037 dlm->reco.new_master = br->node_idx; 2207 dlm->reco.new_master = br->node_idx;
2038 dlm->reco.dead_node = br->dead_node; 2208 dlm->reco.dead_node = br->dead_node;
2039 if (!test_bit(br->dead_node, dlm->recovery_map)) { 2209 if (!test_bit(br->dead_node, dlm->recovery_map)) {
2040 mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " 2210 mlog(0, "recovery master %u sees %u as dead, but this "
2041 "node has not yet. marking %u as dead\n", 2211 "node has not yet. marking %u as dead\n",
2042 br->node_idx, br->dead_node, br->dead_node); 2212 br->node_idx, br->dead_node, br->dead_node);
2213 if (!test_bit(br->dead_node, dlm->domain_map) ||
2214 !test_bit(br->dead_node, dlm->live_nodes_map))
2215 mlog(0, "%u not in domain/live_nodes map "
2216 "so setting it in reco map manually\n",
2217 br->dead_node);
2218 set_bit(br->dead_node, dlm->recovery_map);
2043 __dlm_hb_node_down(dlm, br->dead_node); 2219 __dlm_hb_node_down(dlm, br->dead_node);
2044 } 2220 }
2045 spin_unlock(&dlm->spinlock); 2221 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index cec2ce1cd318..c95f08d2e925 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -188,6 +188,19 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK| 188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
189 DLM_UNLOCK_REGRANT_LOCK| 189 DLM_UNLOCK_REGRANT_LOCK|
190 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 190 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
191 } else if (status == DLM_RECOVERING ||
192 status == DLM_MIGRATING ||
193 status == DLM_FORWARD) {
194 /* must clear the actions because this unlock
195 * is about to be retried. cannot free or do
196 * any list manipulation. */
197 mlog(0, "%s:%.*s: clearing actions, %s\n",
198 dlm->name, res->lockname.len,
199 res->lockname.name,
200 status==DLM_RECOVERING?"recovering":
201 (status==DLM_MIGRATING?"migrating":
202 "forward"));
203 actions = 0;
191 } 204 }
192 if (flags & LKM_CANCEL) 205 if (flags & LKM_CANCEL)
193 lock->cancel_pending = 0; 206 lock->cancel_pending = 0;
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index e1fdd288796e..c3764f4744ee 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -27,7 +27,7 @@
27 * Boston, MA 021110-1307, USA. 27 * Boston, MA 021110-1307, USA.
28 */ 28 */
29 29
30#include <asm/signal.h> 30#include <linux/signal.h>
31 31
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/fs.h> 33#include <linux/fs.h>
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2fb40cd296a..b6ba292e9544 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -262,8 +262,7 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
262 el = &eb->h_list; 262 el = &eb->h_list;
263 } 263 }
264 264
265 if (el->l_tree_depth) 265 BUG_ON(el->l_tree_depth);
266 BUG();
267 266
268 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 267 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 rec = &el->l_recs[i]; 268 rec = &el->l_recs[i];
@@ -364,8 +363,8 @@ static int ocfs2_extent_map_lookup_read(struct inode *inode,
364 return ret; 363 return ret;
365 } 364 }
366 365
367 if (ent->e_tree_depth) 366 /* FIXME: Make sure this isn't a corruption */
368 BUG(); /* FIXME: Make sure this isn't a corruption */ 367 BUG_ON(ent->e_tree_depth);
369 368
370 *ret_ent = ent; 369 *ret_ent = ent;
371 370
@@ -423,8 +422,7 @@ static int ocfs2_extent_map_try_insert(struct inode *inode,
423 le32_to_cpu(rec->e_clusters), NULL, 422 le32_to_cpu(rec->e_clusters), NULL,
424 NULL); 423 NULL);
425 424
426 if (!old_ent) 425 BUG_ON(!old_ent);
427 BUG();
428 426
429 ret = -EEXIST; 427 ret = -EEXIST;
430 if (old_ent->e_tree_depth < tree_depth) 428 if (old_ent->e_tree_depth < tree_depth)
@@ -988,7 +986,7 @@ int __init init_ocfs2_extent_maps(void)
988 return 0; 986 return 0;
989} 987}
990 988
991void __exit exit_ocfs2_extent_maps(void) 989void exit_ocfs2_extent_maps(void)
992{ 990{
993 kmem_cache_destroy(ocfs2_em_ent_cachep); 991 kmem_cache_destroy(ocfs2_em_ent_cachep);
994} 992}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index eaf33caa0a1f..1715bc90e705 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1022,8 +1022,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1022 } 1022 }
1023 newsize = count + saved_pos; 1023 newsize = count + saved_pos;
1024 1024
1025 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", 1025 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1026 saved_pos, newsize, i_size_read(inode)); 1026 (long long) saved_pos, (long long) newsize,
1027 (long long) i_size_read(inode));
1027 1028
1028 /* No need for a higher level metadata lock if we're 1029 /* No need for a higher level metadata lock if we're
1029 * never going past i_size. */ 1030 * never going past i_size. */
@@ -1042,8 +1043,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1042 spin_unlock(&OCFS2_I(inode)->ip_lock); 1043 spin_unlock(&OCFS2_I(inode)->ip_lock);
1043 1044
1044 mlog(0, "Writing at EOF, may need more allocation: " 1045 mlog(0, "Writing at EOF, may need more allocation: "
1045 "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", 1046 "i_size = %lld, newsize = %lld, need %u clusters\n",
1046 i_size_read(inode), newsize, clusters); 1047 (long long) i_size_read(inode), (long long) newsize,
1048 clusters);
1047 1049
1048 /* We only want to continue the rest of this loop if 1050 /* We only want to continue the rest of this loop if
1049 * our extend will actually require more 1051 * our extend will actually require more
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d4ecc0627716..8122489c5762 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -903,10 +903,10 @@ void ocfs2_clear_inode(struct inode *inode)
903 "Clear inode of %"MLFu64", inode is locked\n", 903 "Clear inode of %"MLFu64", inode is locked\n",
904 oi->ip_blkno); 904 oi->ip_blkno);
905 905
906 mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), 906 mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
907 "Clear inode of %"MLFu64", io_sem is locked\n", 907 "Clear inode of %"MLFu64", io_mutex is locked\n",
908 oi->ip_blkno); 908 oi->ip_blkno);
909 up(&oi->ip_io_sem); 909 mutex_unlock(&oi->ip_io_mutex);
910 910
911 /* 911 /*
912 * down_trylock() returns 0, down_write_trylock() returns 1 912 * down_trylock() returns 0, down_write_trylock() returns 1
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9b0177433653..84c507961287 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,10 +46,10 @@ struct ocfs2_inode_info
46 struct list_head ip_io_markers; 46 struct list_head ip_io_markers;
47 int ip_orphaned_slot; 47 int ip_orphaned_slot;
48 48
49 struct semaphore ip_io_sem; 49 struct mutex ip_io_mutex;
50 50
51 /* Used by the journalling code to attach an inode to a 51 /* Used by the journalling code to attach an inode to a
52 * handle. These are protected by ip_io_sem in order to lock 52 * handle. These are protected by ip_io_mutex in order to lock
53 * out other I/O to the inode until we either commit or 53 * out other I/O to the inode until we either commit or
54 * abort. */ 54 * abort. */
55 struct list_head ip_handle_list; 55 struct list_head ip_handle_list;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 303c8d96457f..fa0bcac5ceae 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -147,8 +147,7 @@ struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
147 147
148 mlog_entry("(max_buffs = %d)\n", max_buffs); 148 mlog_entry("(max_buffs = %d)\n", max_buffs);
149 149
150 if (!osb || !osb->journal->j_journal) 150 BUG_ON(!osb || !osb->journal->j_journal);
151 BUG();
152 151
153 if (ocfs2_is_hard_readonly(osb)) { 152 if (ocfs2_is_hard_readonly(osb)) {
154 ret = -EROFS; 153 ret = -EROFS;
@@ -401,7 +400,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
401 * j_trans_barrier for us. */ 400 * j_trans_barrier for us. */
402 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); 401 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
403 402
404 down(&OCFS2_I(inode)->ip_io_sem); 403 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
405 switch (type) { 404 switch (type) {
406 case OCFS2_JOURNAL_ACCESS_CREATE: 405 case OCFS2_JOURNAL_ACCESS_CREATE:
407 case OCFS2_JOURNAL_ACCESS_WRITE: 406 case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -416,7 +415,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
416 status = -EINVAL; 415 status = -EINVAL;
417 mlog(ML_ERROR, "Uknown access type!\n"); 416 mlog(ML_ERROR, "Uknown access type!\n");
418 } 417 }
419 up(&OCFS2_I(inode)->ip_io_sem); 418 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
420 419
421 if (status < 0) 420 if (status < 0)
422 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 421 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
@@ -561,7 +560,11 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
561 SET_INODE_JOURNAL(inode); 560 SET_INODE_JOURNAL(inode);
562 OCFS2_I(inode)->ip_open_count++; 561 OCFS2_I(inode)->ip_open_count++;
563 562
564 status = ocfs2_meta_lock(inode, NULL, &bh, 1); 563 /* Skip recovery waits here - journal inode metadata never
564 * changes in a live cluster so it can be considered an
565 * exception to the rule. */
566 status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
567 OCFS2_META_LOCK_RECOVERY);
565 if (status < 0) { 568 if (status < 0) {
566 if (status != -ERESTARTSYS) 569 if (status != -ERESTARTSYS)
567 mlog(ML_ERROR, "Could not get lock on journal!\n"); 570 mlog(ML_ERROR, "Could not get lock on journal!\n");
@@ -672,8 +675,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
672 675
673 mlog_entry_void(); 676 mlog_entry_void();
674 677
675 if (!osb) 678 BUG_ON(!osb);
676 BUG();
677 679
678 journal = osb->journal; 680 journal = osb->journal;
679 if (!journal) 681 if (!journal)
@@ -805,8 +807,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
805 807
806 mlog_entry_void(); 808 mlog_entry_void();
807 809
808 if (!journal) 810 BUG_ON(!journal);
809 BUG();
810 811
811 status = journal_wipe(journal->j_journal, full); 812 status = journal_wipe(journal->j_journal, full);
812 if (status < 0) { 813 if (status < 0) {
@@ -1072,10 +1073,10 @@ restart:
1072 NULL); 1073 NULL);
1073 1074
1074bail: 1075bail:
1075 down(&osb->recovery_lock); 1076 mutex_lock(&osb->recovery_lock);
1076 if (!status && 1077 if (!status &&
1077 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 1078 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1078 up(&osb->recovery_lock); 1079 mutex_unlock(&osb->recovery_lock);
1079 goto restart; 1080 goto restart;
1080 } 1081 }
1081 1082
@@ -1083,7 +1084,7 @@ bail:
1083 mb(); /* sync with ocfs2_recovery_thread_running */ 1084 mb(); /* sync with ocfs2_recovery_thread_running */
1084 wake_up(&osb->recovery_event); 1085 wake_up(&osb->recovery_event);
1085 1086
1086 up(&osb->recovery_lock); 1087 mutex_unlock(&osb->recovery_lock);
1087 1088
1088 mlog_exit(status); 1089 mlog_exit(status);
1089 /* no one is callint kthread_stop() for us so the kthread() api 1090 /* no one is callint kthread_stop() for us so the kthread() api
@@ -1098,7 +1099,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1098 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1099 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1099 node_num, osb->node_num); 1100 node_num, osb->node_num);
1100 1101
1101 down(&osb->recovery_lock); 1102 mutex_lock(&osb->recovery_lock);
1102 if (osb->disable_recovery) 1103 if (osb->disable_recovery)
1103 goto out; 1104 goto out;
1104 1105
@@ -1120,7 +1121,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1120 } 1121 }
1121 1122
1122out: 1123out:
1123 up(&osb->recovery_lock); 1124 mutex_unlock(&osb->recovery_lock);
1124 wake_up(&osb->recovery_event); 1125 wake_up(&osb->recovery_event);
1125 1126
1126 mlog_exit_void(); 1127 mlog_exit_void();
@@ -1271,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1271 1272
1272 /* Should not ever be called to recover ourselves -- in that 1273 /* Should not ever be called to recover ourselves -- in that
1273 * case we should've called ocfs2_journal_load instead. */ 1274 * case we should've called ocfs2_journal_load instead. */
1274 if (osb->node_num == node_num) 1275 BUG_ON(osb->node_num == node_num);
1275 BUG();
1276 1276
1277 slot_num = ocfs2_node_num_to_slot(si, node_num); 1277 slot_num = ocfs2_node_num_to_slot(si, node_num);
1278 if (slot_num == OCFS2_INVALID_SLOT) { 1278 if (slot_num == OCFS2_INVALID_SLOT) {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f468c600cf92..8d8e4779df92 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -33,6 +33,7 @@
33#include <linux/rbtree.h> 33#include <linux/rbtree.h>
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h>
36 37
37#include "cluster/nodemanager.h" 38#include "cluster/nodemanager.h"
38#include "cluster/heartbeat.h" 39#include "cluster/heartbeat.h"
@@ -233,7 +234,7 @@ struct ocfs2_super
233 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */ 234 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
234 235
235 atomic_t vol_state; 236 atomic_t vol_state;
236 struct semaphore recovery_lock; 237 struct mutex recovery_lock;
237 struct task_struct *recovery_thread_task; 238 struct task_struct *recovery_thread_task;
238 int disable_recovery; 239 int disable_recovery;
239 wait_queue_head_t checkpoint_event; 240 wait_queue_head_t checkpoint_event;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 364d64bd5f10..046824b6b625 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -932,7 +932,7 @@ static void ocfs2_inode_init_once(void *data,
932 oi->ip_dir_start_lookup = 0; 932 oi->ip_dir_start_lookup = 0;
933 933
934 init_rwsem(&oi->ip_alloc_sem); 934 init_rwsem(&oi->ip_alloc_sem);
935 init_MUTEX(&(oi->ip_io_sem)); 935 mutex_init(&oi->ip_io_mutex);
936 936
937 oi->ip_blkno = 0ULL; 937 oi->ip_blkno = 0ULL;
938 oi->ip_clusters = 0; 938 oi->ip_clusters = 0;
@@ -1137,9 +1137,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1137 1137
1138 /* disable any new recovery threads and wait for any currently 1138 /* disable any new recovery threads and wait for any currently
1139 * running ones to exit. Do this before setting the vol_state. */ 1139 * running ones to exit. Do this before setting the vol_state. */
1140 down(&osb->recovery_lock); 1140 mutex_lock(&osb->recovery_lock);
1141 osb->disable_recovery = 1; 1141 osb->disable_recovery = 1;
1142 up(&osb->recovery_lock); 1142 mutex_unlock(&osb->recovery_lock);
1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); 1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1144 1144
1145 /* At this point, we know that no more recovery threads can be 1145 /* At this point, we know that no more recovery threads can be
@@ -1254,8 +1254,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1254 osb->sb = sb; 1254 osb->sb = sb;
1255 /* Save off for ocfs2_rw_direct */ 1255 /* Save off for ocfs2_rw_direct */
1256 osb->s_sectsize_bits = blksize_bits(sector_size); 1256 osb->s_sectsize_bits = blksize_bits(sector_size);
1257 if (!osb->s_sectsize_bits) 1257 BUG_ON(!osb->s_sectsize_bits);
1258 BUG();
1259 1258
1260 osb->net_response_ids = 0; 1259 osb->net_response_ids = 0;
1261 spin_lock_init(&osb->net_response_lock); 1260 spin_lock_init(&osb->net_response_lock);
@@ -1283,7 +1282,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1283 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 1282 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1284 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1283 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1285 1284
1286 init_MUTEX(&osb->recovery_lock); 1285 mutex_init(&osb->recovery_lock);
1287 1286
1288 osb->disable_recovery = 0; 1287 osb->disable_recovery = 0;
1289 osb->recovery_thread_task = NULL; 1288 osb->recovery_thread_task = NULL;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 600a8bc5b541..fc29cb7a437d 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -77,8 +77,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
77 if (arr && ((inode = *arr) != NULL)) { 77 if (arr && ((inode = *arr) != NULL)) {
78 /* get a ref in addition to the array ref */ 78 /* get a ref in addition to the array ref */
79 inode = igrab(inode); 79 inode = igrab(inode);
80 if (!inode) 80 BUG_ON(!inode);
81 BUG();
82 81
83 return inode; 82 return inode;
84 } 83 }
@@ -89,8 +88,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
89 /* add one more if putting into array for first time */ 88 /* add one more if putting into array for first time */
90 if (arr && inode) { 89 if (arr && inode) {
91 *arr = igrab(inode); 90 *arr = igrab(inode);
92 if (!*arr) 91 BUG_ON(!*arr);
93 BUG();
94 } 92 }
95 return inode; 93 return inode;
96} 94}
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 3a0458fd3e1b..300b5bedfb21 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -388,7 +388,7 @@ out_free:
388 } 388 }
389} 389}
390 390
391/* Item insertion is guarded by ip_io_sem, so the insertion path takes 391/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
392 * advantage of this by not rechecking for a duplicate insert during 392 * advantage of this by not rechecking for a duplicate insert during
393 * the slow case. Additionally, if the cache needs to be bumped up to 393 * the slow case. Additionally, if the cache needs to be bumped up to
394 * a tree, the code will not recheck after acquiring the lock -- 394 * a tree, the code will not recheck after acquiring the lock --
@@ -418,7 +418,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
418 (unsigned long long) bh->b_blocknr); 418 (unsigned long long) bh->b_blocknr);
419 419
420 /* No need to recheck under spinlock - insertion is guarded by 420 /* No need to recheck under spinlock - insertion is guarded by
421 * ip_io_sem */ 421 * ip_io_mutex */
422 spin_lock(&oi->ip_lock); 422 spin_lock(&oi->ip_lock);
423 if (ocfs2_insert_can_use_array(oi, ci)) { 423 if (ocfs2_insert_can_use_array(oi, ci)) {
424 /* Fast case - it's an array and there's a free 424 /* Fast case - it's an array and there's a free
@@ -440,7 +440,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
440 440
441/* Called against a newly allocated buffer. Most likely nobody should 441/* Called against a newly allocated buffer. Most likely nobody should
442 * be able to read this sort of metadata while it's still being 442 * be able to read this sort of metadata while it's still being
443 * allocated, but this is careful to take ip_io_sem anyway. */ 443 * allocated, but this is careful to take ip_io_mutex anyway. */
444void ocfs2_set_new_buffer_uptodate(struct inode *inode, 444void ocfs2_set_new_buffer_uptodate(struct inode *inode,
445 struct buffer_head *bh) 445 struct buffer_head *bh)
446{ 446{
@@ -451,9 +451,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
451 451
452 set_buffer_uptodate(bh); 452 set_buffer_uptodate(bh);
453 453
454 down(&oi->ip_io_sem); 454 mutex_lock(&oi->ip_io_mutex);
455 ocfs2_set_buffer_uptodate(inode, bh); 455 ocfs2_set_buffer_uptodate(inode, bh);
456 up(&oi->ip_io_sem); 456 mutex_unlock(&oi->ip_io_mutex);
457} 457}
458 458
459/* Requires ip_lock. */ 459/* Requires ip_lock. */
@@ -537,7 +537,7 @@ int __init init_ocfs2_uptodate_cache(void)
537 return 0; 537 return 0;
538} 538}
539 539
540void __exit exit_ocfs2_uptodate_cache(void) 540void exit_ocfs2_uptodate_cache(void)
541{ 541{
542 if (ocfs2_uptodate_cachep) 542 if (ocfs2_uptodate_cachep)
543 kmem_cache_destroy(ocfs2_uptodate_cachep); 543 kmem_cache_destroy(ocfs2_uptodate_cachep);
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index e5aacdf4eabf..01cd32d26b06 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -27,7 +27,7 @@
27#define OCFS2_UPTODATE_H 27#define OCFS2_UPTODATE_H
28 28
29int __init init_ocfs2_uptodate_cache(void); 29int __init init_ocfs2_uptodate_cache(void);
30void __exit exit_ocfs2_uptodate_cache(void); 30void exit_ocfs2_uptodate_cache(void);
31 31
32void ocfs2_metadata_cache_init(struct inode *inode); 32void ocfs2_metadata_cache_init(struct inode *inode);
33void ocfs2_metadata_cache_purge(struct inode *inode); 33void ocfs2_metadata_cache_purge(struct inode *inode);