about | summary | refs | log | tree | commit | diff | stats
path: root/fs/ocfs2
diff options
context:
space:
mode:
author Andrea Bastoni <bastoni@cs.unc.edu> 2010-05-30 19:16:45 -0400
committer Andrea Bastoni <bastoni@cs.unc.edu> 2010-05-30 19:16:45 -0400
commit ada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree 644b88f8a71896307d71438e9b3af49126ffb22b /fs/ocfs2
parent 43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent 3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)
Merge branch 'wip-2.6.34' into old-private-master [archived-private-master]
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- fs/ocfs2/Kconfig 10
-rw-r--r-- fs/ocfs2/Makefile 8
-rw-r--r-- fs/ocfs2/acl.c 169
-rw-r--r-- fs/ocfs2/acl.h 22
-rw-r--r-- fs/ocfs2/alloc.c 34
-rw-r--r-- fs/ocfs2/alloc.h 5
-rw-r--r-- fs/ocfs2/aops.c 52
-rw-r--r-- fs/ocfs2/blockcheck.c 2
-rw-r--r-- fs/ocfs2/buffer_head_io.c 5
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 13
-rw-r--r-- fs/ocfs2/cluster/masklog.c 3
-rw-r--r-- fs/ocfs2/cluster/masklog.h 7
-rw-r--r-- fs/ocfs2/cluster/netdebug.c 8
-rw-r--r-- fs/ocfs2/cluster/nodemanager.c 52
-rw-r--r-- fs/ocfs2/cluster/nodemanager.h 7
-rw-r--r-- fs/ocfs2/cluster/quorum.c 17
-rw-r--r-- fs/ocfs2/cluster/tcp.c 14
-rw-r--r-- fs/ocfs2/cluster/tcp_internal.h 4
-rw-r--r-- fs/ocfs2/dir.c 39
-rw-r--r-- fs/ocfs2/dlm/Makefile 3
-rw-r--r-- fs/ocfs2/dlm/dlmapi.h 2
-rw-r--r-- fs/ocfs2/dlm/dlmast.c 8
-rw-r--r-- fs/ocfs2/dlm/dlmconvert.c 3
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c 2
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c 2
-rw-r--r-- fs/ocfs2/dlm/dlmlock.c 2
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c 44
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c 165
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c 1
-rw-r--r-- fs/ocfs2/dlm/dlmunlock.c 9
-rw-r--r-- fs/ocfs2/dlmfs/Makefile 5
-rw-r--r-- fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c) 141
-rw-r--r-- fs/ocfs2/dlmfs/dlmfsver.c (renamed from fs/ocfs2/dlm/dlmfsver.c) 0
-rw-r--r-- fs/ocfs2/dlmfs/dlmfsver.h (renamed from fs/ocfs2/dlm/dlmfsver.h) 0
-rw-r--r-- fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c) 308
-rw-r--r-- fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h) 16
-rw-r--r-- fs/ocfs2/dlmglue.c 373
-rw-r--r-- fs/ocfs2/export.c 2
-rw-r--r-- fs/ocfs2/extent_map.c 30
-rw-r--r-- fs/ocfs2/file.c 88
-rw-r--r-- fs/ocfs2/heartbeat.c 1
-rw-r--r-- fs/ocfs2/inode.c 94
-rw-r--r-- fs/ocfs2/inode.h 2
-rw-r--r-- fs/ocfs2/ioctl.c 14
-rw-r--r-- fs/ocfs2/ioctl.h 6
-rw-r--r-- fs/ocfs2/journal.c 4
-rw-r--r-- fs/ocfs2/localalloc.c 12
-rw-r--r-- fs/ocfs2/locks.c 2
-rw-r--r-- fs/ocfs2/mmap.c 1
-rw-r--r-- fs/ocfs2/namei.c 144
-rw-r--r-- fs/ocfs2/ocfs2.h 46
-rw-r--r-- fs/ocfs2/ocfs2_fs.h 70
-rw-r--r-- fs/ocfs2/ocfs2_ioctl.h 79
-rw-r--r-- fs/ocfs2/ocfs2_lockingver.h 2
-rw-r--r-- fs/ocfs2/quota.h 4
-rw-r--r-- fs/ocfs2/quota_global.c 8
-rw-r--r-- fs/ocfs2/quota_local.c 5
-rw-r--r-- fs/ocfs2/refcounttree.c 177
-rw-r--r-- fs/ocfs2/stack_o2cb.c 50
-rw-r--r-- fs/ocfs2/stack_user.c 52
-rw-r--r-- fs/ocfs2/stackglue.c 113
-rw-r--r-- fs/ocfs2/stackglue.h 95
-rw-r--r-- fs/ocfs2/suballoc.c 300
-rw-r--r-- fs/ocfs2/suballoc.h 6
-rw-r--r-- fs/ocfs2/super.c 107
-rw-r--r-- fs/ocfs2/symlink.c 12
-rw-r--r-- fs/ocfs2/sysfile.c 1
-rw-r--r-- fs/ocfs2/uptodate.c 4
-rw-r--r-- fs/ocfs2/xattr.c 2264
-rw-r--r-- fs/ocfs2/xattr.h 2
70 files changed, 3092 insertions, 2260 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 701b7a3a872e..0d840669698e 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -6,6 +6,7 @@ config OCFS2_FS
6 select CRC32 6 select CRC32
7 select QUOTA 7 select QUOTA
8 select QUOTA_TREE 8 select QUOTA_TREE
9 select FS_POSIX_ACL
9 help 10 help
10 OCFS2 is a general purpose extent based shared disk cluster file 11 OCFS2 is a general purpose extent based shared disk cluster file
11 system with many similarities to ext3. It supports 64 bit inode 12 system with many similarities to ext3. It supports 64 bit inode
@@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS
74 This option will enable expensive consistency checks. Enable 75 This option will enable expensive consistency checks. Enable
75 this option for debugging only as it is likely to decrease 76 this option for debugging only as it is likely to decrease
76 performance of the filesystem. 77 performance of the filesystem.
77
78config OCFS2_FS_POSIX_ACL
79 bool "OCFS2 POSIX Access Control Lists"
80 depends on OCFS2_FS
81 select FS_POSIX_ACL
82 default n
83 help
84 Posix Access Control Lists (ACLs) support permissions for users and
85 groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 31f25ce32c97..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -39,16 +39,14 @@ ocfs2-objs := \
39 ver.o \ 39 ver.o \
40 quota_local.o \ 40 quota_local.o \
41 quota_global.o \ 41 quota_global.o \
42 xattr.o 42 xattr.o \
43 43 acl.o
44ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
45ocfs2-objs += acl.o
46endif
47 44
48ocfs2_stackglue-objs := stackglue.o 45ocfs2_stackglue-objs := stackglue.o
49ocfs2_stack_o2cb-objs := stack_o2cb.o 46ocfs2_stack_o2cb-objs := stack_o2cb.o
50ocfs2_stack_user-objs := stack_user.o 47ocfs2_stack_user-objs := stack_user.o
51 48
49obj-$(CONFIG_OCFS2_FS) += dlmfs/
52# cluster/ is always needed when OCFS2_FS for masklog support 50# cluster/ is always needed when OCFS2_FS for masklog support
53obj-$(CONFIG_OCFS2_FS) += cluster/ 51obj-$(CONFIG_OCFS2_FS) += cluster/
54obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ 52obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index fbeaec762103..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -98,15 +101,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
98 int type, 101 int type,
99 struct buffer_head *di_bh) 102 struct buffer_head *di_bh)
100{ 103{
101 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
102 int name_index; 104 int name_index;
103 char *value = NULL; 105 char *value = NULL;
104 struct posix_acl *acl; 106 struct posix_acl *acl;
105 int retval; 107 int retval;
106 108
107 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
108 return NULL;
109
110 switch (type) { 109 switch (type) {
111 case ACL_TYPE_ACCESS: 110 case ACL_TYPE_ACCESS:
112 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; 111 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -170,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
170} 169}
171 170
172/* 171/*
172 * Helper function to set i_mode in memory and disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create it's own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
173 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
174 */ 227 */
175static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -197,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
197 if (ret < 0) 250 if (ret < 0)
198 return ret; 251 return ret;
199 else { 252 else {
200 inode->i_mode = mode;
201 if (ret == 0) 253 if (ret == 0)
202 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
203 } 261 }
204 } 262 }
205 break; 263 break;
@@ -287,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
287 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
288 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
289 int ret = 0; 347 int ret = 0;
348 mode_t mode;
290 349
291 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
292 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -295,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
295 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
296 return PTR_ERR(acl); 355 return PTR_ERR(acl);
297 } 356 }
298 if (!acl) 357 if (!acl) {
299 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
300 } 365 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 367 struct posix_acl *clone;
303 mode_t mode;
304 368
305 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
306 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -317,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
317 mode = inode->i_mode; 381 mode = inode->i_mode;
318 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
319 if (ret >= 0) { 383 if (ret >= 0) {
320 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
321 if (ret > 0) { 385 if (ret > 0) {
322 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
323 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
@@ -331,13 +395,14 @@ cleanup:
331 return ret; 395 return ret;
332} 396}
333 397
334static size_t ocfs2_xattr_list_acl_access(struct inode *inode, 398static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
335 char *list, 399 char *list,
336 size_t list_len, 400 size_t list_len,
337 const char *name, 401 const char *name,
338 size_t name_len) 402 size_t name_len,
403 int type)
339{ 404{
340 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 405 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
341 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 406 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
342 407
343 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 408 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -348,13 +413,14 @@ static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
348 return size; 413 return size;
349} 414}
350 415
351static size_t ocfs2_xattr_list_acl_default(struct inode *inode, 416static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
352 char *list, 417 char *list,
353 size_t list_len, 418 size_t list_len,
354 const char *name, 419 const char *name,
355 size_t name_len) 420 size_t name_len,
421 int type)
356{ 422{
357 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 423 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); 424 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
359 425
360 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 426 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -365,19 +431,19 @@ static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
365 return size; 431 return size;
366} 432}
367 433
368static int ocfs2_xattr_get_acl(struct inode *inode, 434static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
369 int type, 435 void *buffer, size_t size, int type)
370 void *buffer,
371 size_t size)
372{ 436{
373 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 437 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
374 struct posix_acl *acl; 438 struct posix_acl *acl;
375 int ret; 439 int ret;
376 440
441 if (strcmp(name, "") != 0)
442 return -EINVAL;
377 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 443 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
378 return -EOPNOTSUPP; 444 return -EOPNOTSUPP;
379 445
380 acl = ocfs2_get_acl(inode, type); 446 acl = ocfs2_get_acl(dentry->d_inode, type);
381 if (IS_ERR(acl)) 447 if (IS_ERR(acl))
382 return PTR_ERR(acl); 448 return PTR_ERR(acl);
383 if (acl == NULL) 449 if (acl == NULL)
@@ -388,35 +454,16 @@ static int ocfs2_xattr_get_acl(struct inode *inode,
388 return ret; 454 return ret;
389} 455}
390 456
391static int ocfs2_xattr_get_acl_access(struct inode *inode, 457static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
392 const char *name, 458 const void *value, size_t size, int flags, int type)
393 void *buffer,
394 size_t size)
395{
396 if (strcmp(name, "") != 0)
397 return -EINVAL;
398 return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
399}
400
401static int ocfs2_xattr_get_acl_default(struct inode *inode,
402 const char *name,
403 void *buffer,
404 size_t size)
405{
406 if (strcmp(name, "") != 0)
407 return -EINVAL;
408 return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
409}
410
411static int ocfs2_xattr_set_acl(struct inode *inode,
412 int type,
413 const void *value,
414 size_t size)
415{ 459{
460 struct inode *inode = dentry->d_inode;
416 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 461 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
417 struct posix_acl *acl; 462 struct posix_acl *acl;
418 int ret = 0; 463 int ret = 0;
419 464
465 if (strcmp(name, "") != 0)
466 return -EINVAL;
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 467 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
421 return -EOPNOTSUPP; 468 return -EOPNOTSUPP;
422 469
@@ -442,38 +489,18 @@ cleanup:
442 return ret; 489 return ret;
443} 490}
444 491
445static int ocfs2_xattr_set_acl_access(struct inode *inode,
446 const char *name,
447 const void *value,
448 size_t size,
449 int flags)
450{
451 if (strcmp(name, "") != 0)
452 return -EINVAL;
453 return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
454}
455
456static int ocfs2_xattr_set_acl_default(struct inode *inode,
457 const char *name,
458 const void *value,
459 size_t size,
460 int flags)
461{
462 if (strcmp(name, "") != 0)
463 return -EINVAL;
464 return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
465}
466
467struct xattr_handler ocfs2_xattr_acl_access_handler = { 492struct xattr_handler ocfs2_xattr_acl_access_handler = {
468 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS,
469 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
470 .get = ocfs2_xattr_get_acl_access, 496 .get = ocfs2_xattr_get_acl,
471 .set = ocfs2_xattr_set_acl_access, 497 .set = ocfs2_xattr_set_acl,
472}; 498};
473 499
474struct xattr_handler ocfs2_xattr_acl_default_handler = { 500struct xattr_handler ocfs2_xattr_acl_default_handler = {
475 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT,
476 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
477 .get = ocfs2_xattr_get_acl_default, 504 .get = ocfs2_xattr_get_acl,
478 .set = ocfs2_xattr_set_acl_default, 505 .set = ocfs2_xattr_set_acl,
479}; 506};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 8f6389ed4da5..5c5d31f05853 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,8 +26,6 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29#ifdef CONFIG_OCFS2_FS_POSIX_ACL
30
31extern int ocfs2_check_acl(struct inode *, int); 29extern int ocfs2_check_acl(struct inode *, int);
32extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
33extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
@@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
35 struct ocfs2_alloc_context *, 33 struct ocfs2_alloc_context *,
36 struct ocfs2_alloc_context *); 34 struct ocfs2_alloc_context *);
37 35
38#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
39
40#define ocfs2_check_acl NULL
41static inline int ocfs2_acl_chmod(struct inode *inode)
42{
43 return 0;
44}
45static inline int ocfs2_init_acl(handle_t *handle,
46 struct inode *inode,
47 struct inode *dir,
48 struct buffer_head *di_bh,
49 struct buffer_head *dir_bh,
50 struct ocfs2_alloc_context *meta_ac,
51 struct ocfs2_alloc_context *data_ac)
52{
53 return 0;
54}
55
56#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
57
58#endif /* OCFS2_ACL_H */ 36#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..9f8bd913c51e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051 eb->h_blkno = cpu_to_le64(first_blkno); 1051 eb->h_blkno = cpu_to_le64(first_blkno);
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1054 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055 eb->h_list.l_count = 1056 eb->h_list.l_count =
1056 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1765,9 +1766,9 @@ set_and_inc:
1765 * 1766 *
1766 * The array index of the subtree root is passed back. 1767 * The array index of the subtree root is passed back.
1767 */ 1768 */
1768static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 1769int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1769 struct ocfs2_path *left, 1770 struct ocfs2_path *left,
1770 struct ocfs2_path *right) 1771 struct ocfs2_path *right)
1771{ 1772{
1772 int i = 0; 1773 int i = 0;
1773 1774
@@ -2398,7 +2399,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2398 * 2399 *
2399 * The array is assumed to be large enough to hold an entire path (tree depth). 2400 * The array is assumed to be large enough to hold an entire path (tree depth).
2400 * 2401 *
2401 * Upon succesful return from this function: 2402 * Upon successful return from this function:
2402 * 2403 *
2403 * - The 'right_path' array will contain a path to the leaf block 2404 * - The 'right_path' array will contain a path to the leaf block
2404 * whose range contains e_cpos. 2405 * whose range contains e_cpos.
@@ -2872,8 +2873,8 @@ out:
2872 * This looks similar, but is subtly different to 2873 * This looks similar, but is subtly different to
2873 * ocfs2_find_cpos_for_left_leaf(). 2874 * ocfs2_find_cpos_for_left_leaf().
2874 */ 2875 */
2875static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 2876int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2876 struct ocfs2_path *path, u32 *cpos) 2877 struct ocfs2_path *path, u32 *cpos)
2877{ 2878{
2878 int i, j, ret = 0; 2879 int i, j, ret = 0;
2879 u64 blkno; 2880 u64 blkno;
@@ -5712,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5712 goto out; 5713 goto out;
5713 } 5714 }
5714 5715
5715 vfs_dq_free_space_nodirty(inode, 5716 dquot_free_space_nodirty(inode,
5716 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5717 ocfs2_clusters_to_bytes(inode->i_sb, len));
5717 5718
5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); 5719 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6037 if (status < 0) 6038 if (status < 0)
6038 mlog_errno(status); 6039 mlog_errno(status);
6039 else 6040 else
6040 ocfs2_init_inode_steal_slot(osb); 6041 ocfs2_init_steal_slots(osb);
6041 6042
6042 mlog_exit(status); 6043 mlog_exit(status);
6043} 6044}
@@ -6935,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6935 goto bail; 6936 goto bail;
6936 } 6937 }
6937 6938
6938 vfs_dq_free_space_nodirty(inode, 6939 dquot_free_space_nodirty(inode,
6939 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); 6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6940 spin_lock(&OCFS2_I(inode)->ip_lock); 6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6941 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
@@ -7190,8 +7191,8 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
7190 * wait on them - the truncate_inode_pages() call later will 7191 * wait on them - the truncate_inode_pages() call later will
7191 * do that for us. 7192 * do that for us.
7192 */ 7193 */
7193 ret = do_sync_mapping_range(inode->i_mapping, range_start, 7194 ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
7194 range_end - 1, SYNC_FILE_RANGE_WRITE); 7195 range_end - 1);
7195 if (ret) 7196 if (ret)
7196 mlog_errno(ret); 7197 mlog_errno(ret);
7197 7198
@@ -7300,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7300 unsigned int page_end; 7301 unsigned int page_end;
7301 u64 phys; 7302 u64 phys;
7302 7303
7303 if (vfs_dq_alloc_space_nodirty(inode, 7304 ret = dquot_alloc_space_nodirty(inode,
7304 ocfs2_clusters_to_bytes(osb->sb, 1))) { 7305 ocfs2_clusters_to_bytes(osb->sb, 1));
7305 ret = -EDQUOT; 7306 if (ret)
7306 goto out_commit; 7307 goto out_commit;
7307 }
7308 did_quota = 1; 7308 did_quota = 1;
7309 7309
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
@@ -7380,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7380 7380
7381out_commit: 7381out_commit:
7382 if (ret < 0 && did_quota) 7382 if (ret < 0 && did_quota)
7383 vfs_dq_free_space_nodirty(inode, 7383 dquot_free_space_nodirty(inode,
7384 ocfs2_clusters_to_bytes(osb->sb, 1)); 7384 ocfs2_clusters_to_bytes(osb->sb, 1));
7385 7385
7386 ocfs2_commit_trans(osb, handle); 7386 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 9c122d574464..1db4359ccb90 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle,
317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, 317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
318 handle_t *handle, 318 handle_t *handle,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left,
324 struct ocfs2_path *right);
320#endif /* OCFS2_ALLOC_H */ 325#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..21441ddb5506 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547 * 547 *
548 * called like this: dio->get_blocks(dio->inode, fs_startblk, 548 * called like this: dio->get_blocks(dio->inode, fs_startblk,
549 * fs_count, map_bh, dio->rw == WRITE); 549 * fs_count, map_bh, dio->rw == WRITE);
550 *
551 * Note that we never bother to allocate blocks here, and thus ignore the
552 * create argument.
550 */ 553 */
551static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 554static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552 struct buffer_head *bh_result, int create) 555 struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563 566
564 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 567 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565 568
566 /*
567 * Any write past EOF is not allowed because we'd be extending.
568 */
569 if (create && (iblock + max_blocks) > inode_blocks) {
570 ret = -EIO;
571 goto bail;
572 }
573
574 /* This figures out the size of the next contiguous block, and 569 /* This figures out the size of the next contiguous block, and
575 * our logical offset */ 570 * our logical offset */
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 571 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,17 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582 goto bail; 577 goto bail;
583 } 578 }
584 579
585 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { 580 /* We should already CoW the refcounted extent in case of create. */
586 ocfs2_error(inode->i_sb, 581 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
587 "Inode %llu has a hole at block %llu\n",
588 (unsigned long long)OCFS2_I(inode)->ip_blkno,
589 (unsigned long long)iblock);
590 ret = -EROFS;
591 goto bail;
592 }
593 582
594 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596 /* 583 /*
597 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
@@ -601,20 +588,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601 */ 588 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 589 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno); 590 map_bh(bh_result, inode->i_sb, p_blkno);
604 else { 591 else
605 /*
606 * ocfs2_prepare_inode_for_write() should have caught
607 * the case where we'd be filling a hole and triggered
608 * a buffered write instead.
609 */
610 if (create) {
611 ret = -EIO;
612 mlog_errno(ret);
613 goto bail;
614 }
615
616 clear_buffer_mapped(bh_result); 592 clear_buffer_mapped(bh_result);
617 }
618 593
619 /* make sure we don't map more than max_blocks blocks here as 594 /* make sure we don't map more than max_blocks blocks here as
620 that's all the kernel will handle at this point. */ 595 that's all the kernel will handle at this point. */
@@ -625,7 +600,7 @@ bail:
625 return ret; 600 return ret;
626} 601}
627 602
628/* 603/*
629 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 604 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
630 * particularly interested in the aio/dio case. Like the core uses 605 * particularly interested in the aio/dio case. Like the core uses
631 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 606 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
@@ -696,7 +671,7 @@ static ssize_t ocfs2_direct_IO(int rw,
696 671
697 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 672 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
698 inode->i_sb->s_bdev, iov, offset, 673 inode->i_sb->s_bdev, iov, offset,
699 nr_segs, 674 nr_segs,
700 ocfs2_direct_IO_get_blocks, 675 ocfs2_direct_IO_get_blocks,
701 ocfs2_dio_end_io); 676 ocfs2_dio_end_io);
702 677
@@ -1789,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1789 1764
1790 wc->w_handle = handle; 1765 wc->w_handle = handle;
1791 1766
1792 if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, 1767 if (clusters_to_alloc) {
1793 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { 1768 ret = dquot_alloc_space_nodirty(inode,
1794 ret = -EDQUOT; 1769 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1795 goto out_commit; 1770 if (ret)
1771 goto out_commit;
1796 } 1772 }
1797 /* 1773 /*
1798 * We don't want this to fail in ocfs2_write_end(), so do it 1774 * We don't want this to fail in ocfs2_write_end(), so do it
@@ -1835,7 +1811,7 @@ success:
1835 return 0; 1811 return 0;
1836out_quota: 1812out_quota:
1837 if (clusters_to_alloc) 1813 if (clusters_to_alloc)
1838 vfs_dq_free_space(inode, 1814 dquot_free_space(inode,
1839 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); 1815 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1840out_commit: 1816out_commit:
1841 ocfs2_commit_trans(osb, handle); 1817 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index a1163b8b417c..b7428c5d0d3b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -47,7 +47,7 @@
47 * Calculate the bit offset in the hamming code buffer based on the bit's 47 * Calculate the bit offset in the hamming code buffer based on the bit's
48 * offset in the data buffer. Since the hamming code reserves all 48 * offset in the data buffer. Since the hamming code reserves all
49 * power-of-two bits for parity, the data bit number and the code bit 49 * power-of-two bits for parity, the data bit number and the code bit
50 * number are offest by all the parity bits beforehand. 50 * number are offset by all the parity bits beforehand.
51 * 51 *
52 * Recall that bit numbers in hamming code are 1-based. This function 52 * Recall that bit numbers in hamming code are 1-based. This function
53 * takes the 0-based data bit from the caller. 53 * takes the 0-based data bit from the caller.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d43d34a1dd31..f9d5d3ffc75a 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -368,7 +367,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
368 } 367 }
369 ocfs2_metadata_cache_io_unlock(ci); 368 ocfs2_metadata_cache_io_unlock(ci);
370 369
371 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
372 (unsigned long long)block, nr, 371 (unsigned long long)block, nr,
373 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", 372 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
374 flags); 373 flags);
@@ -407,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
407 struct buffer_head *bh) 406 struct buffer_head *bh)
408{ 407{
409 int ret = 0; 408 int ret = 0;
409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
410 410
411 mlog_entry_void(); 411 mlog_entry_void();
412 412
@@ -426,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
426 426
427 get_bh(bh); /* for end_buffer_write_sync() */ 427 get_bh(bh); /* for end_buffer_write_sync() */
428 bh->b_end_io = end_buffer_write_sync; 428 bh->b_end_io = end_buffer_write_sync;
429 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
429 submit_bh(WRITE, bh); 430 submit_bh(WRITE, bh);
430 431
431 wait_on_buffer(bh); 432 wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index c452d116b892..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h>
37 38
38#include "heartbeat.h" 39#include "heartbeat.h"
39#include "tcp.h" 40#include "tcp.h"
@@ -78,7 +79,7 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
78 79
79unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
80 81
81/* Only sets a new threshold if there are no active regions. 82/* Only sets a new threshold if there are no active regions.
82 * 83 *
83 * No locking or otherwise interesting code is required for reading 84 * No locking or otherwise interesting code is required for reading
84 * o2hb_dead_threshold as it can't change once regions are active and 85 * o2hb_dead_threshold as it can't change once regions are active and
@@ -170,13 +171,14 @@ static void o2hb_write_timeout(struct work_struct *work)
170 171
171 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
172 "milliseconds\n", reg->hr_dev_name, 173 "milliseconds\n", reg->hr_dev_name,
173 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
174 o2quo_disk_timeout(); 175 o2quo_disk_timeout();
175} 176}
176 177
177static void o2hb_arm_write_timeout(struct o2hb_region *reg) 178static void o2hb_arm_write_timeout(struct o2hb_region *reg)
178{ 179{
179 mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); 180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS);
180 182
181 cancel_delayed_work(&reg->hr_write_timeout_work); 183 cancel_delayed_work(&reg->hr_write_timeout_work);
182 reg->hr_last_timeout_start = jiffies; 184 reg->hr_last_timeout_start = jiffies;
@@ -623,7 +625,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
623 "seq %llu last %llu changed %u equal %u\n", 625 "seq %llu last %llu changed %u equal %u\n",
624 slot->ds_node_num, (long long)slot->ds_last_generation, 626 slot->ds_node_num, (long long)slot->ds_last_generation,
625 le32_to_cpu(hb_block->hb_cksum), 627 le32_to_cpu(hb_block->hb_cksum),
626 (unsigned long long)le64_to_cpu(hb_block->hb_seq), 628 (unsigned long long)le64_to_cpu(hb_block->hb_seq),
627 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, 629 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
628 slot->ds_equal_samples); 630 slot->ds_equal_samples);
629 631
@@ -874,7 +876,8 @@ static int o2hb_thread(void *data)
874 do_gettimeofday(&after_hb); 876 do_gettimeofday(&after_hb);
875 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 877 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
876 878
877 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", 879 mlog(ML_HEARTBEAT,
880 "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
878 before_hb.tv_sec, (unsigned long) before_hb.tv_usec, 881 before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
879 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 882 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
880 elapsed_msec); 883 elapsed_msec);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..3bb928a2bf7d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS),
115 define_mask(ERROR), 116 define_mask(ERROR),
116 define_mask(NOTICE), 117 define_mask(NOTICE),
117 define_mask(KTHREAD), 118 define_mask(KTHREAD),
@@ -135,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
135 return mlog_mask_store(mlog_attr->mask, buf, count); 136 return mlog_mask_store(mlog_attr->mask, buf, count);
136} 137}
137 138
138static struct sysfs_ops mlog_attr_ops = { 139static const struct sysfs_ops mlog_attr_ops = {
139 .show = mlog_show, 140 .show = mlog_show,
140 .store = mlog_store, 141 .store = mlog_store,
141}; 142};
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
117/* bits that are infrequently given and frequently matched in the high word */ 118/* bits that are infrequently given and frequently matched in the high word */
118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
119#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
@@ -194,9 +195,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
194 * previous token if args expands to nothing. 195 * previous token if args expands to nothing.
195 */ 196 */
196#define __mlog_printk(level, fmt, args...) \ 197#define __mlog_printk(level, fmt, args...) \
197 printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ 198 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
198 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ 199 task_pid_nr(current), __mlog_cpu_guess, \
199 ##args) 200 __PRETTY_FUNCTION__, __LINE__ , ##args)
200 201
201#define mlog(mask, fmt, args...) do { \ 202#define mlog(mask, fmt, args...) do { \
202 u64 __m = MLOG_MASK_PREFIX | (mask); \ 203 u64 __m = MLOG_MASK_PREFIX | (mask); \
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index da794bc07a6c..a3f150e52b02 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -294,10 +294,10 @@ static int sc_seq_show(struct seq_file *seq, void *v)
294 if (sc->sc_sock) { 294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk); 295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */ 296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->saddr; 297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->daddr; 298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->sport; 299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->dport; 300 dport = (__force __be16)inet->inet_dport;
301 } 301 }
302 302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any 303 /* XXX sigh, inet-> doesn't have sparse annotation so any
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7ee6188bc79a..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22#include <linux/slab.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/configfs.h> 25#include <linux/configfs.h>
@@ -35,6 +36,10 @@
35 * cluster references throughout where nodes are looked up */ 36 * cluster references throughout where nodes are looked up */
36struct o2nm_cluster *o2nm_single_cluster = NULL; 37struct o2nm_cluster *o2nm_single_cluster = NULL;
37 38
39char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
40 "reset", /* O2NM_FENCE_RESET */
41 "panic", /* O2NM_FENCE_PANIC */
42};
38 43
39struct o2nm_node *o2nm_get_node_by_num(u8 node_num) 44struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
40{ 45{
@@ -579,6 +584,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
579 return o2nm_cluster_attr_write(page, count, 584 return o2nm_cluster_attr_write(page, count,
580 &cluster->cl_reconnect_delay_ms); 585 &cluster->cl_reconnect_delay_ms);
581} 586}
587
588static ssize_t o2nm_cluster_attr_fence_method_read(
589 struct o2nm_cluster *cluster, char *page)
590{
591 ssize_t ret = 0;
592
593 if (cluster)
594 ret = sprintf(page, "%s\n",
595 o2nm_fence_method_desc[cluster->cl_fence_method]);
596 return ret;
597}
598
599static ssize_t o2nm_cluster_attr_fence_method_write(
600 struct o2nm_cluster *cluster, const char *page, size_t count)
601{
602 unsigned int i;
603
604 if (page[count - 1] != '\n')
605 goto bail;
606
607 for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
608 if (count != strlen(o2nm_fence_method_desc[i]) + 1)
609 continue;
610 if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
611 continue;
612 if (cluster->cl_fence_method != i) {
613 printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
614 o2nm_fence_method_desc[i]);
615 cluster->cl_fence_method = i;
616 }
617 return count;
618 }
619
620bail:
621 return -EINVAL;
622}
623
582static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { 624static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
583 .attr = { .ca_owner = THIS_MODULE, 625 .attr = { .ca_owner = THIS_MODULE,
584 .ca_name = "idle_timeout_ms", 626 .ca_name = "idle_timeout_ms",
@@ -603,10 +645,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
603 .store = o2nm_cluster_attr_reconnect_delay_ms_write, 645 .store = o2nm_cluster_attr_reconnect_delay_ms_write,
604}; 646};
605 647
648static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
649 .attr = { .ca_owner = THIS_MODULE,
650 .ca_name = "fence_method",
651 .ca_mode = S_IRUGO | S_IWUSR },
652 .show = o2nm_cluster_attr_fence_method_read,
653 .store = o2nm_cluster_attr_fence_method_write,
654};
655
606static struct configfs_attribute *o2nm_cluster_attrs[] = { 656static struct configfs_attribute *o2nm_cluster_attrs[] = {
607 &o2nm_cluster_attr_idle_timeout_ms.attr, 657 &o2nm_cluster_attr_idle_timeout_ms.attr,
608 &o2nm_cluster_attr_keepalive_delay_ms.attr, 658 &o2nm_cluster_attr_keepalive_delay_ms.attr,
609 &o2nm_cluster_attr_reconnect_delay_ms.attr, 659 &o2nm_cluster_attr_reconnect_delay_ms.attr,
660 &o2nm_cluster_attr_fence_method.attr,
610 NULL, 661 NULL,
611}; 662};
612static ssize_t o2nm_cluster_show(struct config_item *item, 663static ssize_t o2nm_cluster_show(struct config_item *item,
@@ -778,6 +829,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
778 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; 829 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
779 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; 830 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
780 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; 831 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
832 cluster->cl_fence_method = O2NM_FENCE_RESET;
781 833
782 ret = &cluster->cl_group; 834 ret = &cluster->cl_group;
783 o2nm_single_cluster = cluster; 835 o2nm_single_cluster = cluster;
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index c992ea0da4ad..09ea2d388bbb 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,6 +33,12 @@
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include <linux/rbtree.h> 34#include <linux/rbtree.h>
35 35
36enum o2nm_fence_method {
37 O2NM_FENCE_RESET = 0,
38 O2NM_FENCE_PANIC,
39 O2NM_FENCE_METHODS, /* Number of fence methods */
40};
41
36struct o2nm_node { 42struct o2nm_node {
37 spinlock_t nd_lock; 43 spinlock_t nd_lock;
38 struct config_item nd_item; 44 struct config_item nd_item;
@@ -58,6 +64,7 @@ struct o2nm_cluster {
58 unsigned int cl_idle_timeout_ms; 64 unsigned int cl_idle_timeout_ms;
59 unsigned int cl_keepalive_delay_ms; 65 unsigned int cl_keepalive_delay_ms;
60 unsigned int cl_reconnect_delay_ms; 66 unsigned int cl_reconnect_delay_ms;
67 enum o2nm_fence_method cl_fence_method;
61 68
62 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ 69 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
63 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 70 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index bbacf7da48a4..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
44 * and if they're the last, they fire off the decision. 44 * and if they're the last, they fire off the decision.
45 */ 45 */
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/reboot.h> 48#include <linux/reboot.h>
50 49
@@ -74,8 +73,20 @@ static void o2quo_fence_self(void)
74 * threads can still schedule, etc, etc */ 73 * threads can still schedule, etc, etc */
75 o2hb_stop_all_regions(); 74 o2hb_stop_all_regions();
76 75
77 printk("ocfs2 is very sorry to be fencing this system by restarting\n"); 76 switch (o2nm_single_cluster->cl_fence_method) {
78 emergency_restart(); 77 case O2NM_FENCE_PANIC:
78 panic("*** ocfs2 is very sorry to be fencing this system by "
79 "panicing ***\n");
80 break;
81 default:
82 WARN_ON(o2nm_single_cluster->cl_fence_method >=
83 O2NM_FENCE_METHODS);
84 case O2NM_FENCE_RESET:
85 printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
86 "system by restarting ***\n");
87 emergency_restart();
88 break;
89 };
79} 90}
80 91
81/* Indicate that a timeout occurred on a heartbeat region write. The 92/* Indicate that a timeout occurred on a heartbeat region write. The
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 334f231a422c..73e743eea2c8 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@
72 72
73#include "tcp_internal.h" 73#include "tcp_internal.h"
74 74
75#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" 75#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ 76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
77 NIPQUAD(sc->sc_node->nd_ipv4_address), \ 77 &sc->sc_node->nd_ipv4_address, \
78 ntohs(sc->sc_node->nd_ipv4_port) 78 ntohs(sc->sc_node->nd_ipv4_port)
79 79
80/* 80/*
@@ -485,7 +485,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
485 } 485 }
486 486
487 if (was_valid && !valid) { 487 if (was_valid && !valid) {
488 printk(KERN_INFO "o2net: no longer connected to " 488 printk(KERN_NOTICE "o2net: no longer connected to "
489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
490 o2net_complete_nodes_nsw(nn); 490 o2net_complete_nodes_nsw(nn);
491 } 491 }
@@ -493,7 +493,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
493 if (!was_valid && valid) { 493 if (!was_valid && valid) {
494 o2quo_conn_up(o2net_num_from_nn(nn)); 494 o2quo_conn_up(o2net_num_from_nn(nn));
495 cancel_delayed_work(&nn->nn_connect_expired); 495 cancel_delayed_work(&nn->nn_connect_expired);
496 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 496 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
497 o2nm_this_node() > sc->sc_node->nd_num ? 497 o2nm_this_node() > sc->sc_node->nd_num ?
498 "connected to" : "accepted connection from", 498 "connected to" : "accepted connection from",
499 SC_NODEF_ARGS(sc)); 499 SC_NODEF_ARGS(sc));
@@ -930,7 +930,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
930 cond_resched(); 930 cond_resched();
931 continue; 931 continue;
932 } 932 }
933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); 934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
935 o2net_ensure_shutdown(nn, sc, 0); 935 o2net_ensure_shutdown(nn, sc, 0);
936 break; 936 break;
@@ -1476,14 +1476,14 @@ static void o2net_idle_timer(unsigned long data)
1476 1476
1477 do_gettimeofday(&now); 1477 do_gettimeofday(&now);
1478 1478
1479 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1479 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1481 o2net_idle_timeout() / 1000, 1481 o2net_idle_timeout() / 1000,
1482 o2net_idle_timeout() % 1000); 1482 o2net_idle_timeout() % 1000);
1483 mlog(ML_NOTICE, "here are some times that might help debug the " 1483 mlog(ML_NOTICE, "here are some times that might help debug the "
1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
1487 now.tv_sec, (long) now.tv_usec, 1487 now.tv_sec, (long) now.tv_usec,
1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
1489 sc->sc_tv_advance_start.tv_sec, 1489 sc->sc_tv_advance_start.tv_sec,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b1..96fa7ebc530c 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -32,10 +32,10 @@
32 * on their number */ 32 * on their number */
33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) 33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
34 34
35/* 35/*
36 * This version number represents quite a lot, unfortunately. It not 36 * This version number represents quite a lot, unfortunately. It not
37 * only represents the raw network message protocol on the wire but also 37 * only represents the raw network message protocol on the wire but also
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * With version 11, we separate out the filesystem locking portion. The 41 * With version 11, we separate out the filesystem locking portion. The
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..efd77d071c80 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2440 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); 2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2964 goto out; 2964 goto out;
2965 } 2965 }
2966 2966
2967 if (vfs_dq_alloc_space_nodirty(dir, 2967 ret = dquot_alloc_space_nodirty(dir,
2968 ocfs2_clusters_to_bytes(osb->sb, 2968 ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2969 alloc + dx_alloc))) { 2969 if (ret)
2970 ret = -EDQUOT;
2971 goto out_commit; 2970 goto out_commit;
2972 }
2973 did_quota = 1; 2971 did_quota = 1;
2974 2972
2975 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2973 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3178 3176
3179out_commit: 3177out_commit:
3180 if (ret < 0 && did_quota) 3178 if (ret < 0 && did_quota)
3181 vfs_dq_free_space_nodirty(dir, bytes_allocated); 3179 dquot_free_space_nodirty(dir, bytes_allocated);
3182 3180
3183 ocfs2_commit_trans(osb, handle); 3181 ocfs2_commit_trans(osb, handle);
3184 3182
@@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3221 if (extend) { 3219 if (extend) {
3222 u32 offset = OCFS2_I(dir)->ip_clusters; 3220 u32 offset = OCFS2_I(dir)->ip_clusters;
3223 3221
3224 if (vfs_dq_alloc_space_nodirty(dir, 3222 status = dquot_alloc_space_nodirty(dir,
3225 ocfs2_clusters_to_bytes(sb, 1))) { 3223 ocfs2_clusters_to_bytes(sb, 1));
3226 status = -EDQUOT; 3224 if (status)
3227 goto bail; 3225 goto bail;
3228 }
3229 did_quota = 1; 3226 did_quota = 1;
3230 3227
3231 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 3228 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3254 status = 0; 3251 status = 0;
3255bail: 3252bail:
3256 if (did_quota && status < 0) 3253 if (did_quota && status < 0)
3257 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3254 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3258 mlog_exit(status); 3255 mlog_exit(status);
3259 return status; 3256 return status;
3260} 3257}
@@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3889 goto out; 3886 goto out;
3890 } 3887 }
3891 3888
3892 if (vfs_dq_alloc_space_nodirty(dir, 3889 ret = dquot_alloc_space_nodirty(dir,
3893 ocfs2_clusters_to_bytes(dir->i_sb, 1))) { 3890 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3894 ret = -EDQUOT; 3891 if (ret)
3895 goto out_commit; 3892 goto out_commit;
3896 }
3897 did_quota = 1; 3893 did_quota = 1;
3898 3894
3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, 3895 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3983 3979
3984out_commit: 3980out_commit:
3985 if (ret < 0 && did_quota) 3981 if (ret < 0 && did_quota)
3986 vfs_dq_free_space_nodirty(dir, 3982 dquot_free_space_nodirty(dir,
3987 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3983 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3988 3984
3989 ocfs2_commit_trans(osb, handle); 3985 ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 goto out; 4161 goto out;
4166 } 4162 }
4167 4163
4168 if (vfs_dq_alloc_space_nodirty(dir, 4164 ret = dquot_alloc_space_nodirty(dir,
4169 ocfs2_clusters_to_bytes(osb->sb, 1))) { 4165 ocfs2_clusters_to_bytes(osb->sb, 1));
4170 ret = -EDQUOT; 4166 if (ret)
4171 goto out_commit; 4167 goto out_commit;
4172 }
4173 did_quota = 1; 4168 did_quota = 1;
4174 4169
4175 /* 4170 /*
@@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4229 4224
4230out_commit: 4225out_commit:
4231 if (ret < 0 && did_quota) 4226 if (ret < 0 && did_quota)
4232 vfs_dq_free_space_nodirty(dir, 4227 dquot_free_space_nodirty(dir,
4233 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 4228 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4234 4229
4235 ocfs2_commit_trans(osb, handle); 4230 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7 7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index b5786a787fab..3cfa114aa391 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err);
95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ 95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
96} while (0) 96} while (0)
97 97
98#define DLM_LKSB_UNUSED1 0x01 98#define DLM_LKSB_UNUSED1 0x01
99#define DLM_LKSB_PUT_LVB 0x02 99#define DLM_LKSB_PUT_LVB 0x02
100#define DLM_LKSB_GET_LVB 0x04 100#define DLM_LKSB_GET_LVB 0x04
101#define DLM_LKSB_UNUSED2 0x08 101#define DLM_LKSB_UNUSED2 0x08
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 01cf8cc3d286..12d5eb78a11a 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -123,7 +122,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
123 dlm_lock_put(lock); 122 dlm_lock_put(lock);
124 /* free up the reserved bast that we are cancelling. 123 /* free up the reserved bast that we are cancelling.
125 * guaranteed that this will not be the last reserved 124 * guaranteed that this will not be the last reserved
126 * ast because *both* an ast and a bast were reserved 125 * ast because *both* an ast and a bast were reserved
127 * to get to this point. the res->spinlock will not be 126 * to get to this point. the res->spinlock will not be
128 * taken here */ 127 * taken here */
129 dlm_lockres_release_ast(dlm, res); 128 dlm_lockres_release_ast(dlm, res);
@@ -185,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
185 BUG_ON(!lksb); 184 BUG_ON(!lksb);
186 185
187 /* only updates if this node masters the lockres */ 186 /* only updates if this node masters the lockres */
187 spin_lock(&res->spinlock);
188 if (res->owner == dlm->node_num) { 188 if (res->owner == dlm->node_num) {
189
190 spin_lock(&res->spinlock);
191 /* check the lksb flags for the direction */ 189 /* check the lksb flags for the direction */
192 if (lksb->flags & DLM_LKSB_GET_LVB) { 190 if (lksb->flags & DLM_LKSB_GET_LVB) {
193 mlog(0, "getting lvb from lockres for %s node\n", 191 mlog(0, "getting lvb from lockres for %s node\n",
@@ -202,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
202 * here. In the future we might want to clear it at the time 200 * here. In the future we might want to clear it at the time
203 * the put is actually done. 201 * the put is actually done.
204 */ 202 */
205 spin_unlock(&res->spinlock);
206 } 203 }
204 spin_unlock(&res->spinlock);
207 205
208 /* reset any lvb flags on the lksb */ 206 /* reset any lvb flags on the lksb */
209 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); 207 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index ca96bce50e18..90803b47cd8c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -396,7 +395,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
396 /* instead of logging the same network error over 395 /* instead of logging the same network error over
397 * and over, sleep here and wait for the heartbeat 396 * and over, sleep here and wait for the heartbeat
398 * to notice the node is dead. times out after 5s. */ 397 * to notice the node is dead. times out after 5s. */
399 dlm_wait_for_node_death(dlm, res->owner, 398 dlm_wait_for_node_death(dlm, res->owner,
400 DLM_NODE_DEATH_WAIT_MAX); 399 DLM_NODE_DEATH_WAIT_MAX);
401 ret = DLM_RECOVERING; 400 ret = DLM_RECOVERING;
402 mlog(0, "node %u died so returning DLM_RECOVERING " 401 mlog(0, "node %u died so returning DLM_RECOVERING "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 42b0bad7a612..0cd24cf54396 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -102,7 +102,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
102 assert_spin_locked(&res->spinlock); 102 assert_spin_locked(&res->spinlock);
103 103
104 stringify_lockname(res->lockname.name, res->lockname.len, 104 stringify_lockname(res->lockname.name, res->lockname.len,
105 buf, sizeof(buf) - 1); 105 buf, sizeof(buf));
106 printk("lockres: %s, owner=%u, state=%u\n", 106 printk("lockres: %s, owner=%u, state=%u\n",
107 buf, res->owner, res->state); 107 buf, res->owner, res->state);
108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n", 108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0334000676d3..988c9055fd4e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -816,7 +816,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
816 } 816 }
817 817
818 /* Once the dlm ctxt is marked as leaving then we don't want 818 /* Once the dlm ctxt is marked as leaving then we don't want
819 * to be put in someone's domain map. 819 * to be put in someone's domain map.
820 * Also, explicitly disallow joining at certain troublesome 820 * Also, explicitly disallow joining at certain troublesome
821 * times (ie. during recovery). */ 821 * times (ie. during recovery). */
822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { 822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 437698e9465f..733337772671 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -269,7 +269,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
269 } 269 }
270 dlm_revert_pending_lock(res, lock); 270 dlm_revert_pending_lock(res, lock);
271 dlm_lock_put(lock); 271 dlm_lock_put(lock);
272 } else if (dlm_is_recovery_lock(res->lockname.name, 272 } else if (dlm_is_recovery_lock(res->lockname.name,
273 res->lockname.len)) { 273 res->lockname.len)) {
274 /* special case for the $RECOVERY lock. 274 /* special case for the $RECOVERY lock.
275 * there will never be an AST delivered to put 275 * there will never be an AST delivered to put
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -366,7 +366,7 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
366 struct dlm_master_list_entry *mle; 366 struct dlm_master_list_entry *mle;
367 367
368 assert_spin_locked(&dlm->spinlock); 368 assert_spin_locked(&dlm->spinlock);
369 369
370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
371 if (node_up) 371 if (node_up)
372 dlm_mle_node_up(dlm, mle, NULL, idx); 372 dlm_mle_node_up(dlm, mle, NULL, idx);
@@ -833,7 +833,7 @@ lookup:
833 __dlm_insert_mle(dlm, mle); 833 __dlm_insert_mle(dlm, mle);
834 834
835 /* still holding the dlm spinlock, check the recovery map 835 /* still holding the dlm spinlock, check the recovery map
836 * to see if there are any nodes that still need to be 836 * to see if there are any nodes that still need to be
837 * considered. these will not appear in the mle nodemap 837 * considered. these will not appear in the mle nodemap
838 * but they might own this lockres. wait on them. */ 838 * but they might own this lockres. wait on them. */
839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
@@ -883,7 +883,7 @@ redo_request:
883 msleep(500); 883 msleep(500);
884 } 884 }
885 continue; 885 continue;
886 } 886 }
887 887
888 dlm_kick_recovery_thread(dlm); 888 dlm_kick_recovery_thread(dlm);
889 msleep(1000); 889 msleep(1000);
@@ -939,8 +939,8 @@ wait:
939 res->lockname.name, blocked); 939 res->lockname.name, blocked);
940 if (++tries > 20) { 940 if (++tries > 20) {
941 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s:%.*s: spinning on "
942 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked=%d\n",
943 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
944 res->lockname.name, blocked); 944 res->lockname.name, blocked);
945 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
946 dlm_print_one_mle(mle); 946 dlm_print_one_mle(mle);
@@ -1029,7 +1029,7 @@ recheck:
1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); 1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1030 b = (mle->type == DLM_MLE_BLOCK); 1030 b = (mle->type == DLM_MLE_BLOCK);
1031 if ((*blocked && !b) || (!*blocked && b)) { 1031 if ((*blocked && !b) || (!*blocked && b)) {
1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
1033 dlm->name, res->lockname.len, res->lockname.name, 1033 dlm->name, res->lockname.len, res->lockname.name,
1034 *blocked, b); 1034 *blocked, b);
1035 *blocked = b; 1035 *blocked = b;
@@ -1602,7 +1602,7 @@ send_response:
1602 } 1602 }
1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1604 dlm->node_num, res->lockname.len, res->lockname.name); 1604 dlm->node_num, res->lockname.len, res->lockname.name);
1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1606 DLM_ASSERT_MASTER_MLE_CLEANUP); 1606 DLM_ASSERT_MASTER_MLE_CLEANUP);
1607 if (ret < 0) { 1607 if (ret < 0) {
1608 mlog(ML_ERROR, "failed to dispatch assert master work\n"); 1608 mlog(ML_ERROR, "failed to dispatch assert master work\n");
@@ -1701,7 +1701,7 @@ again:
1701 1701
1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) { 1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1703 mlog(0, "%.*s: node %u create mles on other " 1703 mlog(0, "%.*s: node %u create mles on other "
1704 "nodes and requests a re-assert\n", 1704 "nodes and requests a re-assert\n",
1705 namelen, lockname, to); 1705 namelen, lockname, to);
1706 reassert = 1; 1706 reassert = 1;
1707 } 1707 }
@@ -1812,7 +1812,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1812 spin_unlock(&dlm->master_lock); 1812 spin_unlock(&dlm->master_lock);
1813 spin_unlock(&dlm->spinlock); 1813 spin_unlock(&dlm->spinlock);
1814 goto done; 1814 goto done;
1815 } 1815 }
1816 } 1816 }
1817 } 1817 }
1818 spin_unlock(&dlm->master_lock); 1818 spin_unlock(&dlm->master_lock);
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1875ok:
1876 spin_unlock(&res->spinlock); 1876 spin_unlock(&res->spinlock);
1877 } 1877 }
1878 spin_unlock(&dlm->spinlock);
1879 1878
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1879 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1880 // assert->node_idx);
@@ -1883,7 +1882,7 @@ ok:
1883 int extra_ref = 0; 1882 int extra_ref = 0;
1884 int nn = -1; 1883 int nn = -1;
1885 int rr, err = 0; 1884 int rr, err = 0;
1886 1885
1887 spin_lock(&mle->spinlock); 1886 spin_lock(&mle->spinlock);
1888 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1887 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1889 extra_ref = 1; 1888 extra_ref = 1;
@@ -1891,7 +1890,7 @@ ok:
1891 /* MASTER mle: if any bits set in the response map 1890 /* MASTER mle: if any bits set in the response map
1892 * then the calling node needs to re-assert to clear 1891 * then the calling node needs to re-assert to clear
1893 * up nodes that this node contacted */ 1892 * up nodes that this node contacted */
1894 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 1893 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1895 nn+1)) < O2NM_MAX_NODES) { 1894 nn+1)) < O2NM_MAX_NODES) {
1896 if (nn != dlm->node_num && nn != assert->node_idx) 1895 if (nn != dlm->node_num && nn != assert->node_idx)
1897 master_request = 1; 1896 master_request = 1;
@@ -1926,7 +1925,6 @@ ok:
1926 /* master is known, detach if not already detached. 1925 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1926 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1927 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1928 spin_lock(&dlm->master_lock);
1931 1929
1932 rr = atomic_read(&mle->mle_refs.refcount); 1930 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
1959 __dlm_put_mle(mle); 1957 __dlm_put_mle(mle);
1960 } 1958 }
1961 spin_unlock(&dlm->master_lock); 1959 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1960 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1961 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1962 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
1967 res->owner, namelen, name); 1964 res->owner, namelen, name);
1968 } 1965 }
1969 } 1966 }
1967 spin_unlock(&dlm->spinlock);
1970 1968
1971done: 1969done:
1972 ret = 0; 1970 ret = 0;
@@ -2002,7 +2000,7 @@ kill:
2002 __dlm_print_one_lock_resource(res); 2000 __dlm_print_one_lock_resource(res);
2003 spin_unlock(&res->spinlock); 2001 spin_unlock(&res->spinlock);
2004 spin_unlock(&dlm->spinlock); 2002 spin_unlock(&dlm->spinlock);
2005 *ret_data = (void *)res; 2003 *ret_data = (void *)res;
2006 dlm_put(dlm); 2004 dlm_put(dlm);
2007 return -EINVAL; 2005 return -EINVAL;
2008} 2006}
@@ -2040,10 +2038,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2040 item->u.am.request_from = request_from; 2038 item->u.am.request_from = request_from;
2041 item->u.am.flags = flags; 2039 item->u.am.flags = flags;
2042 2040
2043 if (ignore_higher) 2041 if (ignore_higher)
2044 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 2042 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2045 res->lockname.name); 2043 res->lockname.name);
2046 2044
2047 spin_lock(&dlm->work_lock); 2045 spin_lock(&dlm->work_lock);
2048 list_add_tail(&item->list, &dlm->work_list); 2046 list_add_tail(&item->list, &dlm->work_list);
2049 spin_unlock(&dlm->work_lock); 2047 spin_unlock(&dlm->work_lock);
@@ -2133,7 +2131,7 @@ put:
2133 * think that $RECOVERY is currently mastered by a dead node. If so, 2131 * think that $RECOVERY is currently mastered by a dead node. If so,
2134 * we wait a short time to allow that node to get notified by its own 2132 * we wait a short time to allow that node to get notified by its own
2135 * heartbeat stack, then check again. All $RECOVERY lock resources 2133 * heartbeat stack, then check again. All $RECOVERY lock resources
2136 * mastered by dead nodes are purged when the hearbeat callback is 2134 * mastered by dead nodes are purged when the hearbeat callback is
2137 * fired, so we can know for sure that it is safe to continue once 2135 * fired, so we can know for sure that it is safe to continue once
2138 * the node returns a live node or no node. */ 2136 * the node returns a live node or no node. */
2139static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 2137static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2174,7 +2172,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2174 ret = -EAGAIN; 2172 ret = -EAGAIN;
2175 } 2173 }
2176 spin_unlock(&dlm->spinlock); 2174 spin_unlock(&dlm->spinlock);
2177 mlog(0, "%s: reco lock master is %u\n", dlm->name, 2175 mlog(0, "%s: reco lock master is %u\n", dlm->name,
2178 master); 2176 master);
2179 break; 2177 break;
2180 } 2178 }
@@ -2586,7 +2584,7 @@ fail:
2586 * is complete everywhere. if the target dies while this is 2584 * is complete everywhere. if the target dies while this is
2587 * going on, some nodes could potentially see the target as the 2585 * going on, some nodes could potentially see the target as the
2588 * master, so it is important that my recovery finds the migration 2586 * master, so it is important that my recovery finds the migration
2589 * mle and sets the master to UNKNONWN. */ 2587 * mle and sets the master to UNKNOWN. */
2590 2588
2591 2589
2592 /* wait for new node to assert master */ 2590 /* wait for new node to assert master */
@@ -2602,7 +2600,7 @@ fail:
2602 2600
2603 mlog(0, "%s:%.*s: timed out during migration\n", 2601 mlog(0, "%s:%.*s: timed out during migration\n",
2604 dlm->name, res->lockname.len, res->lockname.name); 2602 dlm->name, res->lockname.len, res->lockname.name);
2605 /* avoid hang during shutdown when migrating lockres 2603 /* avoid hang during shutdown when migrating lockres
2606 * to a node which also goes down */ 2604 * to a node which also goes down */
2607 if (dlm_is_node_dead(dlm, target)) { 2605 if (dlm_is_node_dead(dlm, target)) {
2608 mlog(0, "%s:%.*s: expected migration " 2606 mlog(0, "%s:%.*s: expected migration "
@@ -2738,7 +2736,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2738 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); 2736 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2739 spin_unlock(&res->spinlock); 2737 spin_unlock(&res->spinlock);
2740 2738
2741 /* target has died, so make the caller break out of the 2739 /* target has died, so make the caller break out of the
2742 * wait_event, but caller must recheck the domain_map */ 2740 * wait_event, but caller must recheck the domain_map */
2743 spin_lock(&dlm->spinlock); 2741 spin_lock(&dlm->spinlock);
2744 if (!test_bit(mig_target, dlm->domain_map)) 2742 if (!test_bit(mig_target, dlm->domain_map))
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index d9fa3d22e17c..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
310 mlog(0, "dlm thread running for %s...\n", dlm->name); 310 mlog(0, "dlm thread running for %s...\n", dlm->name);
311 311
312 while (!kthread_should_stop()) { 312 while (!kthread_should_stop()) {
313 if (dlm_joined(dlm)) { 313 if (dlm_domain_fully_joined(dlm)) {
314 status = dlm_do_recovery(dlm); 314 status = dlm_do_recovery(dlm);
315 if (status == -EAGAIN) { 315 if (status == -EAGAIN) {
316 /* do not sleep, recheck immediately. */ 316 /* do not sleep, recheck immediately. */
@@ -1050,7 +1050,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1050 if (lock->ml.node == dead_node) { 1050 if (lock->ml.node == dead_node) {
1051 mlog(0, "AHA! there was " 1051 mlog(0, "AHA! there was "
1052 "a $RECOVERY lock for dead " 1052 "a $RECOVERY lock for dead "
1053 "node %u (%s)!\n", 1053 "node %u (%s)!\n",
1054 dead_node, dlm->name); 1054 dead_node, dlm->name);
1055 list_del_init(&lock->list); 1055 list_del_init(&lock->list);
1056 dlm_lock_put(lock); 1056 dlm_lock_put(lock);
@@ -1164,6 +1164,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1164 mres->master = master; 1164 mres->master = master;
1165} 1165}
1166 1166
1167static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
1168 struct dlm_migratable_lockres *mres,
1169 int queue)
1170{
1171 if (!lock->lksb)
1172 return;
1173
1174 /* Ignore lvb in all locks in the blocked list */
1175 if (queue == DLM_BLOCKED_LIST)
1176 return;
1177
1178 /* Only consider lvbs in locks with granted EX or PR lock levels */
1179 if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
1180 return;
1181
1182 if (dlm_lvb_is_empty(mres->lvb)) {
1183 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1184 return;
1185 }
1186
1187 /* Ensure the lvb copied for migration matches in other valid locks */
1188 if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
1189 return;
1190
1191 mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
1192 "node=%u\n",
1193 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
1194 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
1195 lock->lockres->lockname.len, lock->lockres->lockname.name,
1196 lock->ml.node);
1197 dlm_print_one_lock_resource(lock->lockres);
1198 BUG();
1199}
1167 1200
1168/* returns 1 if this lock fills the network structure, 1201/* returns 1 if this lock fills the network structure,
1169 * 0 otherwise */ 1202 * 0 otherwise */
@@ -1181,20 +1214,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
1181 ml->list = queue; 1214 ml->list = queue;
1182 if (lock->lksb) { 1215 if (lock->lksb) {
1183 ml->flags = lock->lksb->flags; 1216 ml->flags = lock->lksb->flags;
1184 /* send our current lvb */ 1217 dlm_prepare_lvb_for_migration(lock, mres, queue);
1185 if (ml->type == LKM_EXMODE ||
1186 ml->type == LKM_PRMODE) {
1187 /* if it is already set, this had better be a PR
1188 * and it has to match */
1189 if (!dlm_lvb_is_empty(mres->lvb) &&
1190 (ml->type == LKM_EXMODE ||
1191 memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
1192 mlog(ML_ERROR, "mismatched lvbs!\n");
1193 dlm_print_one_lock_resource(lock->lockres);
1194 BUG();
1195 }
1196 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1197 }
1198 } 1218 }
1199 ml->node = lock->ml.node; 1219 ml->node = lock->ml.node;
1200 mres->num_locks++; 1220 mres->num_locks++;
@@ -1730,6 +1750,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1730 struct dlm_lock *lock = NULL; 1750 struct dlm_lock *lock = NULL;
1731 u8 from = O2NM_MAX_NODES; 1751 u8 from = O2NM_MAX_NODES;
1732 unsigned int added = 0; 1752 unsigned int added = 0;
1753 __be64 c;
1733 1754
1734 mlog(0, "running %d locks for this lockres\n", mres->num_locks); 1755 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1735 for (i=0; i<mres->num_locks; i++) { 1756 for (i=0; i<mres->num_locks; i++) {
@@ -1777,19 +1798,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1777 /* lock is always created locally first, and 1798 /* lock is always created locally first, and
1778 * destroyed locally last. it must be on the list */ 1799 * destroyed locally last. it must be on the list */
1779 if (!lock) { 1800 if (!lock) {
1780 __be64 c = ml->cookie; 1801 c = ml->cookie;
1781 mlog(ML_ERROR, "could not find local lock " 1802 mlog(ML_ERROR, "Could not find local lock "
1782 "with cookie %u:%llu!\n", 1803 "with cookie %u:%llu, node %u, "
1804 "list %u, flags 0x%x, type %d, "
1805 "conv %d, highest blocked %d\n",
1783 dlm_get_lock_cookie_node(be64_to_cpu(c)), 1806 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1784 dlm_get_lock_cookie_seq(be64_to_cpu(c))); 1807 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1808 ml->node, ml->list, ml->flags, ml->type,
1809 ml->convert_type, ml->highest_blocked);
1810 __dlm_print_one_lock_resource(res);
1811 BUG();
1812 }
1813
1814 if (lock->ml.node != ml->node) {
1815 c = lock->ml.cookie;
1816 mlog(ML_ERROR, "Mismatched node# in lock "
1817 "cookie %u:%llu, name %.*s, node %u\n",
1818 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1819 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1820 res->lockname.len, res->lockname.name,
1821 lock->ml.node);
1822 c = ml->cookie;
1823 mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
1824 "node %u, list %u, flags 0x%x, type %d, "
1825 "conv %d, highest blocked %d\n",
1826 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1827 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1828 ml->node, ml->list, ml->flags, ml->type,
1829 ml->convert_type, ml->highest_blocked);
1785 __dlm_print_one_lock_resource(res); 1830 __dlm_print_one_lock_resource(res);
1786 BUG(); 1831 BUG();
1787 } 1832 }
1788 BUG_ON(lock->ml.node != ml->node);
1789 1833
1790 if (tmpq != queue) { 1834 if (tmpq != queue) {
1791 mlog(0, "lock was on %u instead of %u for %.*s\n", 1835 c = ml->cookie;
1792 j, ml->list, res->lockname.len, res->lockname.name); 1836 mlog(0, "Lock cookie %u:%llu was on list %u "
1837 "instead of list %u for %.*s\n",
1838 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1839 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1840 j, ml->list, res->lockname.len,
1841 res->lockname.name);
1842 __dlm_print_one_lock_resource(res);
1793 spin_unlock(&res->spinlock); 1843 spin_unlock(&res->spinlock);
1794 continue; 1844 continue;
1795 } 1845 }
@@ -1839,7 +1889,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1839 * the lvb. */ 1889 * the lvb. */
1840 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); 1890 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1841 } else { 1891 } else {
1842 /* otherwise, the node is sending its 1892 /* otherwise, the node is sending its
1843 * most recent valid lvb info */ 1893 * most recent valid lvb info */
1844 BUG_ON(ml->type != LKM_EXMODE && 1894 BUG_ON(ml->type != LKM_EXMODE &&
1845 ml->type != LKM_PRMODE); 1895 ml->type != LKM_PRMODE);
@@ -1886,7 +1936,7 @@ skip_lvb:
1886 spin_lock(&res->spinlock); 1936 spin_lock(&res->spinlock);
1887 list_for_each_entry(lock, queue, list) { 1937 list_for_each_entry(lock, queue, list) {
1888 if (lock->ml.cookie == ml->cookie) { 1938 if (lock->ml.cookie == ml->cookie) {
1889 __be64 c = lock->ml.cookie; 1939 c = lock->ml.cookie;
1890 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1940 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
1891 "exists on this lockres!\n", dlm->name, 1941 "exists on this lockres!\n", dlm->name,
1892 res->lockname.len, res->lockname.name, 1942 res->lockname.len, res->lockname.name,
@@ -2114,7 +2164,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2114 assert_spin_locked(&res->spinlock); 2164 assert_spin_locked(&res->spinlock);
2115 2165
2116 if (res->owner == dlm->node_num) 2166 if (res->owner == dlm->node_num)
2117 /* if this node owned the lockres, and if the dead node 2167 /* if this node owned the lockres, and if the dead node
2118 * had an EX when he died, blank out the lvb */ 2168 * had an EX when he died, blank out the lvb */
2119 search_node = dead_node; 2169 search_node = dead_node;
2120 else { 2170 else {
@@ -2152,7 +2202,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2152 2202
2153 /* this node is the lockres master: 2203 /* this node is the lockres master:
2154 * 1) remove any stale locks for the dead node 2204 * 1) remove any stale locks for the dead node
2155 * 2) if the dead node had an EX when he died, blank out the lvb 2205 * 2) if the dead node had an EX when he died, blank out the lvb
2156 */ 2206 */
2157 assert_spin_locked(&dlm->spinlock); 2207 assert_spin_locked(&dlm->spinlock);
2158 assert_spin_locked(&res->spinlock); 2208 assert_spin_locked(&res->spinlock);
@@ -2193,7 +2243,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2193 mlog(0, "%s:%.*s: freed %u locks for dead node %u, " 2243 mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
2194 "dropping ref from lockres\n", dlm->name, 2244 "dropping ref from lockres\n", dlm->name,
2195 res->lockname.len, res->lockname.name, freed, dead_node); 2245 res->lockname.len, res->lockname.name, freed, dead_node);
2196 BUG_ON(!test_bit(dead_node, res->refmap)); 2246 if(!test_bit(dead_node, res->refmap)) {
2247 mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
2248 "but ref was not set\n", dlm->name,
2249 res->lockname.len, res->lockname.name, freed, dead_node);
2250 __dlm_print_one_lock_resource(res);
2251 }
2197 dlm_lockres_clear_refmap_bit(dead_node, res); 2252 dlm_lockres_clear_refmap_bit(dead_node, res);
2198 } else if (test_bit(dead_node, res->refmap)) { 2253 } else if (test_bit(dead_node, res->refmap)) {
2199 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2254 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2260,7 +2315,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2260 } 2315 }
2261 spin_unlock(&res->spinlock); 2316 spin_unlock(&res->spinlock);
2262 continue; 2317 continue;
2263 } 2318 }
2264 spin_lock(&res->spinlock); 2319 spin_lock(&res->spinlock);
2265 /* zero the lvb if necessary */ 2320 /* zero the lvb if necessary */
2266 dlm_revalidate_lvb(dlm, res, dead_node); 2321 dlm_revalidate_lvb(dlm, res, dead_node);
@@ -2411,7 +2466,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
2411 * this function on each node racing to become the recovery 2466 * this function on each node racing to become the recovery
2412 * master will not stop attempting this until either: 2467 * master will not stop attempting this until either:
2413 * a) this node gets the EX (and becomes the recovery master), 2468 * a) this node gets the EX (and becomes the recovery master),
2414 * or b) dlm->reco.new_master gets set to some nodenum 2469 * or b) dlm->reco.new_master gets set to some nodenum
2415 * != O2NM_INVALID_NODE_NUM (another node will do the reco). 2470 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2416 * so each time a recovery master is needed, the entire cluster 2471 * so each time a recovery master is needed, the entire cluster
2417 * will sync at this point. if the new master dies, that will 2472 * will sync at this point. if the new master dies, that will
@@ -2424,7 +2479,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
2424 2479
2425 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", 2480 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
2426 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); 2481 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
2427again: 2482again:
2428 memset(&lksb, 0, sizeof(lksb)); 2483 memset(&lksb, 0, sizeof(lksb));
2429 2484
2430 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2485 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
@@ -2437,8 +2492,8 @@ again:
2437 if (ret == DLM_NORMAL) { 2492 if (ret == DLM_NORMAL) {
2438 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", 2493 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
2439 dlm->name, dlm->node_num); 2494 dlm->name, dlm->node_num);
2440 2495
2441 /* got the EX lock. check to see if another node 2496 /* got the EX lock. check to see if another node
2442 * just became the reco master */ 2497 * just became the reco master */
2443 if (dlm_reco_master_ready(dlm)) { 2498 if (dlm_reco_master_ready(dlm)) {
2444 mlog(0, "%s: got reco EX lock, but %u will " 2499 mlog(0, "%s: got reco EX lock, but %u will "
@@ -2451,12 +2506,12 @@ again:
2451 /* see if recovery was already finished elsewhere */ 2506 /* see if recovery was already finished elsewhere */
2452 spin_lock(&dlm->spinlock); 2507 spin_lock(&dlm->spinlock);
2453 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { 2508 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
2454 status = -EINVAL; 2509 status = -EINVAL;
2455 mlog(0, "%s: got reco EX lock, but " 2510 mlog(0, "%s: got reco EX lock, but "
2456 "node got recovered already\n", dlm->name); 2511 "node got recovered already\n", dlm->name);
2457 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { 2512 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2458 mlog(ML_ERROR, "%s: new master is %u " 2513 mlog(ML_ERROR, "%s: new master is %u "
2459 "but no dead node!\n", 2514 "but no dead node!\n",
2460 dlm->name, dlm->reco.new_master); 2515 dlm->name, dlm->reco.new_master);
2461 BUG(); 2516 BUG();
2462 } 2517 }
@@ -2468,7 +2523,7 @@ again:
2468 * set the master and send the messages to begin recovery */ 2523 * set the master and send the messages to begin recovery */
2469 if (!status) { 2524 if (!status) {
2470 mlog(0, "%s: dead=%u, this=%u, sending " 2525 mlog(0, "%s: dead=%u, this=%u, sending "
2471 "begin_reco now\n", dlm->name, 2526 "begin_reco now\n", dlm->name,
2472 dlm->reco.dead_node, dlm->node_num); 2527 dlm->reco.dead_node, dlm->node_num);
2473 status = dlm_send_begin_reco_message(dlm, 2528 status = dlm_send_begin_reco_message(dlm,
2474 dlm->reco.dead_node); 2529 dlm->reco.dead_node);
@@ -2501,7 +2556,7 @@ again:
2501 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", 2556 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
2502 dlm->name, dlm->node_num); 2557 dlm->name, dlm->node_num);
2503 /* another node is master. wait on 2558 /* another node is master. wait on
2504 * reco.new_master != O2NM_INVALID_NODE_NUM 2559 * reco.new_master != O2NM_INVALID_NODE_NUM
2505 * for at most one second */ 2560 * for at most one second */
2506 wait_event_timeout(dlm->dlm_reco_thread_wq, 2561 wait_event_timeout(dlm->dlm_reco_thread_wq,
2507 dlm_reco_master_ready(dlm), 2562 dlm_reco_master_ready(dlm),
@@ -2589,9 +2644,23 @@ retry:
2589 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2590 ret = 0; 2645 ret = 0;
2591 } 2646 }
2647
2648 /*
2649 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
2650 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
2651 * We are handling both for compatibility reasons.
2652 */
2653 if (ret == -EAGAIN || ret == EAGAIN) {
2654 mlog(0, "%s: trying to start recovery of node "
2655 "%u, but node %u is waiting for last recovery "
2656 "to complete, backoff for a bit\n", dlm->name,
2657 dead_node, nodenum);
2658 msleep(100);
2659 goto retry;
2660 }
2592 if (ret < 0) { 2661 if (ret < 0) {
2593 struct dlm_lock_resource *res; 2662 struct dlm_lock_resource *res;
2594 /* this is now a serious problem, possibly ENOMEM 2663 /* this is now a serious problem, possibly ENOMEM
2595 * in the network stack. must retry */ 2664 * in the network stack. must retry */
2596 mlog_errno(ret); 2665 mlog_errno(ret);
2597 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
@@ -2604,18 +2673,10 @@ retry:
2604 } else { 2673 } else {
2605 mlog(ML_ERROR, "recovery lock not found\n"); 2674 mlog(ML_ERROR, "recovery lock not found\n");
2606 } 2675 }
2607 /* sleep for a bit in hopes that we can avoid 2676 /* sleep for a bit in hopes that we can avoid
2608 * another ENOMEM */ 2677 * another ENOMEM */
2609 msleep(100); 2678 msleep(100);
2610 goto retry; 2679 goto retry;
2611 } else if (ret == EAGAIN) {
2612 mlog(0, "%s: trying to start recovery of node "
2613 "%u, but node %u is waiting for last recovery "
2614 "to complete, backoff for a bit\n", dlm->name,
2615 dead_node, nodenum);
2616 /* TODO Look into replacing msleep with cond_resched() */
2617 msleep(100);
2618 goto retry;
2619 } 2680 }
2620 } 2681 }
2621 2682
@@ -2639,7 +2700,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2639 dlm->name, br->node_idx, br->dead_node, 2700 dlm->name, br->node_idx, br->dead_node,
2640 dlm->reco.dead_node, dlm->reco.new_master); 2701 dlm->reco.dead_node, dlm->reco.new_master);
2641 spin_unlock(&dlm->spinlock); 2702 spin_unlock(&dlm->spinlock);
2642 return EAGAIN; 2703 return -EAGAIN;
2643 } 2704 }
2644 spin_unlock(&dlm->spinlock); 2705 spin_unlock(&dlm->spinlock);
2645 2706
@@ -2664,7 +2725,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2664 } 2725 }
2665 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { 2726 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2666 mlog(ML_NOTICE, "%s: dead_node previously set to %u, " 2727 mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2667 "node %u changing it to %u\n", dlm->name, 2728 "node %u changing it to %u\n", dlm->name,
2668 dlm->reco.dead_node, br->node_idx, br->dead_node); 2729 dlm->reco.dead_node, br->node_idx, br->dead_node);
2669 } 2730 }
2670 dlm_set_reco_master(dlm, br->node_idx); 2731 dlm_set_reco_master(dlm, br->node_idx);
@@ -2730,8 +2791,8 @@ stage2:
2730 if (ret < 0) { 2791 if (ret < 0) {
2731 mlog_errno(ret); 2792 mlog_errno(ret);
2732 if (dlm_is_host_down(ret)) { 2793 if (dlm_is_host_down(ret)) {
2733 /* this has no effect on this recovery 2794 /* this has no effect on this recovery
2734 * session, so set the status to zero to 2795 * session, so set the status to zero to
2735 * finish out the last recovery */ 2796 * finish out the last recovery */
2736 mlog(ML_ERROR, "node %u went down after this " 2797 mlog(ML_ERROR, "node %u went down after this "
2737 "node finished recovery.\n", nodenum); 2798 "node finished recovery.\n", nodenum);
@@ -2768,7 +2829,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2768 mlog(0, "%s: node %u finalizing recovery stage%d of " 2829 mlog(0, "%s: node %u finalizing recovery stage%d of "
2769 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, 2830 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
2770 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); 2831 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
2771 2832
2772 spin_lock(&dlm->spinlock); 2833 spin_lock(&dlm->spinlock);
2773 2834
2774 if (dlm->reco.new_master != fr->node_idx) { 2835 if (dlm->reco.new_master != fr->node_idx) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..11a6d1fd1d35 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 00f53b2aea76..b47c1b92b82b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -190,8 +189,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
190 actions &= ~(DLM_UNLOCK_REMOVE_LOCK| 189 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
191 DLM_UNLOCK_REGRANT_LOCK| 190 DLM_UNLOCK_REGRANT_LOCK|
192 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 191 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
193 } else if (status == DLM_RECOVERING || 192 } else if (status == DLM_RECOVERING ||
194 status == DLM_MIGRATING || 193 status == DLM_MIGRATING ||
195 status == DLM_FORWARD) { 194 status == DLM_FORWARD) {
196 /* must clear the actions because this unlock 195 /* must clear the actions because this unlock
197 * is about to be retried. cannot free or do 196 * is about to be retried. cannot free or do
@@ -661,14 +660,14 @@ retry:
661 if (call_ast) { 660 if (call_ast) {
662 mlog(0, "calling unlockast(%p, %d)\n", data, status); 661 mlog(0, "calling unlockast(%p, %d)\n", data, status);
663 if (is_master) { 662 if (is_master) {
664 /* it is possible that there is one last bast 663 /* it is possible that there is one last bast
665 * pending. make sure it is flushed, then 664 * pending. make sure it is flushed, then
666 * call the unlockast. 665 * call the unlockast.
667 * not an issue if this is a mastered remotely, 666 * not an issue if this is a mastered remotely,
668 * since this lock has been removed from the 667 * since this lock has been removed from the
669 * lockres queues and cannot be found. */ 668 * lockres queues and cannot be found. */
670 dlm_kick_thread(dlm, NULL); 669 dlm_kick_thread(dlm, NULL);
671 wait_event(dlm->ast_wq, 670 wait_event(dlm->ast_wq,
672 dlm_lock_basts_flushed(dlm, lock)); 671 dlm_lock_basts_flushed(dlm, lock));
673 } 672 }
674 (*unlockast)(data, status); 673 (*unlockast)(data, status);
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..b83d6107a1f5 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
46#include <linux/poll.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49 50#include "stackglue.h"
50#include "cluster/nodemanager.h"
51#include "cluster/heartbeat.h"
52#include "cluster/tcp.h"
53
54#include "dlmapi.h"
55
56#include "userdlm.h" 51#include "userdlm.h"
57
58#include "dlmfsver.h" 52#include "dlmfsver.h"
59 53
60#define MLOG_MASK_PREFIX ML_DLMFS 54#define MLOG_MASK_PREFIX ML_DLMFS
61#include "cluster/masklog.h" 55#include "cluster/masklog.h"
62 56
63#include "ocfs2_lockingver.h"
64 57
65static const struct super_operations dlmfs_ops; 58static const struct super_operations dlmfs_ops;
66static const struct file_operations dlmfs_file_operations; 59static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
71 64
72struct workqueue_struct *user_dlm_worker; 65struct workqueue_struct *user_dlm_worker;
73 66
67
68
74/* 69/*
75 * This is the userdlmfs locking protocol version. 70 * These are the ABI capabilities of dlmfs.
71 *
72 * Over time, dlmfs has added some features that were not part of the
73 * initial ABI. Unfortunately, some of these features are not detectable
74 * via standard usage. For example, Linux's default poll always returns
75 * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
76 * added poll support. Instead, we provide this list of new capabilities.
77 *
78 * Capabilities is a read-only attribute. We do it as a module parameter
79 * so we can discover it whether dlmfs is built in, loaded, or even not
80 * loaded.
76 * 81 *
77 * See fs/ocfs2/dlmglue.c for more details on locking versions. 82 * The ABI features are local to this machine's dlmfs mount. This is
83 * distinct from the locking protocol, which is concerned with inter-node
84 * interaction.
85 *
86 * Capabilities:
87 * - bast : POLLIN against the file descriptor of a held lock
88 * signifies a bast fired on the lock.
78 */ 89 */
79static const struct dlm_protocol_version user_locking_protocol = { 90#define DLMFS_CAPABILITIES "bast stackglue"
80 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, 91extern int param_set_dlmfs_capabilities(const char *val,
81 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, 92 struct kernel_param *kp)
82}; 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
95 return -EINVAL;
96}
97static int param_get_dlmfs_capabilities(char *buffer,
98 struct kernel_param *kp)
99{
100 return strlcpy(buffer, DLMFS_CAPABILITIES,
101 strlen(DLMFS_CAPABILITIES) + 1);
102}
103module_param_call(capabilities, param_set_dlmfs_capabilities,
104 param_get_dlmfs_capabilities, NULL, 0444);
105MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
106
83 107
84/* 108/*
85 * decodes a set of open flags into a valid lock level and a set of flags. 109 * decodes a set of open flags into a valid lock level and a set of flags.
@@ -88,20 +112,20 @@ static const struct dlm_protocol_version user_locking_protocol = {
88 * O_RDONLY -> PRMODE level 112 * O_RDONLY -> PRMODE level
89 * O_WRONLY -> EXMODE level 113 * O_WRONLY -> EXMODE level
90 * 114 *
91 * O_NONBLOCK -> LKM_NOQUEUE 115 * O_NONBLOCK -> NOQUEUE
92 */ 116 */
93static int dlmfs_decode_open_flags(int open_flags, 117static int dlmfs_decode_open_flags(int open_flags,
94 int *level, 118 int *level,
95 int *flags) 119 int *flags)
96{ 120{
97 if (open_flags & (O_WRONLY|O_RDWR)) 121 if (open_flags & (O_WRONLY|O_RDWR))
98 *level = LKM_EXMODE; 122 *level = DLM_LOCK_EX;
99 else 123 else
100 *level = LKM_PRMODE; 124 *level = DLM_LOCK_PR;
101 125
102 *flags = 0; 126 *flags = 0;
103 if (open_flags & O_NONBLOCK) 127 if (open_flags & O_NONBLOCK)
104 *flags |= LKM_NOQUEUE; 128 *flags |= DLM_LKF_NOQUEUE;
105 129
106 return 0; 130 return 0;
107} 131}
@@ -142,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode,
142 * to be able userspace to be able to distinguish a 166 * to be able userspace to be able to distinguish a
143 * valid lock request from one that simply couldn't be 167 * valid lock request from one that simply couldn't be
144 * granted. */ 168 * granted. */
145 if (flags & LKM_NOQUEUE && status == -EAGAIN) 169 if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
146 status = -ETXTBSY; 170 status = -ETXTBSY;
147 kfree(fp); 171 kfree(fp);
148 goto bail; 172 goto bail;
@@ -169,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode,
169 status = 0; 193 status = 0;
170 if (fp) { 194 if (fp) {
171 level = fp->fp_lock_level; 195 level = fp->fp_lock_level;
172 if (level != LKM_IVMODE) 196 if (level != DLM_LOCK_IV)
173 user_dlm_cluster_unlock(&ip->ip_lockres, level); 197 user_dlm_cluster_unlock(&ip->ip_lockres, level);
174 198
175 kfree(fp); 199 kfree(fp);
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
179 return 0; 203 return 0;
180} 204}
181 205
206/*
207 * We do ->setattr() just to override size changes. Our size is the size
208 * of the LVB and nothing else.
209 */
210static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
211{
212 int error;
213 struct inode *inode = dentry->d_inode;
214
215 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr);
217 if (!error)
218 error = inode_setattr(inode, attr);
219
220 return error;
221}
222
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
224{
225 int event = 0;
226 struct inode *inode = file->f_path.dentry->d_inode;
227 struct dlmfs_inode_private *ip = DLMFS_I(inode);
228
229 poll_wait(file, &ip->ip_lockres.l_event, wait);
230
231 spin_lock(&ip->ip_lockres.l_lock);
232 if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
233 event = POLLIN | POLLRDNORM;
234 spin_unlock(&ip->ip_lockres.l_lock);
235
236 return event;
237}
238
182static ssize_t dlmfs_file_read(struct file *filp, 239static ssize_t dlmfs_file_read(struct file *filp,
183 char __user *buf, 240 char __user *buf,
184 size_t count, 241 size_t count,
185 loff_t *ppos) 242 loff_t *ppos)
186{ 243{
187 int bytes_left; 244 int bytes_left;
188 ssize_t readlen; 245 ssize_t readlen, got;
189 char *lvb_buf; 246 char *lvb_buf;
190 struct inode *inode = filp->f_path.dentry->d_inode; 247 struct inode *inode = filp->f_path.dentry->d_inode;
191 248
@@ -205,15 +262,19 @@ static ssize_t dlmfs_file_read(struct file *filp,
205 if ((count + *ppos) > i_size_read(inode)) 262 if ((count + *ppos) > i_size_read(inode))
206 readlen = i_size_read(inode) - *ppos; 263 readlen = i_size_read(inode) - *ppos;
207 else 264 else
208 readlen = count - *ppos; 265 readlen = count;
209 266
210 lvb_buf = kmalloc(readlen, GFP_NOFS); 267 lvb_buf = kmalloc(readlen, GFP_NOFS);
211 if (!lvb_buf) 268 if (!lvb_buf)
212 return -ENOMEM; 269 return -ENOMEM;
213 270
214 user_dlm_read_lvb(inode, lvb_buf, readlen); 271 got = user_dlm_read_lvb(inode, lvb_buf, readlen);
215 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 272 if (got) {
216 readlen -= bytes_left; 273 BUG_ON(got != readlen);
274 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
275 readlen -= bytes_left;
276 } else
277 readlen = 0;
217 278
218 kfree(lvb_buf); 279 kfree(lvb_buf);
219 280
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
272 struct dlmfs_inode_private *ip = 333 struct dlmfs_inode_private *ip =
273 (struct dlmfs_inode_private *) foo; 334 (struct dlmfs_inode_private *) foo;
274 335
275 ip->ip_dlm = NULL; 336 ip->ip_conn = NULL;
276 ip->ip_parent = NULL; 337 ip->ip_parent = NULL;
277 338
278 inode_init_once(&ip->ip_vfs_inode); 339 inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
314 goto clear_fields; 375 goto clear_fields;
315 } 376 }
316 377
317 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 378 mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
318 /* we must be a directory. If required, lets unregister the 379 /* we must be a directory. If required, lets unregister the
319 * dlm context now. */ 380 * dlm context now. */
320 if (ip->ip_dlm) 381 if (ip->ip_conn)
321 user_dlm_unregister_context(ip->ip_dlm); 382 user_dlm_unregister(ip->ip_conn);
322clear_fields: 383clear_fields:
323 ip->ip_parent = NULL; 384 ip->ip_parent = NULL;
324 ip->ip_dlm = NULL; 385 ip->ip_conn = NULL;
325} 386}
326 387
327static struct backing_dev_info dlmfs_backing_dev_info = { 388static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
371 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
372 433
373 ip = DLMFS_I(inode); 434 ip = DLMFS_I(inode);
374 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 435 ip->ip_conn = DLMFS_I(parent)->ip_conn;
375 436
376 switch (mode & S_IFMT) { 437 switch (mode & S_IFMT) {
377 default: 438 default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
425 struct inode *inode = NULL; 486 struct inode *inode = NULL;
426 struct qstr *domain = &dentry->d_name; 487 struct qstr *domain = &dentry->d_name;
427 struct dlmfs_inode_private *ip; 488 struct dlmfs_inode_private *ip;
428 struct dlm_ctxt *dlm; 489 struct ocfs2_cluster_connection *conn;
429 struct dlm_protocol_version proto = user_locking_protocol;
430 490
431 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 491 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
432 492
433 /* verify that we have a proper domain */ 493 /* verify that we have a proper domain */
434 if (domain->len >= O2NM_MAX_NAME_LEN) { 494 if (domain->len >= GROUP_NAME_MAX) {
435 status = -EINVAL; 495 status = -EINVAL;
436 mlog(ML_ERROR, "invalid domain name for directory.\n"); 496 mlog(ML_ERROR, "invalid domain name for directory.\n");
437 goto bail; 497 goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
446 506
447 ip = DLMFS_I(inode); 507 ip = DLMFS_I(inode);
448 508
449 dlm = user_dlm_register_context(domain, &proto); 509 conn = user_dlm_register(domain);
450 if (IS_ERR(dlm)) { 510 if (IS_ERR(conn)) {
451 status = PTR_ERR(dlm); 511 status = PTR_ERR(conn);
452 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 512 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
453 status, domain->len, domain->name); 513 status, domain->len, domain->name);
454 goto bail; 514 goto bail;
455 } 515 }
456 ip->ip_dlm = dlm; 516 ip->ip_conn = conn;
457 517
458 inc_nlink(dir); 518 inc_nlink(dir);
459 d_instantiate(dentry, inode); 519 d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
549static const struct file_operations dlmfs_file_operations = { 609static const struct file_operations dlmfs_file_operations = {
550 .open = dlmfs_file_open, 610 .open = dlmfs_file_open,
551 .release = dlmfs_file_release, 611 .release = dlmfs_file_release,
612 .poll = dlmfs_file_poll,
552 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
553 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
554}; 615};
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
576 637
577static const struct inode_operations dlmfs_file_inode_operations = { 638static const struct inode_operations dlmfs_file_inode_operations = {
578 .getattr = simple_getattr, 639 .getattr = simple_getattr,
640 .setattr = dlmfs_file_setattr,
579}; 641};
580 642
581static int dlmfs_get_sb(struct file_system_type *fs_type, 643static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
620 } 682 }
621 cleanup_worker = 1; 683 cleanup_worker = 1;
622 684
685 user_dlm_set_locking_protocol();
623 status = register_filesystem(&dlmfs_fs_type); 686 status = register_filesystem(&dlmfs_fs_type);
624bail: 687bail:
625 if (status) { 688 if (status) {
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/crc32.h> 35#include <linux/crc32.h>
36 36
37 37#include "ocfs2_lockingver.h"
38#include "cluster/nodemanager.h" 38#include "stackglue.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h" 39#include "userdlm.h"
45 40
46#define MLOG_MASK_PREFIX ML_DLMFS 41#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h" 42#include "cluster/masklog.h"
48 43
44
45static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
46{
47 return container_of(lksb, struct user_lock_res, l_lksb);
48}
49
49static inline int user_check_wait_flag(struct user_lock_res *lockres, 50static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag) 51 int flag)
51{ 52{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
73} 74}
74 75
75/* I heart container_of... */ 76/* I heart container_of... */
76static inline struct dlm_ctxt * 77static inline struct ocfs2_cluster_connection *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) 78cluster_connection_from_user_lockres(struct user_lock_res *lockres)
78{ 79{
79 struct dlmfs_inode_private *ip; 80 struct dlmfs_inode_private *ip;
80 81
81 ip = container_of(lockres, 82 ip = container_of(lockres,
82 struct dlmfs_inode_private, 83 struct dlmfs_inode_private,
83 ip_lockres); 84 ip_lockres);
84 return ip->ip_dlm; 85 return ip->ip_conn;
85} 86}
86 87
87static struct inode * 88static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
103} 104}
104 105
105#define user_log_dlm_error(_func, _stat, _lockres) do { \ 106#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 107 mlog(ML_ERROR, "Dlm error %d while calling %s on " \
107 "resource %.*s: %s\n", dlm_errname(_stat), _func, \ 108 "resource %.*s\n", _stat, _func, \
108 _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ 109 _lockres->l_namelen, _lockres->l_name); \
109} while (0) 110} while (0)
110 111
111/* WARNING: This function lives in a world where the only three lock 112/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
113 * lock types are added. */ 114 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level) 115static inline int user_highest_compat_lock_level(int level)
115{ 116{
116 int new_level = LKM_EXMODE; 117 int new_level = DLM_LOCK_EX;
117 118
118 if (level == LKM_EXMODE) 119 if (level == DLM_LOCK_EX)
119 new_level = LKM_NLMODE; 120 new_level = DLM_LOCK_NL;
120 else if (level == LKM_PRMODE) 121 else if (level == DLM_LOCK_PR)
121 new_level = LKM_PRMODE; 122 new_level = DLM_LOCK_PR;
122 return new_level; 123 return new_level;
123} 124}
124 125
125static void user_ast(void *opaque) 126static void user_ast(struct ocfs2_dlm_lksb *lksb)
126{ 127{
127 struct user_lock_res *lockres = opaque; 128 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
128 struct dlm_lockstatus *lksb; 129 int status;
129 130
130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, 131 mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
131 lockres->l_name); 132 lockres->l_namelen, lockres->l_name, lockres->l_level,
133 lockres->l_requested);
132 134
133 spin_lock(&lockres->l_lock); 135 spin_lock(&lockres->l_lock);
134 136
135 lksb = &(lockres->l_lksb); 137 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
136 if (lksb->status != DLM_NORMAL) { 138 if (status) {
137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", 139 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
138 lksb->status, lockres->l_namelen, lockres->l_name); 140 status, lockres->l_namelen, lockres->l_name);
139 spin_unlock(&lockres->l_lock); 141 spin_unlock(&lockres->l_lock);
140 return; 142 return;
141 } 143 }
142 144
143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 145 mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
144 "Lockres %.*s, requested ivmode. flags 0x%x\n", 146 "Lockres %.*s, requested ivmode. flags 0x%x\n",
145 lockres->l_namelen, lockres->l_name, lockres->l_flags); 147 lockres->l_namelen, lockres->l_name, lockres->l_flags);
146 148
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
148 if (lockres->l_requested < lockres->l_level) { 150 if (lockres->l_requested < lockres->l_level) {
149 if (lockres->l_requested <= 151 if (lockres->l_requested <=
150 user_highest_compat_lock_level(lockres->l_blocking)) { 152 user_highest_compat_lock_level(lockres->l_blocking)) {
151 lockres->l_blocking = LKM_NLMODE; 153 lockres->l_blocking = DLM_LOCK_NL;
152 lockres->l_flags &= ~USER_LOCK_BLOCKED; 154 lockres->l_flags &= ~USER_LOCK_BLOCKED;
153 } 155 }
154 } 156 }
155 157
156 lockres->l_level = lockres->l_requested; 158 lockres->l_level = lockres->l_requested;
157 lockres->l_requested = LKM_IVMODE; 159 lockres->l_requested = DLM_LOCK_IV;
158 lockres->l_flags |= USER_LOCK_ATTACHED; 160 lockres->l_flags |= USER_LOCK_ATTACHED;
159 lockres->l_flags &= ~USER_LOCK_BUSY; 161 lockres->l_flags &= ~USER_LOCK_BUSY;
160 162
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
193 return; 195 return;
194 196
195 switch (lockres->l_blocking) { 197 switch (lockres->l_blocking) {
196 case LKM_EXMODE: 198 case DLM_LOCK_EX:
197 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 199 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
198 queue = 1; 200 queue = 1;
199 break; 201 break;
200 case LKM_PRMODE: 202 case DLM_LOCK_PR:
201 if (!lockres->l_ex_holders) 203 if (!lockres->l_ex_holders)
202 queue = 1; 204 queue = 1;
203 break; 205 break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
209 __user_dlm_queue_lockres(lockres); 211 __user_dlm_queue_lockres(lockres);
210} 212}
211 213
212static void user_bast(void *opaque, int level) 214static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
213{ 215{
214 struct user_lock_res *lockres = opaque; 216 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
215 217
216 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", 218 mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
217 lockres->l_namelen, lockres->l_name, level); 219 lockres->l_namelen, lockres->l_name, level, lockres->l_level);
218 220
219 spin_lock(&lockres->l_lock); 221 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 222 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
227 wake_up(&lockres->l_event); 229 wake_up(&lockres->l_event);
228} 230}
229 231
230static void user_unlock_ast(void *opaque, enum dlm_status status) 232static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
231{ 233{
232 struct user_lock_res *lockres = opaque; 234 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
233 235
234 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, 236 mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
235 lockres->l_name); 237 lockres->l_namelen, lockres->l_name, lockres->l_flags);
236 238
237 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 239 if (status)
238 mlog(ML_ERROR, "Dlm returns status %d\n", status); 240 mlog(ML_ERROR, "dlm returns status %d\n", status);
239 241
240 spin_lock(&lockres->l_lock); 242 spin_lock(&lockres->l_lock);
241 /* The teardown flag gets set early during the unlock process, 243 /* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
243 * for a concurrent cancel. */ 245 * for a concurrent cancel. */
244 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN 246 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
245 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 247 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
246 lockres->l_level = LKM_IVMODE; 248 lockres->l_level = DLM_LOCK_IV;
247 } else if (status == DLM_CANCELGRANT) { 249 } else if (status == DLM_CANCELGRANT) {
248 /* We tried to cancel a convert request, but it was 250 /* We tried to cancel a convert request, but it was
249 * already granted. Don't clear the busy flag - the 251 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
254 } else { 256 } else {
255 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 257 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
256 /* Cancel succeeded, we want to re-queue */ 258 /* Cancel succeeded, we want to re-queue */
257 lockres->l_requested = LKM_IVMODE; /* cancel an 259 lockres->l_requested = DLM_LOCK_IV; /* cancel an
258 * upconvert 260 * upconvert
259 * request. */ 261 * request. */
260 lockres->l_flags &= ~USER_LOCK_IN_CANCEL; 262 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
271 wake_up(&lockres->l_event); 273 wake_up(&lockres->l_event);
272} 274}
273 275
276/*
277 * This is the userdlmfs locking protocol version.
278 *
279 * See fs/ocfs2/dlmglue.c for more details on locking versions.
280 */
281static struct ocfs2_locking_protocol user_dlm_lproto = {
282 .lp_max_version = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285 },
286 .lp_lock_ast = user_ast,
287 .lp_blocking_ast = user_bast,
288 .lp_unlock_ast = user_unlock_ast,
289};
290
274static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) 291static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
275{ 292{
276 struct inode *inode; 293 struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
283 int new_level, status; 300 int new_level, status;
284 struct user_lock_res *lockres = 301 struct user_lock_res *lockres =
285 container_of(work, struct user_lock_res, l_work); 302 container_of(work, struct user_lock_res, l_work);
286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 303 struct ocfs2_cluster_connection *conn =
304 cluster_connection_from_user_lockres(lockres);
287 305
288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen, 306 mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
289 lockres->l_name);
290 307
291 spin_lock(&lockres->l_lock); 308 spin_lock(&lockres->l_lock);
292 309
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
304 * flag, and finally we might get another bast which re-queues 321 * flag, and finally we might get another bast which re-queues
305 * us before our ast for the downconvert is called. */ 322 * us before our ast for the downconvert is called. */
306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 323 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
324 mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
325 lockres->l_namelen, lockres->l_name);
307 spin_unlock(&lockres->l_lock); 326 spin_unlock(&lockres->l_lock);
308 goto drop_ref; 327 goto drop_ref;
309 } 328 }
310 329
311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 330 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
331 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
332 lockres->l_namelen, lockres->l_name);
312 spin_unlock(&lockres->l_lock); 333 spin_unlock(&lockres->l_lock);
313 goto drop_ref; 334 goto drop_ref;
314 } 335 }
315 336
316 if (lockres->l_flags & USER_LOCK_BUSY) { 337 if (lockres->l_flags & USER_LOCK_BUSY) {
317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 338 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
339 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
340 lockres->l_namelen, lockres->l_name);
318 spin_unlock(&lockres->l_lock); 341 spin_unlock(&lockres->l_lock);
319 goto drop_ref; 342 goto drop_ref;
320 } 343 }
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
322 lockres->l_flags |= USER_LOCK_IN_CANCEL; 345 lockres->l_flags |= USER_LOCK_IN_CANCEL;
323 spin_unlock(&lockres->l_lock); 346 spin_unlock(&lockres->l_lock);
324 347
325 status = dlmunlock(dlm, 348 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
326 &lockres->l_lksb, 349 DLM_LKF_CANCEL);
327 LKM_CANCEL, 350 if (status)
328 user_unlock_ast, 351 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
329 lockres);
330 if (status != DLM_NORMAL)
331 user_log_dlm_error("dlmunlock", status, lockres);
332 goto drop_ref; 352 goto drop_ref;
333 } 353 }
334 354
335 /* If there are still incompat holders, we can exit safely 355 /* If there are still incompat holders, we can exit safely
336 * without worrying about re-queueing this lock as that will 356 * without worrying about re-queueing this lock as that will
337 * happen on the last call to user_cluster_unlock. */ 357 * happen on the last call to user_cluster_unlock. */
338 if ((lockres->l_blocking == LKM_EXMODE) 358 if ((lockres->l_blocking == DLM_LOCK_EX)
339 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 359 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
340 spin_unlock(&lockres->l_lock); 360 spin_unlock(&lockres->l_lock);
341 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", 361 mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
342 lockres->l_ro_holders, lockres->l_ex_holders); 362 lockres->l_namelen, lockres->l_name,
363 lockres->l_ex_holders, lockres->l_ro_holders);
343 goto drop_ref; 364 goto drop_ref;
344 } 365 }
345 366
346 if ((lockres->l_blocking == LKM_PRMODE) 367 if ((lockres->l_blocking == DLM_LOCK_PR)
347 && lockres->l_ex_holders) { 368 && lockres->l_ex_holders) {
348 spin_unlock(&lockres->l_lock); 369 spin_unlock(&lockres->l_lock);
349 mlog(0, "can't downconvert for pr: ex = %u\n", 370 mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
350 lockres->l_ex_holders); 371 lockres->l_namelen, lockres->l_name,
372 lockres->l_ex_holders);
351 goto drop_ref; 373 goto drop_ref;
352 } 374 }
353 375
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
355 new_level = user_highest_compat_lock_level(lockres->l_blocking); 377 new_level = user_highest_compat_lock_level(lockres->l_blocking);
356 lockres->l_requested = new_level; 378 lockres->l_requested = new_level;
357 lockres->l_flags |= USER_LOCK_BUSY; 379 lockres->l_flags |= USER_LOCK_BUSY;
358 mlog(0, "Downconvert lock from %d to %d\n", 380 mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
359 lockres->l_level, new_level); 381 lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
360 spin_unlock(&lockres->l_lock); 382 spin_unlock(&lockres->l_lock);
361 383
362 /* need lock downconvert request now... */ 384 /* need lock downconvert request now... */
363 status = dlmlock(dlm, 385 status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
364 new_level, 386 DLM_LKF_CONVERT|DLM_LKF_VALBLK,
365 &lockres->l_lksb, 387 lockres->l_name,
366 LKM_CONVERT|LKM_VALBLK, 388 lockres->l_namelen);
367 lockres->l_name, 389 if (status) {
368 lockres->l_namelen, 390 user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
369 user_ast,
370 lockres,
371 user_bast);
372 if (status != DLM_NORMAL) {
373 user_log_dlm_error("dlmlock", status, lockres);
374 user_recover_from_dlm_error(lockres); 391 user_recover_from_dlm_error(lockres);
375 } 392 }
376 393
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
382 int level) 399 int level)
383{ 400{
384 switch(level) { 401 switch(level) {
385 case LKM_EXMODE: 402 case DLM_LOCK_EX:
386 lockres->l_ex_holders++; 403 lockres->l_ex_holders++;
387 break; 404 break;
388 case LKM_PRMODE: 405 case DLM_LOCK_PR:
389 lockres->l_ro_holders++; 406 lockres->l_ro_holders++;
390 break; 407 break;
391 default: 408 default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
410 int lkm_flags) 427 int lkm_flags)
411{ 428{
412 int status, local_flags; 429 int status, local_flags;
413 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 430 struct ocfs2_cluster_connection *conn =
431 cluster_connection_from_user_lockres(lockres);
414 432
415 if (level != LKM_EXMODE && 433 if (level != DLM_LOCK_EX &&
416 level != LKM_PRMODE) { 434 level != DLM_LOCK_PR) {
417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 435 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
418 lockres->l_namelen, lockres->l_name); 436 lockres->l_namelen, lockres->l_name);
419 status = -EINVAL; 437 status = -EINVAL;
420 goto bail; 438 goto bail;
421 } 439 }
422 440
423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", 441 mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
424 lockres->l_namelen, lockres->l_name, 442 lockres->l_namelen, lockres->l_name, level, lkm_flags);
425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
426 lkm_flags);
427 443
428again: 444again:
429 if (signal_pending(current)) { 445 if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
457 } 473 }
458 474
459 if (level > lockres->l_level) { 475 if (level > lockres->l_level) {
460 local_flags = lkm_flags | LKM_VALBLK; 476 local_flags = lkm_flags | DLM_LKF_VALBLK;
461 if (lockres->l_level != LKM_IVMODE) 477 if (lockres->l_level != DLM_LOCK_IV)
462 local_flags |= LKM_CONVERT; 478 local_flags |= DLM_LKF_CONVERT;
463 479
464 lockres->l_requested = level; 480 lockres->l_requested = level;
465 lockres->l_flags |= USER_LOCK_BUSY; 481 lockres->l_flags |= USER_LOCK_BUSY;
466 spin_unlock(&lockres->l_lock); 482 spin_unlock(&lockres->l_lock);
467 483
468 BUG_ON(level == LKM_IVMODE); 484 BUG_ON(level == DLM_LOCK_IV);
469 BUG_ON(level == LKM_NLMODE); 485 BUG_ON(level == DLM_LOCK_NL);
470 486
471 /* call dlm_lock to upgrade lock now */ 487 /* call dlm_lock to upgrade lock now */
472 status = dlmlock(dlm, 488 status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
473 level, 489 local_flags, lockres->l_name,
474 &lockres->l_lksb, 490 lockres->l_namelen);
475 local_flags, 491 if (status) {
476 lockres->l_name, 492 if ((lkm_flags & DLM_LKF_NOQUEUE) &&
477 lockres->l_namelen, 493 (status != -EAGAIN))
478 user_ast, 494 user_log_dlm_error("ocfs2_dlm_lock",
479 lockres, 495 status, lockres);
480 user_bast);
481 if (status != DLM_NORMAL) {
482 if ((lkm_flags & LKM_NOQUEUE) &&
483 (status == DLM_NOTQUEUED))
484 status = -EAGAIN;
485 else {
486 user_log_dlm_error("dlmlock", status, lockres);
487 status = -EINVAL;
488 }
489 user_recover_from_dlm_error(lockres); 496 user_recover_from_dlm_error(lockres);
490 goto bail; 497 goto bail;
491 } 498 }
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
506 int level) 513 int level)
507{ 514{
508 switch(level) { 515 switch(level) {
509 case LKM_EXMODE: 516 case DLM_LOCK_EX:
510 BUG_ON(!lockres->l_ex_holders); 517 BUG_ON(!lockres->l_ex_holders);
511 lockres->l_ex_holders--; 518 lockres->l_ex_holders--;
512 break; 519 break;
513 case LKM_PRMODE: 520 case DLM_LOCK_PR:
514 BUG_ON(!lockres->l_ro_holders); 521 BUG_ON(!lockres->l_ro_holders);
515 lockres->l_ro_holders--; 522 lockres->l_ro_holders--;
516 break; 523 break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
522void user_dlm_cluster_unlock(struct user_lock_res *lockres, 529void user_dlm_cluster_unlock(struct user_lock_res *lockres,
523 int level) 530 int level)
524{ 531{
525 if (level != LKM_EXMODE && 532 if (level != DLM_LOCK_EX &&
526 level != LKM_PRMODE) { 533 level != DLM_LOCK_PR) {
527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 534 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name); 535 lockres->l_namelen, lockres->l_name);
529 return; 536 return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
540 unsigned int len) 547 unsigned int len)
541{ 548{
542 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 549 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
543 char *lvb = lockres->l_lksb.lvb; 550 char *lvb;
544 551
545 BUG_ON(len > DLM_LVB_LEN); 552 BUG_ON(len > DLM_LVB_LEN);
546 553
547 spin_lock(&lockres->l_lock); 554 spin_lock(&lockres->l_lock);
548 555
549 BUG_ON(lockres->l_level < LKM_EXMODE); 556 BUG_ON(lockres->l_level < DLM_LOCK_EX);
557 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
550 memcpy(lvb, val, len); 558 memcpy(lvb, val, len);
551 559
552 spin_unlock(&lockres->l_lock); 560 spin_unlock(&lockres->l_lock);
553} 561}
554 562
555void user_dlm_read_lvb(struct inode *inode, 563ssize_t user_dlm_read_lvb(struct inode *inode,
556 char *val, 564 char *val,
557 unsigned int len) 565 unsigned int len)
558{ 566{
559 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 567 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
560 char *lvb = lockres->l_lksb.lvb; 568 char *lvb;
569 ssize_t ret = len;
561 570
562 BUG_ON(len > DLM_LVB_LEN); 571 BUG_ON(len > DLM_LVB_LEN);
563 572
564 spin_lock(&lockres->l_lock); 573 spin_lock(&lockres->l_lock);
565 574
566 BUG_ON(lockres->l_level < LKM_PRMODE); 575 BUG_ON(lockres->l_level < DLM_LOCK_PR);
567 memcpy(val, lvb, len); 576 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
577 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
578 memcpy(val, lvb, len);
579 } else
580 ret = 0;
568 581
569 spin_unlock(&lockres->l_lock); 582 spin_unlock(&lockres->l_lock);
583 return ret;
570} 584}
571 585
572void user_dlm_lock_res_init(struct user_lock_res *lockres, 586void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
576 590
577 spin_lock_init(&lockres->l_lock); 591 spin_lock_init(&lockres->l_lock);
578 init_waitqueue_head(&lockres->l_event); 592 init_waitqueue_head(&lockres->l_event);
579 lockres->l_level = LKM_IVMODE; 593 lockres->l_level = DLM_LOCK_IV;
580 lockres->l_requested = LKM_IVMODE; 594 lockres->l_requested = DLM_LOCK_IV;
581 lockres->l_blocking = LKM_IVMODE; 595 lockres->l_blocking = DLM_LOCK_IV;
582 596
583 /* should have been checked before getting here. */ 597 /* should have been checked before getting here. */
584 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); 598 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
592int user_dlm_destroy_lock(struct user_lock_res *lockres) 606int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{ 607{
594 int status = -EBUSY; 608 int status = -EBUSY;
595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 609 struct ocfs2_cluster_connection *conn =
610 cluster_connection_from_user_lockres(lockres);
596 611
597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); 612 mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
598 613
599 spin_lock(&lockres->l_lock); 614 spin_lock(&lockres->l_lock);
600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
627 lockres->l_flags |= USER_LOCK_BUSY; 642 lockres->l_flags |= USER_LOCK_BUSY;
628 spin_unlock(&lockres->l_lock); 643 spin_unlock(&lockres->l_lock);
629 644
630 status = dlmunlock(dlm, 645 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
631 &lockres->l_lksb, 646 if (status) {
632 LKM_VALBLK, 647 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
633 user_unlock_ast,
634 lockres);
635 if (status != DLM_NORMAL) {
636 user_log_dlm_error("dlmunlock", status, lockres);
637 status = -EINVAL;
638 goto bail; 648 goto bail;
639 } 649 }
640 650
@@ -645,32 +655,34 @@ bail:
645 return status; 655 return status;
646} 656}
647 657
648struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 658static void user_dlm_recovery_handler_noop(int node_num,
649 struct dlm_protocol_version *proto) 659 void *recovery_data)
650{ 660{
651 struct dlm_ctxt *dlm; 661 /* We ignore recovery events */
652 u32 dlm_key; 662 return;
653 char *domain; 663}
654
655 domain = kmalloc(name->len + 1, GFP_NOFS);
656 if (!domain) {
657 mlog_errno(-ENOMEM);
658 return ERR_PTR(-ENOMEM);
659 }
660 664
661 dlm_key = crc32_le(0, name->name, name->len); 665void user_dlm_set_locking_protocol(void)
666{
667 ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
668}
662 669
663 snprintf(domain, name->len + 1, "%.*s", name->len, name->name); 670struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
671{
672 int rc;
673 struct ocfs2_cluster_connection *conn;
664 674
665 dlm = dlm_register_domain(domain, dlm_key, proto); 675 rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
666 if (IS_ERR(dlm)) 676 &user_dlm_lproto,
667 mlog_errno(PTR_ERR(dlm)); 677 user_dlm_recovery_handler_noop,
678 NULL, &conn);
679 if (rc)
680 mlog_errno(rc);
668 681
669 kfree(domain); 682 return rc ? ERR_PTR(rc) : conn;
670 return dlm;
671} 683}
672 684
673void user_dlm_unregister_context(struct dlm_ctxt *dlm) 685void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
674{ 686{
675 dlm_unregister_domain(dlm); 687 ocfs2_cluster_disconnect(conn, 0);
676} 688}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
57 int l_level; 57 int l_level;
58 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
59 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
60 struct dlm_lockstatus l_lksb; 60 struct ocfs2_dlm_lksb l_lksb;
61 61
62 int l_requested; 62 int l_requested;
63 int l_blocking; 63 int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
80void user_dlm_write_lvb(struct inode *inode, 80void user_dlm_write_lvb(struct inode *inode,
81 const char *val, 81 const char *val,
82 unsigned int len); 82 unsigned int len);
83void user_dlm_read_lvb(struct inode *inode, 83ssize_t user_dlm_read_lvb(struct inode *inode,
84 char *val, 84 char *val,
85 unsigned int len); 85 unsigned int len);
86struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 86struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
87 struct dlm_protocol_version *proto); 87void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
88void user_dlm_unregister_context(struct dlm_ctxt *dlm); 88void user_dlm_set_locking_protocol(void);
89 89
90struct dlmfs_inode_private { 90struct dlmfs_inode_private {
91 struct dlm_ctxt *ip_dlm; 91 struct ocfs2_cluster_connection *ip_conn;
92 92
93 struct user_lock_res ip_lockres; /* unused for directories. */ 93 struct user_lock_res ip_lockres; /* unused for directories. */
94 struct inode *ip_parent; 94 struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194cb..50c4ee805da4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
298} 298}
299 299
300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
300static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
301{ 306{
302 BUG_ON(!ocfs2_is_inode_lock(lockres)); 307 BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -875,6 +880,14 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
875 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 880 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
876 881
877 lockres->l_level = lockres->l_requested; 882 lockres->l_level = lockres->l_requested;
883
884 /*
885 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
886 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
887 * downconverting the lock before the upconvert has fully completed.
888 */
889 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
890
878 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 891 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
879 892
880 mlog_exit_void(); 893 mlog_exit_void();
@@ -907,8 +920,6 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
907 920
908 assert_spin_locked(&lockres->l_lock); 921 assert_spin_locked(&lockres->l_lock);
909 922
910 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
911
912 if (level > lockres->l_blocking) { 923 if (level > lockres->l_blocking) {
913 /* only schedule a downconvert if we haven't already scheduled 924 /* only schedule a downconvert if we haven't already scheduled
914 * one that goes low enough to satisfy the level we're 925 * one that goes low enough to satisfy the level we're
@@ -921,6 +932,13 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
921 lockres->l_blocking = level; 932 lockres->l_blocking = level;
922 } 933 }
923 934
935 mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
936 lockres->l_name, level, lockres->l_level, lockres->l_blocking,
937 needs_downconvert);
938
939 if (needs_downconvert)
940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
941
924 mlog_exit(needs_downconvert); 942 mlog_exit(needs_downconvert);
925 return needs_downconvert; 943 return needs_downconvert;
926} 944}
@@ -1031,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1031 return lockres->l_pending_gen; 1049 return lockres->l_pending_gen;
1032} 1050}
1033 1051
1034 1052static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
1035static void ocfs2_blocking_ast(void *opaque, int level)
1036{ 1053{
1037 struct ocfs2_lock_res *lockres = opaque; 1054 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1038 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1055 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1039 int needs_downconvert; 1056 int needs_downconvert;
1040 unsigned long flags; 1057 unsigned long flags;
1041 1058
1042 BUG_ON(level <= DLM_LOCK_NL); 1059 BUG_ON(level <= DLM_LOCK_NL);
1043 1060
1044 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 1061 mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
1045 lockres->l_name, level, lockres->l_level, 1062 "type %s\n", lockres->l_name, level, lockres->l_level,
1046 ocfs2_lock_type_string(lockres->l_type)); 1063 ocfs2_lock_type_string(lockres->l_type));
1047 1064
1048 /* 1065 /*
@@ -1063,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
1063 ocfs2_wake_downconvert_thread(osb); 1080 ocfs2_wake_downconvert_thread(osb);
1064} 1081}
1065 1082
1066static void ocfs2_locking_ast(void *opaque) 1083static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
1067{ 1084{
1068 struct ocfs2_lock_res *lockres = opaque; 1085 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1069 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1086 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1070 unsigned long flags; 1087 unsigned long flags;
1071 int status; 1088 int status;
@@ -1086,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
1086 return; 1103 return;
1087 } 1104 }
1088 1105
1106 mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
1107 "level %d => %d\n", lockres->l_name, lockres->l_action,
1108 lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
1109
1089 switch(lockres->l_action) { 1110 switch(lockres->l_action) {
1090 case OCFS2_AST_ATTACH: 1111 case OCFS2_AST_ATTACH:
1091 ocfs2_generic_handle_attach_action(lockres); 1112 ocfs2_generic_handle_attach_action(lockres);
@@ -1098,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
1098 ocfs2_generic_handle_downconvert_action(lockres); 1119 ocfs2_generic_handle_downconvert_action(lockres);
1099 break; 1120 break;
1100 default: 1121 default:
1101 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 1122 mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
1102 "lockres flags = 0x%lx, unlock action: %u\n", 1123 "flags 0x%lx, unlock: %u\n",
1103 lockres->l_name, lockres->l_action, lockres->l_flags, 1124 lockres->l_name, lockres->l_action, lockres->l_flags,
1104 lockres->l_unlock_action); 1125 lockres->l_unlock_action);
1105 BUG(); 1126 BUG();
@@ -1125,6 +1146,88 @@ out:
1125 spin_unlock_irqrestore(&lockres->l_lock, flags); 1146 spin_unlock_irqrestore(&lockres->l_lock, flags);
1126} 1147}
1127 1148
1149static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1150{
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags;
1153
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action);
1158
1159 spin_lock_irqsave(&lockres->l_lock, flags);
1160 if (error) {
1161 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1162 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return;
1167 }
1168
1169 switch(lockres->l_unlock_action) {
1170 case OCFS2_UNLOCK_CANCEL_CONVERT:
1171 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1172 lockres->l_action = OCFS2_AST_INVALID;
1173 /* Downconvert thread may have requeued this lock, we
1174 * need to wake it. */
1175 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1176 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1177 break;
1178 case OCFS2_UNLOCK_DROP_LOCK:
1179 lockres->l_level = DLM_LOCK_IV;
1180 break;
1181 default:
1182 BUG();
1183 }
1184
1185 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191}
1192
1193/*
1194 * This is the filesystem locking protocol. It provides the lock handling
1195 * hooks for the underlying DLM. It has a maximum version number.
1196 * The version number allows interoperability with systems running at
1197 * the same major number and an equal or smaller minor number.
1198 *
1199 * Whenever the filesystem does new things with locks (adds or removes a
1200 * lock, orders them differently, does different things underneath a lock),
1201 * the version must be changed. The protocol is negotiated when joining
1202 * the dlm domain. A node may join the domain if its major version is
1203 * identical to all other nodes and its minor version is greater than
1204 * or equal to all other nodes. When its minor version is greater than
1205 * the other nodes, it will run at the minor version specified by the
1206 * other nodes.
1207 *
1208 * If a locking change is made that will not be compatible with older
1209 * versions, the major number must be increased and the minor version set
1210 * to zero. If a change merely adds a behavior that can be disabled when
1211 * speaking to older versions, the minor version must be increased. If a
1212 * change adds a fully backwards compatible change (eg, LVB changes that
1213 * are just ignored by older versions), the version does not need to be
1214 * updated.
1215 */
1216static struct ocfs2_locking_protocol lproto = {
1217 .lp_max_version = {
1218 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1219 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1220 },
1221 .lp_lock_ast = ocfs2_locking_ast,
1222 .lp_blocking_ast = ocfs2_blocking_ast,
1223 .lp_unlock_ast = ocfs2_unlock_ast,
1224};
1225
1226void ocfs2_set_locking_protocol(void)
1227{
1228 ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1229}
1230
1128static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 1231static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1129 int convert) 1232 int convert)
1130{ 1233{
@@ -1133,6 +1236,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1133 mlog_entry_void(); 1236 mlog_entry_void();
1134 spin_lock_irqsave(&lockres->l_lock, flags); 1237 spin_lock_irqsave(&lockres->l_lock, flags);
1135 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 1238 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1239 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1136 if (convert) 1240 if (convert)
1137 lockres->l_action = OCFS2_AST_INVALID; 1241 lockres->l_action = OCFS2_AST_INVALID;
1138 else 1242 else
@@ -1179,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1179 &lockres->l_lksb, 1283 &lockres->l_lksb,
1180 dlm_flags, 1284 dlm_flags,
1181 lockres->l_name, 1285 lockres->l_name,
1182 OCFS2_LOCK_ID_MAX_LEN - 1, 1286 OCFS2_LOCK_ID_MAX_LEN - 1);
1183 lockres);
1184 lockres_clear_pending(lockres, gen, osb); 1287 lockres_clear_pending(lockres, gen, osb);
1185 if (ret) { 1288 if (ret) {
1186 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1289 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1323,13 +1426,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1323again: 1426again:
1324 wait = 0; 1427 wait = 0;
1325 1428
1429 spin_lock_irqsave(&lockres->l_lock, flags);
1430
1326 if (catch_signals && signal_pending(current)) { 1431 if (catch_signals && signal_pending(current)) {
1327 ret = -ERESTARTSYS; 1432 ret = -ERESTARTSYS;
1328 goto out; 1433 goto unlock;
1329 } 1434 }
1330 1435
1331 spin_lock_irqsave(&lockres->l_lock, flags);
1332
1333 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, 1436 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
1334 "Cluster lock called on freeing lockres %s! flags " 1437 "Cluster lock called on freeing lockres %s! flags "
1335 "0x%lx\n", lockres->l_name, lockres->l_flags); 1438 "0x%lx\n", lockres->l_name, lockres->l_flags);
@@ -1346,6 +1449,25 @@ again:
1346 goto unlock; 1449 goto unlock;
1347 } 1450 }
1348 1451
1452 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
1453 /*
1454 * We've upconverted. If the lock now has a level we can
1455 * work with, we take it. If, however, the lock is not at the
1456 * required level, we go thru the full cycle. One way this could
1457 * happen is if a process requesting an upconvert to PR is
1458 * closely followed by another requesting upconvert to an EX.
1459 * If the process requesting EX lands here, we want it to
1460 * continue attempting to upconvert and let the process
1461 * requesting PR take the lock.
1462 * If multiple processes request upconvert to PR, the first one
1463 * here will take the lock. The others will have to go thru the
1464 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
1465 * downconvert request.
1466 */
1467 if (level <= lockres->l_level)
1468 goto update_holders;
1469 }
1470
1349 if (lockres->l_flags & OCFS2_LOCK_BLOCKED && 1471 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
1350 !ocfs2_may_continue_on_blocked_lock(lockres, level)) { 1472 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
1351 /* is the lock is currently blocked on behalf of 1473 /* is the lock is currently blocked on behalf of
@@ -1383,7 +1505,7 @@ again:
1383 BUG_ON(level == DLM_LOCK_IV); 1505 BUG_ON(level == DLM_LOCK_IV);
1384 BUG_ON(level == DLM_LOCK_NL); 1506 BUG_ON(level == DLM_LOCK_NL);
1385 1507
1386 mlog(0, "lock %s, convert from %d to level = %d\n", 1508 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1387 lockres->l_name, lockres->l_level, level); 1509 lockres->l_name, lockres->l_level, level);
1388 1510
1389 /* call dlm_lock to upgrade lock now */ 1511 /* call dlm_lock to upgrade lock now */
@@ -1392,8 +1514,7 @@ again:
1392 &lockres->l_lksb, 1514 &lockres->l_lksb,
1393 lkm_flags, 1515 lkm_flags,
1394 lockres->l_name, 1516 lockres->l_name,
1395 OCFS2_LOCK_ID_MAX_LEN - 1, 1517 OCFS2_LOCK_ID_MAX_LEN - 1);
1396 lockres);
1397 lockres_clear_pending(lockres, gen, osb); 1518 lockres_clear_pending(lockres, gen, osb);
1398 if (ret) { 1519 if (ret) {
1399 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1520 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1416,11 +1537,14 @@ again:
1416 goto again; 1537 goto again;
1417 } 1538 }
1418 1539
1540update_holders:
1419 /* Ok, if we get here then we're good to go. */ 1541 /* Ok, if we get here then we're good to go. */
1420 ocfs2_inc_holders(lockres, level); 1542 ocfs2_inc_holders(lockres, level);
1421 1543
1422 ret = 0; 1544 ret = 0;
1423unlock: 1545unlock:
1546 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1547
1424 spin_unlock_irqrestore(&lockres->l_lock, flags); 1548 spin_unlock_irqrestore(&lockres->l_lock, flags);
1425out: 1549out:
1426 /* 1550 /*
@@ -1757,7 +1881,7 @@ out:
1757 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of 1881 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1758 * flock() calls. The locking approach this requires is sufficiently 1882 * flock() calls. The locking approach this requires is sufficiently
1759 * different from all other cluster lock types that we implement a 1883 * different from all other cluster lock types that we implement a
1760 * seperate path to the "low-level" dlm calls. In particular: 1884 * separate path to the "low-level" dlm calls. In particular:
1761 * 1885 *
1762 * - No optimization of lock levels is done - we take at exactly 1886 * - No optimization of lock levels is done - we take at exactly
1763 * what's been requested. 1887 * what's been requested.
@@ -1827,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1827 spin_unlock_irqrestore(&lockres->l_lock, flags); 1951 spin_unlock_irqrestore(&lockres->l_lock, flags);
1828 1952
1829 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, 1953 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1830 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1954 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
1831 lockres);
1832 if (ret) { 1955 if (ret) {
1833 if (!trylock || (ret != -EAGAIN)) { 1956 if (!trylock || (ret != -EAGAIN)) {
1834 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1957 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1855,7 +1978,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1855 * outstanding lock request, so a cancel convert is 1978 * outstanding lock request, so a cancel convert is
1856 * required. We intentionally overwrite 'ret' - if the 1979 * required. We intentionally overwrite 'ret' - if the
1857 * cancel fails and the lock was granted, it's easier 1980 * cancel fails and the lock was granted, it's easier
1858 * to just bubble sucess back up to the user. 1981 * to just bubble success back up to the user.
1859 */ 1982 */
1860 ret = ocfs2_flock_handle_signal(lockres, level); 1983 ret = ocfs2_flock_handle_signal(lockres, level);
1861 } else if (!ret && (level > lockres->l_level)) { 1984 } else if (!ret && (level > lockres->l_level)) {
@@ -2957,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2957 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 3080 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2958 osb->uuid_str, 3081 osb->uuid_str,
2959 strlen(osb->uuid_str), 3082 strlen(osb->uuid_str),
2960 ocfs2_do_node_down, osb, 3083 &lproto, ocfs2_do_node_down, osb,
2961 &conn); 3084 &conn);
2962 if (status) { 3085 if (status) {
2963 mlog_errno(status); 3086 mlog_errno(status);
@@ -3024,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3024 mlog_exit_void(); 3147 mlog_exit_void();
3025} 3148}
3026 3149
3027static void ocfs2_unlock_ast(void *opaque, int error)
3028{
3029 struct ocfs2_lock_res *lockres = opaque;
3030 unsigned long flags;
3031
3032 mlog_entry_void();
3033
3034 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
3035 lockres->l_unlock_action);
3036
3037 spin_lock_irqsave(&lockres->l_lock, flags);
3038 if (error) {
3039 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
3040 "unlock_action %d\n", error, lockres->l_name,
3041 lockres->l_unlock_action);
3042 spin_unlock_irqrestore(&lockres->l_lock, flags);
3043 mlog_exit_void();
3044 return;
3045 }
3046
3047 switch(lockres->l_unlock_action) {
3048 case OCFS2_UNLOCK_CANCEL_CONVERT:
3049 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
3050 lockres->l_action = OCFS2_AST_INVALID;
3051 /* Downconvert thread may have requeued this lock, we
3052 * need to wake it. */
3053 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3054 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
3055 break;
3056 case OCFS2_UNLOCK_DROP_LOCK:
3057 lockres->l_level = DLM_LOCK_IV;
3058 break;
3059 default:
3060 BUG();
3061 }
3062
3063 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
3064 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
3065 wake_up(&lockres->l_event);
3066 spin_unlock_irqrestore(&lockres->l_lock, flags);
3067
3068 mlog_exit_void();
3069}
3070
3071static int ocfs2_drop_lock(struct ocfs2_super *osb, 3150static int ocfs2_drop_lock(struct ocfs2_super *osb,
3072 struct ocfs2_lock_res *lockres) 3151 struct ocfs2_lock_res *lockres)
3073{ 3152{
@@ -3135,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3135 3214
3136 mlog(0, "lock %s\n", lockres->l_name); 3215 mlog(0, "lock %s\n", lockres->l_name);
3137 3216
3138 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, 3217 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
3139 lockres);
3140 if (ret) { 3218 if (ret) {
3141 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3219 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3142 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 3220 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3155,7 +3233,7 @@ out:
3155/* Mark the lockres as being dropped. It will no longer be 3233/* Mark the lockres as being dropped. It will no longer be
3156 * queued if blocking, but we still may have to wait on it 3234 * queued if blocking, but we still may have to wait on it
3157 * being dequeued from the downconvert thread before we can consider 3235 * being dequeued from the downconvert thread before we can consider
3158 * it safe to drop. 3236 * it safe to drop.
3159 * 3237 *
3160 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 3238 * You can *not* attempt to call cluster_lock on this lockres anymore. */
3161void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) 3239void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
@@ -3244,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
3244 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); 3322 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
3245 3323
3246 if (lockres->l_level <= new_level) { 3324 if (lockres->l_level <= new_level) {
3247 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", 3325 mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
3248 lockres->l_level, new_level); 3326 "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
3327 "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
3328 new_level, list_empty(&lockres->l_blocked_list),
3329 list_empty(&lockres->l_mask_waiters), lockres->l_type,
3330 lockres->l_flags, lockres->l_ro_holders,
3331 lockres->l_ex_holders, lockres->l_action,
3332 lockres->l_unlock_action, lockres->l_requested,
3333 lockres->l_blocking, lockres->l_pending_gen);
3249 BUG(); 3334 BUG();
3250 } 3335 }
3251 3336
3252 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", 3337 mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
3253 lockres->l_name, new_level, lockres->l_blocking); 3338 lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
3254 3339
3255 lockres->l_action = OCFS2_AST_DOWNCONVERT; 3340 lockres->l_action = OCFS2_AST_DOWNCONVERT;
3256 lockres->l_requested = new_level; 3341 lockres->l_requested = new_level;
@@ -3269,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3269 3354
3270 mlog_entry_void(); 3355 mlog_entry_void();
3271 3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level);
3359
3272 if (lvb) 3360 if (lvb)
3273 dlm_flags |= DLM_LKF_VALBLK; 3361 dlm_flags |= DLM_LKF_VALBLK;
3274 3362
@@ -3277,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3277 &lockres->l_lksb, 3365 &lockres->l_lksb,
3278 dlm_flags, 3366 dlm_flags,
3279 lockres->l_name, 3367 lockres->l_name,
3280 OCFS2_LOCK_ID_MAX_LEN - 1, 3368 OCFS2_LOCK_ID_MAX_LEN - 1);
3281 lockres);
3282 lockres_clear_pending(lockres, generation, osb); 3369 lockres_clear_pending(lockres, generation, osb);
3283 if (ret) { 3370 if (ret) {
3284 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 3371 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3299,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3299 assert_spin_locked(&lockres->l_lock); 3386 assert_spin_locked(&lockres->l_lock);
3300 3387
3301 mlog_entry_void(); 3388 mlog_entry_void();
3302 mlog(0, "lock %s\n", lockres->l_name);
3303 3389
3304 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3305 /* If we're already trying to cancel a lock conversion 3391 /* If we're already trying to cancel a lock conversion
3306 * then just drop the spinlock and allow the caller to 3392 * then just drop the spinlock and allow the caller to
3307 * requeue this lock. */ 3393 * requeue this lock. */
3308 3394 mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
3309 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3310 return 0; 3395 return 0;
3311 } 3396 }
3312 3397
@@ -3321,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3321 "lock %s, invalid flags: 0x%lx\n", 3406 "lock %s, invalid flags: 0x%lx\n",
3322 lockres->l_name, lockres->l_flags); 3407 lockres->l_name, lockres->l_flags);
3323 3408
3409 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3410
3324 return 1; 3411 return 1;
3325} 3412}
3326 3413
@@ -3330,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3330 int ret; 3417 int ret;
3331 3418
3332 mlog_entry_void(); 3419 mlog_entry_void();
3333 mlog(0, "lock %s\n", lockres->l_name);
3334 3420
3335 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3336 DLM_LKF_CANCEL, lockres); 3422 DLM_LKF_CANCEL);
3337 if (ret) { 3423 if (ret) {
3338 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3424 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3339 ocfs2_recover_from_dlm_error(lockres, 0); 3425 ocfs2_recover_from_dlm_error(lockres, 0);
3340 } 3426 }
3341 3427
3342 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); 3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3343 3429
3344 mlog_exit(ret); 3430 mlog_exit(ret);
3345 return ret; 3431 return ret;
@@ -3352,6 +3438,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3352 unsigned long flags; 3438 unsigned long flags;
3353 int blocking; 3439 int blocking;
3354 int new_level; 3440 int new_level;
3441 int level;
3355 int ret = 0; 3442 int ret = 0;
3356 int set_lvb = 0; 3443 int set_lvb = 0;
3357 unsigned int gen; 3444 unsigned int gen;
@@ -3360,9 +3447,17 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3360 3447
3361 spin_lock_irqsave(&lockres->l_lock, flags); 3448 spin_lock_irqsave(&lockres->l_lock, flags);
3362 3449
3363 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
3364
3365recheck: 3450recheck:
3451 /*
3452 * Is it still blocking? If not, we have no more work to do.
3453 */
3454 if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
3455 BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
3456 spin_unlock_irqrestore(&lockres->l_lock, flags);
3457 ret = 0;
3458 goto leave;
3459 }
3460
3366 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 3461 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
3367 /* XXX 3462 /* XXX
3368 * This is a *big* race. The OCFS2_LOCK_PENDING flag 3463 * This is a *big* race. The OCFS2_LOCK_PENDING flag
@@ -3387,8 +3482,11 @@ recheck:
3387 * at the same time they set OCFS2_DLM_BUSY. They must 3482 * at the same time they set OCFS2_DLM_BUSY. They must
3388 * clear OCFS2_DLM_PENDING after dlm_lock() returns. 3483 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3389 */ 3484 */
3390 if (lockres->l_flags & OCFS2_LOCK_PENDING) 3485 if (lockres->l_flags & OCFS2_LOCK_PENDING) {
3486 mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
3487 lockres->l_name);
3391 goto leave_requeue; 3488 goto leave_requeue;
3489 }
3392 3490
3393 ctl->requeue = 1; 3491 ctl->requeue = 1;
3394 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3492 ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3401,31 +3499,70 @@ recheck:
3401 goto leave; 3499 goto leave;
3402 } 3500 }
3403 3501
3502 /*
3503 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
3504 * set when the ast is received for an upconvert just before the
3505 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
3506 * on the heels of the ast, we want to delay the downconvert just
3507 * enough to allow the up requestor to do its task. Because this
3508 * lock is in the blocked queue, the lock will be downconverted
3509 * as soon as the requestor is done with the lock.
3510 */
3511 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
3512 goto leave_requeue;
3513
3514 /*
3515 * How can we block and yet be at NL? We were trying to upconvert
3516 * from NL and got canceled. The code comes back here, and now
3517 * we notice and clear BLOCKING.
3518 */
3519 if (lockres->l_level == DLM_LOCK_NL) {
3520 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
3521 mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
3522 lockres->l_blocking = DLM_LOCK_NL;
3523 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
3524 spin_unlock_irqrestore(&lockres->l_lock, flags);
3525 goto leave;
3526 }
3527
3404 /* if we're blocking an exclusive and we have *any* holders, 3528 /* if we're blocking an exclusive and we have *any* holders,
3405 * then requeue. */ 3529 * then requeue. */
3406 if ((lockres->l_blocking == DLM_LOCK_EX) 3530 if ((lockres->l_blocking == DLM_LOCK_EX)
3407 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3531 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
3532 mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
3533 lockres->l_name, lockres->l_ex_holders,
3534 lockres->l_ro_holders);
3408 goto leave_requeue; 3535 goto leave_requeue;
3536 }
3409 3537
3410 /* If it's a PR we're blocking, then only 3538 /* If it's a PR we're blocking, then only
3411 * requeue if we've got any EX holders */ 3539 * requeue if we've got any EX holders */
3412 if (lockres->l_blocking == DLM_LOCK_PR && 3540 if (lockres->l_blocking == DLM_LOCK_PR &&
3413 lockres->l_ex_holders) 3541 lockres->l_ex_holders) {
3542 mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
3543 lockres->l_name, lockres->l_ex_holders);
3414 goto leave_requeue; 3544 goto leave_requeue;
3545 }
3415 3546
3416 /* 3547 /*
3417 * Can we get a lock in this state if the holder counts are 3548 * Can we get a lock in this state if the holder counts are
3418 * zero? The meta data unblock code used to check this. 3549 * zero? The meta data unblock code used to check this.
3419 */ 3550 */
3420 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 3551 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
3421 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) 3552 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
3553 mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
3554 lockres->l_name);
3422 goto leave_requeue; 3555 goto leave_requeue;
3556 }
3423 3557
3424 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 3558 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
3425 3559
3426 if (lockres->l_ops->check_downconvert 3560 if (lockres->l_ops->check_downconvert
3427 && !lockres->l_ops->check_downconvert(lockres, new_level)) 3561 && !lockres->l_ops->check_downconvert(lockres, new_level)) {
3562 mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
3563 lockres->l_name);
3428 goto leave_requeue; 3564 goto leave_requeue;
3565 }
3429 3566
3430 /* If we get here, then we know that there are no more 3567 /* If we get here, then we know that there are no more
3431 * incompatible holders (and anyone asking for an incompatible 3568 * incompatible holders (and anyone asking for an incompatible
@@ -3438,17 +3575,24 @@ recheck:
3438 * may sleep, so we save off a copy of what we're blocking as 3575 * may sleep, so we save off a copy of what we're blocking as
3439 * it may change while we're not holding the spin lock. */ 3576 * it may change while we're not holding the spin lock. */
3440 blocking = lockres->l_blocking; 3577 blocking = lockres->l_blocking;
3578 level = lockres->l_level;
3441 spin_unlock_irqrestore(&lockres->l_lock, flags); 3579 spin_unlock_irqrestore(&lockres->l_lock, flags);
3442 3580
3443 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); 3581 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
3444 3582
3445 if (ctl->unblock_action == UNBLOCK_STOP_POST) 3583 if (ctl->unblock_action == UNBLOCK_STOP_POST) {
3584 mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
3585 lockres->l_name);
3446 goto leave; 3586 goto leave;
3587 }
3447 3588
3448 spin_lock_irqsave(&lockres->l_lock, flags); 3589 spin_lock_irqsave(&lockres->l_lock, flags);
3449 if (blocking != lockres->l_blocking) { 3590 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
3450 /* If this changed underneath us, then we can't drop 3591 /* If this changed underneath us, then we can't drop
3451 * it just yet. */ 3592 * it just yet. */
3593 mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
3594 "Recheck\n", lockres->l_name, blocking,
3595 lockres->l_blocking, level, lockres->l_level);
3452 goto recheck; 3596 goto recheck;
3453 } 3597 }
3454 3598
@@ -3843,45 +3987,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3843 ocfs2_cluster_unlock(osb, lockres, level); 3987 ocfs2_cluster_unlock(osb, lockres, level);
3844} 3988}
3845 3989
3846/*
3847 * This is the filesystem locking protocol. It provides the lock handling
3848 * hooks for the underlying DLM. It has a maximum version number.
3849 * The version number allows interoperability with systems running at
3850 * the same major number and an equal or smaller minor number.
3851 *
3852 * Whenever the filesystem does new things with locks (adds or removes a
3853 * lock, orders them differently, does different things underneath a lock),
3854 * the version must be changed. The protocol is negotiated when joining
3855 * the dlm domain. A node may join the domain if its major version is
3856 * identical to all other nodes and its minor version is greater than
3857 * or equal to all other nodes. When its minor version is greater than
3858 * the other nodes, it will run at the minor version specified by the
3859 * other nodes.
3860 *
3861 * If a locking change is made that will not be compatible with older
3862 * versions, the major number must be increased and the minor version set
3863 * to zero. If a change merely adds a behavior that can be disabled when
3864 * speaking to older versions, the minor version must be increased. If a
3865 * change adds a fully backwards compatible change (eg, LVB changes that
3866 * are just ignored by older versions), the version does not need to be
3867 * updated.
3868 */
3869static struct ocfs2_locking_protocol lproto = {
3870 .lp_max_version = {
3871 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3872 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3873 },
3874 .lp_lock_ast = ocfs2_locking_ast,
3875 .lp_blocking_ast = ocfs2_blocking_ast,
3876 .lp_unlock_ast = ocfs2_unlock_ast,
3877};
3878
3879void ocfs2_set_locking_protocol(void)
3880{
3881 ocfs2_stack_glue_set_locking_protocol(&lproto);
3882}
3883
3884
3885static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3990static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3886 struct ocfs2_lock_res *lockres) 3991 struct ocfs2_lock_res *lockres)
3887{ 3992{
@@ -3898,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3898 BUG_ON(!lockres); 4003 BUG_ON(!lockres);
3899 BUG_ON(!lockres->l_ops); 4004 BUG_ON(!lockres->l_ops);
3900 4005
3901 mlog(0, "lockres %s blocked.\n", lockres->l_name); 4006 mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
3902 4007
3903 /* Detect whether a lock has been marked as going away while 4008 /* Detect whether a lock has been marked as going away while
3904 * the downconvert thread was processing other things. A lock can 4009 * the downconvert thread was processing other things. A lock can
@@ -3921,7 +4026,7 @@ unqueue:
3921 } else 4026 } else
3922 ocfs2_schedule_blocked_lock(osb, lockres); 4027 ocfs2_schedule_blocked_lock(osb, lockres);
3923 4028
3924 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 4029 mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
3925 ctl.requeue ? "yes" : "no"); 4030 ctl.requeue ? "yes" : "no");
3926 spin_unlock_irqrestore(&lockres->l_lock, flags); 4031 spin_unlock_irqrestore(&lockres->l_lock, flags);
3927 4032
@@ -3943,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3943 /* Do not schedule a lock for downconvert when it's on 4048 /* Do not schedule a lock for downconvert when it's on
3944 * the way to destruction - any nodes wanting access 4049 * the way to destruction - any nodes wanting access
3945 * to the resource will get it soon. */ 4050 * to the resource will get it soon. */
3946 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", 4051 mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
3947 lockres->l_name, lockres->l_flags); 4052 lockres->l_name, lockres->l_flags);
3948 return; 4053 return;
3949 } 4054 }
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 15713cbb865c..19ad145d2af3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -239,7 +239,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", 239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
240 (unsigned long long)blkno, generation); 240 (unsigned long long)blkno, generation);
241 } 241 }
242 242
243 *max_len = len; 243 *max_len = len;
244 244
245bail: 245bail:
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 843db64e9d4a..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/slab.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/fiemap.h> 29#include <linux/fiemap.h>
29 30
@@ -37,6 +38,7 @@
37#include "extent_map.h" 38#include "extent_map.h"
38#include "inode.h" 39#include "inode.h"
39#include "super.h" 40#include "super.h"
41#include "symlink.h"
40 42
41#include "buffer_head_io.h" 43#include "buffer_head_io.h"
42 44
@@ -191,7 +193,7 @@ static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
191 emi->ei_clusters += ins->ei_clusters; 193 emi->ei_clusters += ins->ei_clusters;
192 return 1; 194 return 1;
193 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
194 (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
195 ins->ei_flags == emi->ei_flags) { 197 ins->ei_flags == emi->ei_flags) {
196 emi->ei_phys = ins->ei_phys; 198 emi->ei_phys = ins->ei_phys;
197 emi->ei_cpos = ins->ei_cpos; 199 emi->ei_cpos = ins->ei_cpos;
@@ -452,7 +454,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
452 if (i == -1) { 454 if (i == -1) {
453 /* 455 /*
454 * Holes can be larger than the maximum size of an 456 * Holes can be larger than the maximum size of an
455 * extent, so we return their lengths in a seperate 457 * extent, so we return their lengths in a separate
456 * field. 458 * field.
457 */ 459 */
458 if (hole_len) { 460 if (hole_len) {
@@ -703,6 +705,12 @@ out:
703 return ret; 705 return ret;
704} 706}
705 707
708/*
709 * The ocfs2_fiemap_inline() may be a little bit misleading, since
710 * it not only handles the fiemap for inlined files, but also deals
711 * with the fast symlink, cause they have no difference for extent
712 * mapping per se.
713 */
706static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, 714static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
707 struct fiemap_extent_info *fieinfo, 715 struct fiemap_extent_info *fieinfo,
708 u64 map_start) 716 u64 map_start)
@@ -715,11 +723,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
715 struct ocfs2_inode_info *oi = OCFS2_I(inode); 723 struct ocfs2_inode_info *oi = OCFS2_I(inode);
716 724
717 di = (struct ocfs2_dinode *)di_bh->b_data; 725 di = (struct ocfs2_dinode *)di_bh->b_data;
718 id_count = le16_to_cpu(di->id2.i_data.id_count); 726 if (ocfs2_inode_is_fast_symlink(inode))
727 id_count = ocfs2_fast_symlink_chars(inode->i_sb);
728 else
729 id_count = le16_to_cpu(di->id2.i_data.id_count);
719 730
720 if (map_start < id_count) { 731 if (map_start < id_count) {
721 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; 732 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
722 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); 733 if (ocfs2_inode_is_fast_symlink(inode))
734 phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
735 else
736 phys += offsetof(struct ocfs2_dinode,
737 id2.i_data.id_data);
723 738
724 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, 739 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
725 flags); 740 flags);
@@ -756,9 +771,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
756 down_read(&OCFS2_I(inode)->ip_alloc_sem); 771 down_read(&OCFS2_I(inode)->ip_alloc_sem);
757 772
758 /* 773 /*
759 * Handle inline-data separately. 774 * Handle inline-data and fast symlink separately.
760 */ 775 */
761 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 776 if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
777 ocfs2_inode_is_fast_symlink(inode)) {
762 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); 778 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
763 goto out_unlock; 779 goto out_unlock;
764 } 780 }
@@ -786,6 +802,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
786 fe_flags = 0; 802 fe_flags = 0;
787 if (rec.e_flags & OCFS2_EXT_UNWRITTEN) 803 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
788 fe_flags |= FIEMAP_EXTENT_UNWRITTEN; 804 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
805 if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
806 fe_flags |= FIEMAP_EXTENT_SHARED;
789 if (is_last) 807 if (is_last)
790 fe_flags |= FIEMAP_EXTENT_LAST; 808 fe_flags |= FIEMAP_EXTENT_LAST;
791 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; 809 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de059f490586..a5fbd9cea968 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
109 109
110 if (file->f_mode & FMODE_WRITE)
111 dquot_initialize(inode);
112
110 spin_lock(&oi->ip_lock); 113 spin_lock(&oi->ip_lock);
111 114
112 /* Check that the inode hasn't been wiped from disk by another 115 /* Check that the inode hasn't been wiped from disk by another
@@ -629,11 +632,10 @@ restart_all:
629 } 632 }
630 633
631restarted_transaction: 634restarted_transaction:
632 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 635 status = dquot_alloc_space_nodirty(inode,
633 clusters_to_add))) { 636 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
634 status = -EDQUOT; 637 if (status)
635 goto leave; 638 goto leave;
636 }
637 did_quota = 1; 639 did_quota = 1;
638 640
639 /* reserve a write to the file entry early on - that we if we 641 /* reserve a write to the file entry early on - that we if we
@@ -674,7 +676,7 @@ restarted_transaction:
674 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
675 spin_unlock(&OCFS2_I(inode)->ip_lock); 677 spin_unlock(&OCFS2_I(inode)->ip_lock);
676 /* Release unused quota reservation */ 678 /* Release unused quota reservation */
677 vfs_dq_free_space(inode, 679 dquot_free_space(inode,
678 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 680 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
679 did_quota = 0; 681 did_quota = 0;
680 682
@@ -682,6 +684,7 @@ restarted_transaction:
682 if (why == RESTART_META) { 684 if (why == RESTART_META) {
683 mlog(0, "restarting function.\n"); 685 mlog(0, "restarting function.\n");
684 restart_func = 1; 686 restart_func = 1;
687 status = 0;
685 } else { 688 } else {
686 BUG_ON(why != RESTART_TRANS); 689 BUG_ON(why != RESTART_TRANS);
687 690
@@ -710,7 +713,7 @@ restarted_transaction:
710 713
711leave: 714leave:
712 if (status < 0 && did_quota) 715 if (status < 0 && did_quota)
713 vfs_dq_free_space(inode, 716 dquot_free_space(inode,
714 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 717 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
715 if (handle) { 718 if (handle) {
716 ocfs2_commit_trans(osb, handle); 719 ocfs2_commit_trans(osb, handle);
@@ -749,7 +752,7 @@ static int ocfs2_write_zero_page(struct inode *inode,
749 int ret; 752 int ret;
750 753
751 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 754 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
752 /* ugh. in prepare/commit_write, if from==to==start of block, we 755 /* ugh. in prepare/commit_write, if from==to==start of block, we
753 ** skip the prepare. make sure we never send an offset for the start 756 ** skip the prepare. make sure we never send an offset for the start
754 ** of a block 757 ** of a block
755 */ 758 */
@@ -978,6 +981,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
978 981
979 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 982 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
980 if (size_change) { 983 if (size_change) {
984 dquot_initialize(inode);
985
981 status = ocfs2_rw_lock(inode, 1); 986 status = ocfs2_rw_lock(inode, 1);
982 if (status < 0) { 987 if (status < 0) {
983 mlog_errno(status); 988 mlog_errno(status);
@@ -993,10 +998,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
993 } 998 }
994 999
995 if (size_change && attr->ia_size != i_size_read(inode)) { 1000 if (size_change && attr->ia_size != i_size_read(inode)) {
996 if (attr->ia_size > sb->s_maxbytes) { 1001 status = inode_newsize_ok(inode, attr->ia_size);
997 status = -EFBIG; 1002 if (status)
998 goto bail_unlock; 1003 goto bail_unlock;
999 }
1000 1004
1001 if (i_size_read(inode) > attr->ia_size) { 1005 if (i_size_read(inode) > attr->ia_size) {
1002 if (ocfs2_should_order_data(inode)) { 1006 if (ocfs2_should_order_data(inode)) {
@@ -1021,7 +1025,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1021 /* 1025 /*
1022 * Gather pointers to quota structures so that allocation / 1026 * Gather pointers to quota structures so that allocation /
1023 * freeing of quota structures happens here and not inside 1027 * freeing of quota structures happens here and not inside
1024 * vfs_dq_transfer() where we have problems with lock ordering 1028 * dquot_transfer() where we have problems with lock ordering
1025 */ 1029 */
1026 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1030 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1027 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1031 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1054,7 +1058,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1054 mlog_errno(status); 1058 mlog_errno(status);
1055 goto bail_unlock; 1059 goto bail_unlock;
1056 } 1060 }
1057 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 1061 status = dquot_transfer(inode, attr);
1058 if (status < 0) 1062 if (status < 0)
1059 goto bail_commit; 1063 goto bail_commit;
1060 } else { 1064 } else {
@@ -1772,13 +1776,14 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1772 loff_t *ppos, 1776 loff_t *ppos,
1773 size_t count, 1777 size_t count,
1774 int appending, 1778 int appending,
1775 int *direct_io) 1779 int *direct_io,
1780 int *has_refcount)
1776{ 1781{
1777 int ret = 0, meta_level = 0; 1782 int ret = 0, meta_level = 0;
1778 struct inode *inode = dentry->d_inode; 1783 struct inode *inode = dentry->d_inode;
1779 loff_t saved_pos, end; 1784 loff_t saved_pos, end;
1780 1785
1781 /* 1786 /*
1782 * We start with a read level meta lock and only jump to an ex 1787 * We start with a read level meta lock and only jump to an ex
1783 * if we need to make modifications here. 1788 * if we need to make modifications here.
1784 */ 1789 */
@@ -1833,6 +1838,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1833 saved_pos, 1838 saved_pos,
1834 count, 1839 count,
1835 &meta_level); 1840 &meta_level);
1841 if (has_refcount)
1842 *has_refcount = 1;
1843 if (direct_io)
1844 *direct_io = 0;
1836 } 1845 }
1837 1846
1838 if (ret < 0) { 1847 if (ret < 0) {
@@ -1899,7 +1908,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1899 loff_t pos) 1908 loff_t pos)
1900{ 1909{
1901 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1910 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1902 int can_do_direct; 1911 int can_do_direct, has_refcount = 0;
1903 ssize_t written = 0; 1912 ssize_t written = 0;
1904 size_t ocount; /* original count */ 1913 size_t ocount; /* original count */
1905 size_t count; /* after file limit checks */ 1914 size_t count; /* after file limit checks */
@@ -1942,7 +1951,7 @@ relock:
1942 can_do_direct = direct_io; 1951 can_do_direct = direct_io;
1943 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1952 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1944 iocb->ki_left, appending, 1953 iocb->ki_left, appending,
1945 &can_do_direct); 1954 &can_do_direct, &has_refcount);
1946 if (ret < 0) { 1955 if (ret < 0) {
1947 mlog_errno(ret); 1956 mlog_errno(ret);
1948 goto out; 1957 goto out;
@@ -1973,18 +1982,18 @@ relock:
1973 /* communicate with ocfs2_dio_end_io */ 1982 /* communicate with ocfs2_dio_end_io */
1974 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1983 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1975 1984
1976 if (direct_io) { 1985 ret = generic_segment_checks(iov, &nr_segs, &ocount,
1977 ret = generic_segment_checks(iov, &nr_segs, &ocount, 1986 VERIFY_READ);
1978 VERIFY_READ); 1987 if (ret)
1979 if (ret) 1988 goto out_dio;
1980 goto out_dio;
1981 1989
1982 count = ocount; 1990 count = ocount;
1983 ret = generic_write_checks(file, ppos, &count, 1991 ret = generic_write_checks(file, ppos, &count,
1984 S_ISBLK(inode->i_mode)); 1992 S_ISBLK(inode->i_mode));
1985 if (ret) 1993 if (ret)
1986 goto out_dio; 1994 goto out_dio;
1987 1995
1996 if (direct_io) {
1988 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 1997 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1989 ppos, count, ocount); 1998 ppos, count, ocount);
1990 if (written < 0) { 1999 if (written < 0) {
@@ -1999,21 +2008,26 @@ relock:
1999 goto out_dio; 2008 goto out_dio;
2000 } 2009 }
2001 } else { 2010 } else {
2002 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); 2011 current->backing_dev_info = file->f_mapping->backing_dev_info;
2012 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2013 ppos, count, 0);
2014 current->backing_dev_info = NULL;
2003 } 2015 }
2004 2016
2005out_dio: 2017out_dio:
2006 /* buffered aio wouldn't have proper lock coverage today */ 2018 /* buffered aio wouldn't have proper lock coverage today */
2007 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2019 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2008 2020
2009 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { 2021 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2022 ((file->f_flags & O_DIRECT) && has_refcount)) {
2010 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2023 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2011 pos + count - 1); 2024 pos + count - 1);
2012 if (ret < 0) 2025 if (ret < 0)
2013 written = ret; 2026 written = ret;
2014 2027
2015 if (!ret && (old_size != i_size_read(inode) || 2028 if (!ret && ((old_size != i_size_read(inode)) ||
2016 old_clusters != OCFS2_I(inode)->ip_clusters)) { 2029 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2030 has_refcount)) {
2017 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2031 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2018 if (ret < 0) 2032 if (ret < 0)
2019 written = ret; 2033 written = ret;
@@ -2024,7 +2038,7 @@ out_dio:
2024 pos + count - 1); 2038 pos + count - 1);
2025 } 2039 }
2026 2040
2027 /* 2041 /*
2028 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2042 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2029 * function pointer which is called when o_direct io completes so that 2043 * function pointer which is called when o_direct io completes so that
2030 * it can unlock our rw lock. (it's the clustered equivalent of 2044 * it can unlock our rw lock. (it's the clustered equivalent of
@@ -2034,7 +2048,7 @@ out_dio:
2034 * async dio is going to do it in the future or an end_io after an 2048 * async dio is going to do it in the future or an end_io after an
2035 * error has already done it. 2049 * error has already done it.
2036 */ 2050 */
2037 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2051 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2038 rw_level = -1; 2052 rw_level = -1;
2039 have_alloc_sem = 0; 2053 have_alloc_sem = 0;
2040 } 2054 }
@@ -2062,7 +2076,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2062 int ret; 2076 int ret;
2063 2077
2064 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2078 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
2065 sd->total_len, 0, NULL); 2079 sd->total_len, 0, NULL, NULL);
2066 if (ret < 0) { 2080 if (ret < 0) {
2067 mlog_errno(ret); 2081 mlog_errno(ret);
2068 return ret; 2082 return ret;
@@ -2189,7 +2203,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2189 goto bail; 2203 goto bail;
2190 } 2204 }
2191 2205
2192 /* 2206 /*
2193 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2207 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2194 * need locks to protect pending reads from racing with truncate. 2208 * need locks to protect pending reads from racing with truncate.
2195 */ 2209 */
@@ -2211,10 +2225,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2211 * We're fine letting folks race truncates and extending 2225 * We're fine letting folks race truncates and extending
2212 * writes with read across the cluster, just like they can 2226 * writes with read across the cluster, just like they can
2213 * locally. Hence no rw_lock during read. 2227 * locally. Hence no rw_lock during read.
2214 * 2228 *
2215 * Take and drop the meta data lock to update inode fields 2229 * Take and drop the meta data lock to update inode fields
2216 * like i_size. This allows the checks down below 2230 * like i_size. This allows the checks down below
2217 * generic_file_aio_read() a chance of actually working. 2231 * generic_file_aio_read() a chance of actually working.
2218 */ 2232 */
2219 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2233 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2220 if (ret < 0) { 2234 if (ret < 0) {
@@ -2239,7 +2253,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2239bail: 2253bail:
2240 if (have_alloc_sem) 2254 if (have_alloc_sem)
2241 up_read(&inode->i_alloc_sem); 2255 up_read(&inode->i_alloc_sem);
2242 if (rw_level != -1) 2256 if (rw_level != -1)
2243 ocfs2_rw_unlock(inode, rw_level); 2257 ocfs2_rw_unlock(inode, rw_level);
2244 mlog_exit(ret); 2258 mlog_exit(ret);
2245 2259
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h> 29#include <linux/highmem.h>
31 30
32#define MLOG_MASK_PREFIX ML_SUPER 31#define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0297fb8982b8..af189887201c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/quotaops.h> 30#include <linux/quotaops.h>
@@ -475,7 +474,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
475 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { 474 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
476 status = ocfs2_try_open_lock(inode, 0); 475 status = ocfs2_try_open_lock(inode, 0);
477 if (status) { 476 if (status) {
478 make_bad_inode(inode); 477 make_bad_inode(inode);
479 return status; 478 return status;
480 } 479 }
481 } 480 }
@@ -559,6 +558,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
559 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 558 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
560 if (IS_ERR(handle)) { 559 if (IS_ERR(handle)) {
561 status = PTR_ERR(handle); 560 status = PTR_ERR(handle);
561 handle = NULL;
562 mlog_errno(status); 562 mlog_errno(status);
563 goto out; 563 goto out;
564 } 564 }
@@ -640,11 +640,13 @@ static int ocfs2_remove_inode(struct inode *inode,
640 goto bail_unlock; 640 goto bail_unlock;
641 } 641 }
642 642
643 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 643 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
644 orphan_dir_bh); 644 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
645 if (status < 0) { 645 orphan_dir_bh);
646 mlog_errno(status); 646 if (status < 0) {
647 goto bail_commit; 647 mlog_errno(status);
648 goto bail_commit;
649 }
648 } 650 }
649 651
650 /* set the inodes dtime */ 652 /* set the inodes dtime */
@@ -665,7 +667,7 @@ static int ocfs2_remove_inode(struct inode *inode,
665 } 667 }
666 668
667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 669 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
668 vfs_dq_free_inode(inode); 670 dquot_free_inode(inode);
669 671
670 status = ocfs2_free_dinode(handle, inode_alloc_inode, 672 status = ocfs2_free_dinode(handle, inode_alloc_inode,
671 inode_alloc_bh, di); 673 inode_alloc_bh, di);
@@ -684,7 +686,7 @@ bail:
684 return status; 686 return status;
685} 687}
686 688
687/* 689/*
688 * Serialize with orphan dir recovery. If the process doing 690 * Serialize with orphan dir recovery. If the process doing
689 * recovery on this orphan dir does an iget() with the dir 691 * recovery on this orphan dir does an iget() with the dir
690 * i_mutex held, we'll deadlock here. Instead we detect this 692 * i_mutex held, we'll deadlock here. Instead we detect this
@@ -723,38 +725,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
723static int ocfs2_wipe_inode(struct inode *inode, 725static int ocfs2_wipe_inode(struct inode *inode,
724 struct buffer_head *di_bh) 726 struct buffer_head *di_bh)
725{ 727{
726 int status, orphaned_slot; 728 int status, orphaned_slot = -1;
727 struct inode *orphan_dir_inode = NULL; 729 struct inode *orphan_dir_inode = NULL;
728 struct buffer_head *orphan_dir_bh = NULL; 730 struct buffer_head *orphan_dir_bh = NULL;
729 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 731 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
730 struct ocfs2_dinode *di; 732 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
731 733
732 di = (struct ocfs2_dinode *) di_bh->b_data; 734 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
733 orphaned_slot = le16_to_cpu(di->i_orphaned_slot); 735 orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
734 736
735 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 737 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
736 if (status) 738 if (status)
737 return status; 739 return status;
738 740
739 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 741 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
740 ORPHAN_DIR_SYSTEM_INODE, 742 ORPHAN_DIR_SYSTEM_INODE,
741 orphaned_slot); 743 orphaned_slot);
742 if (!orphan_dir_inode) { 744 if (!orphan_dir_inode) {
743 status = -EEXIST; 745 status = -EEXIST;
744 mlog_errno(status); 746 mlog_errno(status);
745 goto bail; 747 goto bail;
746 } 748 }
747 749
748 /* Lock the orphan dir. The lock will be held for the entire 750 /* Lock the orphan dir. The lock will be held for the entire
749 * delete_inode operation. We do this now to avoid races with 751 * delete_inode operation. We do this now to avoid races with
750 * recovery completion on other nodes. */ 752 * recovery completion on other nodes. */
751 mutex_lock(&orphan_dir_inode->i_mutex); 753 mutex_lock(&orphan_dir_inode->i_mutex);
752 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 754 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
753 if (status < 0) { 755 if (status < 0) {
754 mutex_unlock(&orphan_dir_inode->i_mutex); 756 mutex_unlock(&orphan_dir_inode->i_mutex);
755 757
756 mlog_errno(status); 758 mlog_errno(status);
757 goto bail; 759 goto bail;
760 }
758 } 761 }
759 762
760 /* we do this while holding the orphan dir lock because we 763 /* we do this while holding the orphan dir lock because we
@@ -795,6 +798,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
795 mlog_errno(status); 798 mlog_errno(status);
796 799
797bail_unlock_dir: 800bail_unlock_dir:
801 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
802 return status;
803
798 ocfs2_inode_unlock(orphan_dir_inode, 1); 804 ocfs2_inode_unlock(orphan_dir_inode, 1);
799 mutex_unlock(&orphan_dir_inode->i_mutex); 805 mutex_unlock(&orphan_dir_inode->i_mutex);
800 brelse(orphan_dir_bh); 806 brelse(orphan_dir_bh);
@@ -890,7 +896,23 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
890 896
891 /* Do some basic inode verification... */ 897 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 898 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 899 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
900 !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
901 /*
902 * Inodes in the orphan dir must have ORPHANED_FL. The only
903 * inodes that come back out of the orphan dir are reflink
904 * targets. A reflink target may be moved out of the orphan
905 * dir between the time we scan the directory and the time we
906 * process it. This would lead to HAS_REFCOUNT_FL being set but
907 * ORPHANED_FL not.
908 */
909 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
910 mlog(0, "Reflinked inode %llu is no longer orphaned. "
911 "it shouldn't be deleted\n",
912 (unsigned long long)oi->ip_blkno);
913 goto bail;
914 }
915
894 /* for lack of a better error? */ 916 /* for lack of a better error? */
895 status = -EEXIST; 917 status = -EEXIST;
896 mlog(ML_ERROR, 918 mlog(ML_ERROR,
@@ -971,6 +993,8 @@ void ocfs2_delete_inode(struct inode *inode)
971 goto bail; 993 goto bail;
972 } 994 }
973 995
996 dquot_initialize(inode);
997
974 if (!ocfs2_inode_is_valid_to_delete(inode)) { 998 if (!ocfs2_inode_is_valid_to_delete(inode)) {
975 /* It's probably not necessary to truncate_inode_pages 999 /* It's probably not necessary to truncate_inode_pages
976 * here but we do it for safety anyway (it will most 1000 * here but we do it for safety anyway (it will most
@@ -1087,6 +1111,8 @@ void ocfs2_clear_inode(struct inode *inode)
1087 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1111 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1088 "Inode=%lu\n", inode->i_ino); 1112 "Inode=%lu\n", inode->i_ino);
1089 1113
1114 dquot_drop(inode);
1115
1090 /* To preven remote deletes we hold open lock before, now it 1116 /* To preven remote deletes we hold open lock before, now it
1091 * is time to unlock PR and EX open locks. */ 1117 * is time to unlock PR and EX open locks. */
1092 ocfs2_open_unlock(inode); 1118 ocfs2_open_unlock(inode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293c..0b28e1921a39 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -100,6 +100,8 @@ struct ocfs2_inode_info
100#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 100#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
101/* Does someone have the file open O_DIRECT */ 101/* Does someone have the file open O_DIRECT */
102#define OCFS2_INODE_OPEN_DIRECT 0x00000040 102#define OCFS2_INODE_OPEN_DIRECT 0x00000040
103/* Tell the inode wipe code it's not in orphan dir */
104#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080
103 105
104static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) 106static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
105{ 107{
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 31fbb0619510..7d9d9c132cef 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/compat.h>
10 11
11#define MLOG_MASK_PREFIX ML_INODE 12#define MLOG_MASK_PREFIX ML_INODE
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -181,6 +182,10 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
181#ifdef CONFIG_COMPAT 182#ifdef CONFIG_COMPAT
182long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) 183long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
183{ 184{
185 bool preserve;
186 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode;
188
184 switch (cmd) { 189 switch (cmd) {
185 case OCFS2_IOC32_GETFLAGS: 190 case OCFS2_IOC32_GETFLAGS:
186 cmd = OCFS2_IOC_GETFLAGS; 191 cmd = OCFS2_IOC_GETFLAGS;
@@ -195,8 +200,15 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
195 case OCFS2_IOC_GROUP_EXTEND: 200 case OCFS2_IOC_GROUP_EXTEND:
196 case OCFS2_IOC_GROUP_ADD: 201 case OCFS2_IOC_GROUP_ADD:
197 case OCFS2_IOC_GROUP_ADD64: 202 case OCFS2_IOC_GROUP_ADD64:
198 case OCFS2_IOC_REFLINK:
199 break; 203 break;
204 case OCFS2_IOC_REFLINK:
205 if (copy_from_user(&args, (struct reflink_arguments *)arg,
206 sizeof(args)))
207 return -EFAULT;
208 preserve = (args.preserve != 0);
209
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve);
200 default: 212 default:
201 return -ENOIOCTLCMD; 213 return -ENOIOCTLCMD;
202 } 214 }
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
7 * 7 *
8 */ 8 */
9 9
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_PROTO_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_PROTO_H
12 12
13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 15
16#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327e..9336c60e3a36 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
659 659
660 default: 660 default:
661 status = -EINVAL; 661 status = -EINVAL;
662 mlog(ML_ERROR, "Uknown access type!\n"); 662 mlog(ML_ERROR, "Unknown access type!\n");
663 } 663 }
664 if (!status && ocfs2_meta_ecc(osb) && triggers) 664 if (!status && ocfs2_meta_ecc(osb) && triggers)
665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers); 665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
@@ -2034,7 +2034,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
2034 status = -ENOENT; 2034 status = -ENOENT;
2035 mlog_errno(status); 2035 mlog_errno(status);
2036 return status; 2036 return status;
2037 } 2037 }
2038 2038
2039 mutex_lock(&orphan_dir_inode->i_mutex); 2039 mutex_lock(&orphan_dir_inode->i_mutex);
2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); 2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
476 476
477out: 477out:
478 if (!status) 478 if (!status)
479 ocfs2_init_inode_steal_slot(osb); 479 ocfs2_init_steal_slots(osb);
480 mlog_exit(status); 480 mlog_exit(status);
481 return status; 481 return status;
482} 482}
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 872 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 873 (unsigned long long)blkno);
874 874
875 status = ocfs2_free_clusters(handle, main_bm_inode, 875 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 876 main_bm_inode,
877 main_bm_bh, blkno,
878 count);
877 if (status < 0) { 879 if (status < 0) {
878 mlog_errno(status); 880 mlog_errno(status);
879 goto bail; 881 goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 986 }
985 987
986retry_enospc: 988retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 989 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 990 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 991 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 992 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1062 OCFS2_LA_DISABLED)
1062 goto bail; 1063 goto bail;
1063 1064
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1066 status = ocfs2_claim_clusters(osb, handle, ac,
1065 osb->local_alloc_bits, 1067 osb->local_alloc_bits,
1066 &cluster_off, 1068 &cluster_off,
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..7898bd3a99f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/uio.h> 30#include <linux/uio.h>
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f010b22b1c44..4cbb18f26c5f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
212 } else 212 } else
213 inode->i_gid = current_fsgid(); 213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode; 214 inode->i_mode = mode;
215 vfs_dq_init(inode); 215 dquot_initialize(inode);
216 return inode; 216 return inode;
217} 217}
218 218
@@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir,
244 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
245 dentry->d_name.name); 245 dentry->d_name.name);
246 246
247 dquot_initialize(dir);
248
247 /* get our super block */ 249 /* get our super block */
248 osb = OCFS2_SB(dir->i_sb); 250 osb = OCFS2_SB(dir->i_sb);
249 251
@@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir,
348 goto leave; 350 goto leave;
349 } 351 }
350 352
351 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 353 status = dquot_alloc_inode(inode);
352 * to be called. */ 354 if (status)
353 if (sb_any_quota_active(osb->sb) &&
354 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
355 status = -EDQUOT;
356 goto leave; 355 goto leave;
357 }
358 did_quota_inode = 1; 356 did_quota_inode = 1;
359 357
360 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 358 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -410,28 +408,33 @@ static int ocfs2_mknod(struct inode *dir,
410 } 408 }
411 } 409 }
412 410
413 status = ocfs2_add_entry(handle, dentry, inode, 411 /*
414 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 412 * Do this before adding the entry to the directory. We add
415 &lookup); 413 * also set d_op after success so that ->d_iput() will cleanup
416 if (status < 0) { 414 * the dentry lock even if ocfs2_add_entry() fails below.
415 */
416 status = ocfs2_dentry_attach_lock(dentry, inode,
417 OCFS2_I(dir)->ip_blkno);
418 if (status) {
417 mlog_errno(status); 419 mlog_errno(status);
418 goto leave; 420 goto leave;
419 } 421 }
422 dentry->d_op = &ocfs2_dentry_ops;
420 423
421 status = ocfs2_dentry_attach_lock(dentry, inode, 424 status = ocfs2_add_entry(handle, dentry, inode,
422 OCFS2_I(dir)->ip_blkno); 425 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
423 if (status) { 426 &lookup);
427 if (status < 0) {
424 mlog_errno(status); 428 mlog_errno(status);
425 goto leave; 429 goto leave;
426 } 430 }
427 431
428 insert_inode_hash(inode); 432 insert_inode_hash(inode);
429 dentry->d_op = &ocfs2_dentry_ops;
430 d_instantiate(dentry, inode); 433 d_instantiate(dentry, inode);
431 status = 0; 434 status = 0;
432leave: 435leave:
433 if (status < 0 && did_quota_inode) 436 if (status < 0 && did_quota_inode)
434 vfs_dq_free_inode(inode); 437 dquot_free_inode(inode);
435 if (handle) 438 if (handle)
436 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
437 440
@@ -447,11 +450,6 @@ leave:
447 450
448 ocfs2_free_dir_lookup_result(&lookup); 451 ocfs2_free_dir_lookup_result(&lookup);
449 452
450 if ((status < 0) && inode) {
451 clear_nlink(inode);
452 iput(inode);
453 }
454
455 if (inode_ac) 453 if (inode_ac)
456 ocfs2_free_alloc_context(inode_ac); 454 ocfs2_free_alloc_context(inode_ac);
457 455
@@ -461,6 +459,17 @@ leave:
461 if (meta_ac) 459 if (meta_ac)
462 ocfs2_free_alloc_context(meta_ac); 460 ocfs2_free_alloc_context(meta_ac);
463 461
462 /*
463 * We should call iput after the i_mutex of the bitmap been
464 * unlocked in ocfs2_free_alloc_context, or the
465 * ocfs2_delete_inode will mutex_lock again.
466 */
467 if ((status < 0) && inode) {
468 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
469 clear_nlink(inode);
470 iput(inode);
471 }
472
464 mlog_exit(status); 473 mlog_exit(status);
465 474
466 return status; 475 return status;
@@ -636,6 +645,8 @@ static int ocfs2_link(struct dentry *old_dentry,
636 if (S_ISDIR(inode->i_mode)) 645 if (S_ISDIR(inode->i_mode))
637 return -EPERM; 646 return -EPERM;
638 647
648 dquot_initialize(dir);
649
639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); 650 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
640 if (err < 0) { 651 if (err < 0) {
641 if (err != -ENOENT) 652 if (err != -ENOENT)
@@ -791,6 +802,8 @@ static int ocfs2_unlink(struct inode *dir,
791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 802 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
792 dentry->d_name.len, dentry->d_name.name); 803 dentry->d_name.len, dentry->d_name.name);
793 804
805 dquot_initialize(dir);
806
794 BUG_ON(dentry->d_parent->d_inode != dir); 807 BUG_ON(dentry->d_parent->d_inode != dir);
795 808
796 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 809 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -877,7 +890,7 @@ static int ocfs2_unlink(struct inode *dir,
877 fe = (struct ocfs2_dinode *) fe_bh->b_data; 890 fe = (struct ocfs2_dinode *) fe_bh->b_data;
878 891
879 if (inode_is_unlinkable(inode)) { 892 if (inode_is_unlinkable(inode)) {
880 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 893 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
881 &orphan_insert, orphan_dir); 894 &orphan_insert, orphan_dir);
882 if (status < 0) { 895 if (status < 0) {
883 mlog_errno(status); 896 mlog_errno(status);
@@ -1051,6 +1064,9 @@ static int ocfs2_rename(struct inode *old_dir,
1051 old_dentry->d_name.len, old_dentry->d_name.name, 1064 old_dentry->d_name.len, old_dentry->d_name.name,
1052 new_dentry->d_name.len, new_dentry->d_name.name); 1065 new_dentry->d_name.len, new_dentry->d_name.name);
1053 1066
1067 dquot_initialize(old_dir);
1068 dquot_initialize(new_dir);
1069
1054 osb = OCFS2_SB(old_dir->i_sb); 1070 osb = OCFS2_SB(old_dir->i_sb);
1055 1071
1056 if (new_inode) { 1072 if (new_inode) {
@@ -1295,7 +1311,7 @@ static int ocfs2_rename(struct inode *old_dir,
1295 if (S_ISDIR(new_inode->i_mode) || 1311 if (S_ISDIR(new_inode->i_mode) ||
1296 (ocfs2_read_links_count(newfe) == 1)) { 1312 (ocfs2_read_links_count(newfe) == 1)) {
1297 status = ocfs2_orphan_add(osb, handle, new_inode, 1313 status = ocfs2_orphan_add(osb, handle, new_inode,
1298 newfe, orphan_name, 1314 newfe_bh, orphan_name,
1299 &orphan_insert, orphan_dir); 1315 &orphan_insert, orphan_dir);
1300 if (status < 0) { 1316 if (status < 0) {
1301 mlog_errno(status); 1317 mlog_errno(status);
@@ -1599,6 +1615,8 @@ static int ocfs2_symlink(struct inode *dir,
1599 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1615 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1600 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1616 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1601 1617
1618 dquot_initialize(dir);
1619
1602 sb = dir->i_sb; 1620 sb = dir->i_sb;
1603 osb = OCFS2_SB(sb); 1621 osb = OCFS2_SB(sb);
1604 1622
@@ -1688,13 +1706,9 @@ static int ocfs2_symlink(struct inode *dir,
1688 goto bail; 1706 goto bail;
1689 } 1707 }
1690 1708
1691 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 1709 status = dquot_alloc_inode(inode);
1692 * to be called. */ 1710 if (status)
1693 if (sb_any_quota_active(osb->sb) &&
1694 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1695 status = -EDQUOT;
1696 goto bail; 1711 goto bail;
1697 }
1698 did_quota_inode = 1; 1712 did_quota_inode = 1;
1699 1713
1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, 1714 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1730,10 @@ static int ocfs2_symlink(struct inode *dir,
1716 u32 offset = 0; 1730 u32 offset = 0;
1717 1731
1718 inode->i_op = &ocfs2_symlink_inode_operations; 1732 inode->i_op = &ocfs2_symlink_inode_operations;
1719 if (vfs_dq_alloc_space_nodirty(inode, 1733 status = dquot_alloc_space_nodirty(inode,
1720 ocfs2_clusters_to_bytes(osb->sb, 1))) { 1734 ocfs2_clusters_to_bytes(osb->sb, 1));
1721 status = -EDQUOT; 1735 if (status)
1722 goto bail; 1736 goto bail;
1723 }
1724 did_quota = 1; 1737 did_quota = 1;
1725 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1738 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1726 new_fe_bh, 1739 new_fe_bh,
@@ -1769,29 +1782,34 @@ static int ocfs2_symlink(struct inode *dir,
1769 } 1782 }
1770 } 1783 }
1771 1784
1772 status = ocfs2_add_entry(handle, dentry, inode, 1785 /*
1773 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1786 * Do this before adding the entry to the directory. We add
1774 &lookup); 1787 * also set d_op after success so that ->d_iput() will cleanup
1775 if (status < 0) { 1788 * the dentry lock even if ocfs2_add_entry() fails below.
1789 */
1790 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
1791 if (status) {
1776 mlog_errno(status); 1792 mlog_errno(status);
1777 goto bail; 1793 goto bail;
1778 } 1794 }
1795 dentry->d_op = &ocfs2_dentry_ops;
1779 1796
1780 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); 1797 status = ocfs2_add_entry(handle, dentry, inode,
1781 if (status) { 1798 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1799 &lookup);
1800 if (status < 0) {
1782 mlog_errno(status); 1801 mlog_errno(status);
1783 goto bail; 1802 goto bail;
1784 } 1803 }
1785 1804
1786 insert_inode_hash(inode); 1805 insert_inode_hash(inode);
1787 dentry->d_op = &ocfs2_dentry_ops;
1788 d_instantiate(dentry, inode); 1806 d_instantiate(dentry, inode);
1789bail: 1807bail:
1790 if (status < 0 && did_quota) 1808 if (status < 0 && did_quota)
1791 vfs_dq_free_space_nodirty(inode, 1809 dquot_free_space_nodirty(inode,
1792 ocfs2_clusters_to_bytes(osb->sb, 1)); 1810 ocfs2_clusters_to_bytes(osb->sb, 1));
1793 if (status < 0 && did_quota_inode) 1811 if (status < 0 && did_quota_inode)
1794 vfs_dq_free_inode(inode); 1812 dquot_free_inode(inode);
1795 if (handle) 1813 if (handle)
1796 ocfs2_commit_trans(osb, handle); 1814 ocfs2_commit_trans(osb, handle);
1797 1815
@@ -1809,6 +1827,7 @@ bail:
1809 if (xattr_ac) 1827 if (xattr_ac)
1810 ocfs2_free_alloc_context(xattr_ac); 1828 ocfs2_free_alloc_context(xattr_ac);
1811 if ((status < 0) && inode) { 1829 if ((status < 0) && inode) {
1830 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
1812 clear_nlink(inode); 1831 clear_nlink(inode);
1813 iput(inode); 1832 iput(inode);
1814 } 1833 }
@@ -1909,7 +1928,7 @@ leave:
1909static int ocfs2_orphan_add(struct ocfs2_super *osb, 1928static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 handle_t *handle, 1929 handle_t *handle,
1911 struct inode *inode, 1930 struct inode *inode,
1912 struct ocfs2_dinode *fe, 1931 struct buffer_head *fe_bh,
1913 char *name, 1932 char *name,
1914 struct ocfs2_dir_lookup_result *lookup, 1933 struct ocfs2_dir_lookup_result *lookup,
1915 struct inode *orphan_dir_inode) 1934 struct inode *orphan_dir_inode)
@@ -1917,6 +1936,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 struct buffer_head *orphan_dir_bh = NULL; 1936 struct buffer_head *orphan_dir_bh = NULL;
1918 int status = 0; 1937 int status = 0;
1919 struct ocfs2_dinode *orphan_fe; 1938 struct ocfs2_dinode *orphan_fe;
1939 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1920 1940
1921 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1941 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1922 1942
@@ -1957,13 +1977,31 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1957 goto leave; 1977 goto leave;
1958 } 1978 }
1959 1979
1980 /*
1981 * We're going to journal the change of i_flags and i_orphaned_slot.
1982 * It's safe anyway, though some callers may duplicate the journaling.
1983 * Journaling within the func just make the logic look more
1984 * straightforward.
1985 */
1986 status = ocfs2_journal_access_di(handle,
1987 INODE_CACHE(inode),
1988 fe_bh,
1989 OCFS2_JOURNAL_ACCESS_WRITE);
1990 if (status < 0) {
1991 mlog_errno(status);
1992 goto leave;
1993 }
1994
1960 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1995 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1996 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
1961 1997
1962 /* Record which orphan dir our inode now resides 1998 /* Record which orphan dir our inode now resides
1963 * in. delete_inode will use this to determine which orphan 1999 * in. delete_inode will use this to determine which orphan
1964 * dir to lock. */ 2000 * dir to lock. */
1965 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 2001 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1966 2002
2003 ocfs2_journal_dirty(handle, fe_bh);
2004
1967 mlog(0, "Inode %llu orphaned in slot %d\n", 2005 mlog(0, "Inode %llu orphaned in slot %d\n",
1968 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2006 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1969 2007
@@ -2099,15 +2137,12 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2099 goto leave; 2137 goto leave;
2100 } 2138 }
2101 2139
2102 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 2140 status = dquot_alloc_inode(inode);
2103 * to be called. */ 2141 if (status)
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave; 2142 goto leave;
2108 }
2109 did_quota_inode = 1; 2143 did_quota_inode = 1;
2110 2144
2145 inode->i_nlink = 0;
2111 /* do the real work now. */ 2146 /* do the real work now. */
2112 status = ocfs2_mknod_locked(osb, dir, inode, 2147 status = ocfs2_mknod_locked(osb, dir, inode,
2113 0, &new_di_bh, parent_di_bh, handle, 2148 0, &new_di_bh, parent_di_bh, handle,
@@ -2124,7 +2159,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2124 } 2159 }
2125 2160
2126 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2161 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2127 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2162 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2128 &orphan_insert, orphan_dir); 2163 &orphan_insert, orphan_dir);
2129 if (status < 0) { 2164 if (status < 0) {
2130 mlog_errno(status); 2165 mlog_errno(status);
@@ -2136,9 +2171,10 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2136 if (status < 0) 2171 if (status < 0)
2137 mlog_errno(status); 2172 mlog_errno(status);
2138 2173
2174 insert_inode_hash(inode);
2139leave: 2175leave:
2140 if (status < 0 && did_quota_inode) 2176 if (status < 0 && did_quota_inode)
2141 vfs_dq_free_inode(inode); 2177 dquot_free_inode(inode);
2142 if (handle) 2178 if (handle)
2143 ocfs2_commit_trans(osb, handle); 2179 ocfs2_commit_trans(osb, handle);
2144 2180
@@ -2267,6 +2303,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2267 di = (struct ocfs2_dinode *)di_bh->b_data; 2303 di = (struct ocfs2_dinode *)di_bh->b_data;
2268 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2304 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2269 di->i_orphaned_slot = 0; 2305 di->i_orphaned_slot = 0;
2306 inode->i_nlink = 1;
2307 ocfs2_set_links_count(di, inode->i_nlink);
2270 ocfs2_journal_dirty(handle, di_bh); 2308 ocfs2_journal_dirty(handle, di_bh);
2271 2309
2272 status = ocfs2_add_entry(handle, dentry, inode, 2310 status = ocfs2_add_entry(handle, dentry, inode,
@@ -2284,7 +2322,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2284 goto out_commit; 2322 goto out_commit;
2285 } 2323 }
2286 2324
2287 insert_inode_hash(inode);
2288 dentry->d_op = &ocfs2_dentry_ops; 2325 dentry->d_op = &ocfs2_dentry_ops;
2289 d_instantiate(dentry, inode); 2326 d_instantiate(dentry, inode);
2290 status = 0; 2327 status = 0;
@@ -2326,4 +2363,5 @@ const struct inode_operations ocfs2_dir_iops = {
2326 .getxattr = generic_getxattr, 2363 .getxattr = generic_getxattr,
2327 .listxattr = ocfs2_listxattr, 2364 .listxattr = ocfs2_listxattr,
2328 .removexattr = generic_removexattr, 2365 .removexattr = generic_removexattr,
2366 .fiemap = ocfs2_fiemap,
2329}; 2367};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d963d8638709..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
42 42
43#include "ocfs2_fs.h" 43#include "ocfs2_fs.h"
44#include "ocfs2_lockid.h" 44#include "ocfs2_lockid.h"
45#include "ocfs2_ioctl.h"
45 46
46/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
47#include "blockcheck.h" 48#include "blockcheck.h"
@@ -136,6 +137,10 @@ enum ocfs2_unlock_action {
136#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a 137#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
137 call to dlm_lock. Only 138 call to dlm_lock. Only
138 exists with BUSY set. */ 139 exists with BUSY set. */
140#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
141 * from downconverting
142 * before the upconvert
143 * has completed */
139 144
140struct ocfs2_lock_res_ops; 145struct ocfs2_lock_res_ops;
141 146
@@ -155,7 +160,7 @@ struct ocfs2_lock_res {
155 int l_level; 160 int l_level;
156 unsigned int l_ro_holders; 161 unsigned int l_ro_holders;
157 unsigned int l_ex_holders; 162 unsigned int l_ex_holders;
158 union ocfs2_dlm_lksb l_lksb; 163 struct ocfs2_dlm_lksb l_lksb;
159 164
160 /* used from AST/BAST funcs. */ 165 /* used from AST/BAST funcs. */
161 enum ocfs2_ast_action l_action; 166 enum ocfs2_ast_action l_action;
@@ -245,9 +250,11 @@ enum ocfs2_mount_options
245 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 250 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
246 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ 251 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
247 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ 252 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
248 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ 253 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
249 OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ 254 OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access
250 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ 255 control lists */
256 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
257 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
251}; 258};
252 259
253#define OCFS2_OSB_SOFT_RO 0x0001 260#define OCFS2_OSB_SOFT_RO 0x0001
@@ -299,7 +306,9 @@ struct ocfs2_super
299 u32 s_next_generation; 306 u32 s_next_generation;
300 unsigned long osb_flags; 307 unsigned long osb_flags;
301 s16 s_inode_steal_slot; 308 s16 s_inode_steal_slot;
309 s16 s_meta_steal_slot;
302 atomic_t s_num_inodes_stolen; 310 atomic_t s_num_inodes_stolen;
311 atomic_t s_num_meta_stolen;
303 312
304 unsigned long s_mount_opt; 313 unsigned long s_mount_opt;
305 unsigned int s_atime_quantum; 314 unsigned int s_atime_quantum;
@@ -754,35 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
754 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
755} 764}
756 765
757static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
758{ 767{
759 spin_lock(&osb->osb_lock); 768 ext2_set_bit(bit, bitmap);
760 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
761 spin_unlock(&osb->osb_lock);
762 atomic_set(&osb->s_num_inodes_stolen, 0);
763} 769}
770#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
764 771
765static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, 772static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
766 s16 slot)
767{ 773{
768 spin_lock(&osb->osb_lock); 774 ext2_clear_bit(bit, bitmap);
769 osb->s_inode_steal_slot = slot;
770 spin_unlock(&osb->osb_lock);
771}
772
773static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
774{
775 s16 slot;
776
777 spin_lock(&osb->osb_lock);
778 slot = osb->s_inode_steal_slot;
779 spin_unlock(&osb->osb_lock);
780
781 return slot;
782} 775}
776#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
783 777
784#define ocfs2_set_bit ext2_set_bit
785#define ocfs2_clear_bit ext2_clear_bit
786#define ocfs2_test_bit ext2_test_bit 778#define ocfs2_test_bit ext2_test_bit
787#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 779#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
788#define ocfs2_find_next_bit ext2_find_next_bit 780#define ocfs2_find_next_bit ext2_find_next_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e9431e4a5e7c..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -254,63 +254,6 @@
254 * refcount tree */ 254 * refcount tree */
255 255
256/* 256/*
257 * ioctl commands
258 */
259#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
260#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
261#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
262#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
263
264/*
265 * Space reservation / allocation / free ioctls and argument structure
266 * are designed to be compatible with XFS.
267 *
268 * ALLOCSP* and FREESP* are not and will never be supported, but are
269 * included here for completeness.
270 */
271struct ocfs2_space_resv {
272 __s16 l_type;
273 __s16 l_whence;
274 __s64 l_start;
275 __s64 l_len; /* len == 0 means until end of file */
276 __s32 l_sysid;
277 __u32 l_pid;
278 __s32 l_pad[4]; /* reserve area */
279};
280
281#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
282#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
283#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
284#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
285#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
286#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
287#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
288#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
289
290/* Used to pass group descriptor data when online resize is done */
291struct ocfs2_new_group_input {
292 __u64 group; /* Group descriptor's blkno. */
293 __u32 clusters; /* Total number of clusters in this group */
294 __u32 frees; /* Total free clusters in this group */
295 __u16 chain; /* Chain for this group */
296 __u16 reserved1;
297 __u32 reserved2;
298};
299
300#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
313/*
314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 257 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
315 */ 258 */
316#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 259#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
@@ -1202,7 +1145,7 @@ struct ocfs2_local_disk_dqinfo {
1202/* Header of one chunk of a quota file */ 1145/* Header of one chunk of a quota file */
1203struct ocfs2_local_disk_chunk { 1146struct ocfs2_local_disk_chunk {
1204 __le32 dqc_free; /* Number of free entries in the bitmap */ 1147 __le32 dqc_free; /* Number of free entries in the bitmap */
1205 u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding 1148 __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
1206 * chunk of quota file */ 1149 * chunk of quota file */
1207}; 1150};
1208 1151
@@ -1417,9 +1360,16 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
1417 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); 1360 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
1418} 1361}
1419 1362
1420static inline int ocfs2_max_inline_data(int blocksize) 1363static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
1364 struct ocfs2_dinode *di)
1421{ 1365{
1422 return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); 1366 if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
1367 return blocksize -
1368 offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
1369 di->i_xattr_inline_size;
1370 else
1371 return blocksize -
1372 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
1423} 1373}
1424 1374
1425static inline int ocfs2_extent_recs_per_inode(int blocksize) 1375static inline int ocfs2_extent_recs_per_inode(int blocksize)
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_ioctl.h
5 *
6 * Defines OCFS2 ioctls.
7 *
8 * Copyright (C) 2010 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_IOCTL_H
21#define OCFS2_IOCTL_H
22
23/*
24 * ioctl commands
25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
30
31/*
32 * Space reservation / allocation / free ioctls and argument structure
33 * are designed to be compatible with XFS.
34 *
35 * ALLOCSP* and FREESP* are not and will never be supported, but are
36 * included here for completeness.
37 */
38struct ocfs2_space_resv { /* argument type of every OCFS2_IOC_*SP* ioctl defined right after this struct */
39 __s16 l_type;
40 __s16 l_whence;
41 __s64 l_start; /* NOTE(review): presumably an offset interpreted relative to l_whence - confirm in the ioctl handler */
42 __s64 l_len; /* len == 0 means until end of file */
43 __s32 l_sysid;
44 __u32 l_pid;
45 __s32 l_pad[4]; /* reserve area */
46};
47
48#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
49#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
50#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
51#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
52#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
53#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
54#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
55#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
56
57/* Used to pass group descriptor data when online resize is done; argument type of OCFS2_IOC_GROUP_ADD and OCFS2_IOC_GROUP_ADD64. */
58struct ocfs2_new_group_input {
59 __u64 group; /* Group descriptor's blkno. */
60 __u32 clusters; /* Total number of clusters in this group */
61 __u32 frees; /* Total free clusters in this group */
62 __u16 chain; /* Chain for this group */
63 __u16 reserved1; /* NOTE(review): no use visible in this header; presumably padding kept for future use - confirm */
64 __u32 reserved2; /* NOTE(review): same as reserved1 - confirm */
65};
66
67#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
68#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
69#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
70
71/* Used to pass 2 file names to reflink; argument type of OCFS2_IOC_REFLINK. */
72struct reflink_arguments {
73 __u64 old_path; /* NOTE(review): presumably a user-space pointer to the source path carried as a 64-bit value - confirm in the ioctl handler */
74 __u64 new_path; /* NOTE(review): presumably a user-space pointer to the destination path - confirm */
75 __u64 preserve; /* NOTE(review): presumably a boolean selecting whether file attributes are preserved - confirm */
76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78
79#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
23/* 23/*
24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for 24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for
25 * more details. 25 * more details.
26 *
27 * 1.0 - Initial locking version from ocfs2 1.4.
26 */ 28 */
27#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 29#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
28#define OCFS2_LOCKING_PROTOCOL_MINOR 0 30#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index e5df9d170b0c..123bc520a2c0 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,10 +17,6 @@
17 17
18#include "ocfs2.h" 18#include "ocfs2.h"
19 19
20/* Common stuff */
21/* id number of quota format */
22#define QFMT_OCFS2 3
23
24/* 20/*
25 * In-memory structures 21 * In-memory structures
26 */ 22 */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4cad..ab42a74c7539 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -851,13 +852,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
851} 852}
852 853
853const struct dquot_operations ocfs2_quota_operations = { 854const struct dquot_operations ocfs2_quota_operations = {
854 .initialize = dquot_initialize,
855 .drop = dquot_drop,
856 .alloc_space = dquot_alloc_space,
857 .alloc_inode = dquot_alloc_inode,
858 .free_space = dquot_free_space,
859 .free_inode = dquot_free_inode,
860 .transfer = dquot_transfer,
861 .write_dquot = ocfs2_write_dquot, 855 .write_dquot = ocfs2_write_dquot,
862 .acquire_dquot = ocfs2_acquire_dquot, 856 .acquire_dquot = ocfs2_acquire_dquot,
863 .release_dquot = ocfs2_release_dquot, 857 .release_dquot = ocfs2_release_dquot,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 1a2c50a759fa..9ad49305f450 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -457,7 +458,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
457 break; 458 break;
458 } 459 }
459 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; 460 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
460 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { 461 for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
461 qbh = NULL; 462 qbh = NULL;
462 status = ocfs2_read_quota_block(lqinode, 463 status = ocfs2_read_quota_block(lqinode,
463 ol_dqblk_block(sb, chunk, bit), 464 ol_dqblk_block(sb, chunk, bit),
@@ -1325,7 +1326,7 @@ out:
1325 return status; 1326 return status;
1326} 1327}
1327 1328
1328static struct quota_format_ops ocfs2_format_ops = { 1329static const struct quota_format_ops ocfs2_format_ops = {
1329 .check_quota_file = ocfs2_local_check_quota_file, 1330 .check_quota_file = ocfs2_local_check_quota_file,
1330 .read_file_info = ocfs2_local_read_info, 1331 .read_file_info = ocfs2_local_read_info,
1331 .write_file_info = ocfs2_global_write_info, 1332 .write_file_info = ocfs2_global_write_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3a0df7a1b810..5cbcd0f008fc 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -276,7 +275,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
276 spin_unlock(&osb->osb_lock); 275 spin_unlock(&osb->osb_lock);
277} 276}
278 277
279void ocfs2_kref_remove_refcount_tree(struct kref *kref) 278static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
280{ 279{
281 struct ocfs2_refcount_tree *tree = 280 struct ocfs2_refcount_tree *tree =
282 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); 281 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
@@ -524,23 +523,6 @@ out:
524 return ret; 523 return ret;
525} 524}
526 525
527int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
528 struct ocfs2_refcount_tree **ret_tree,
529 struct buffer_head **ref_bh)
530{
531 int ret;
532 u64 ref_blkno;
533
534 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
535 if (ret) {
536 mlog_errno(ret);
537 return ret;
538 }
539
540 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
541 rw, ret_tree, ref_bh);
542}
543
544void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, 526void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
545 struct ocfs2_refcount_tree *tree, int rw) 527 struct ocfs2_refcount_tree *tree, int rw)
546{ 528{
@@ -643,7 +625,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
643 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 625 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
644 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
645 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
646 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
647 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
648 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
649 rb->rf_blkno = cpu_to_le64(first_blkno); 631 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -969,6 +951,103 @@ out:
969} 951}
970 952
971/* 953/*
954 * Find the end range for a leaf refcount block indicated by
955 * el->l_recs[index].e_blkno.
956 */
957static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
958 struct buffer_head *ref_root_bh,
959 struct ocfs2_extent_block *eb,
960 struct ocfs2_extent_list *el,
961 int index, u32 *cpos_end)
962{
963 int ret, i, subtree_root;
964 u32 cpos;
965 u64 blkno;
966 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
967 struct ocfs2_path *left_path = NULL, *right_path = NULL;
968 struct ocfs2_extent_tree et;
969 struct ocfs2_extent_list *tmp_el;
970
971 if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
972 /*
973 * We have a extent rec after index, so just use the e_cpos
974 * of the next extent rec.
975 */
976 *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
977 return 0;
978 }
979
980 if (!eb || (eb && !eb->h_next_leaf_blk)) {
981 /*
982 * We are the last extent rec, so any high cpos should
983 * be stored in this leaf refcount block.
984 */
985 *cpos_end = UINT_MAX;
986 return 0;
987 }
988
989 /*
990 * If the extent block isn't the last one, we have to find
991 * the subtree root between this extent block and the next
992 * leaf extent block and get the corresponding e_cpos from
993 * the subroot. Otherwise we may corrupt the b-tree.
994 */
995 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
996
997 left_path = ocfs2_new_path_from_et(&et);
998 if (!left_path) {
999 ret = -ENOMEM;
1000 mlog_errno(ret);
1001 goto out;
1002 }
1003
1004 cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1005 ret = ocfs2_find_path(ci, left_path, cpos);
1006 if (ret) {
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 right_path = ocfs2_new_path_from_path(left_path);
1012 if (!right_path) {
1013 ret = -ENOMEM;
1014 mlog_errno(ret);
1015 goto out;
1016 }
1017
1018 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1019 if (ret) {
1020 mlog_errno(ret);
1021 goto out;
1022 }
1023
1024 ret = ocfs2_find_path(ci, right_path, cpos);
1025 if (ret) {
1026 mlog_errno(ret);
1027 goto out;
1028 }
1029
1030 subtree_root = ocfs2_find_subtree_root(&et, left_path,
1031 right_path);
1032
1033 tmp_el = left_path->p_node[subtree_root].el;
1034 blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1035 for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
1036 if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1037 *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1038 break;
1039 }
1040 }
1041
1042 BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
1043
1044out:
1045 ocfs2_free_path(left_path);
1046 ocfs2_free_path(right_path);
1047 return ret;
1048}
1049
1050/*
972 * Given a cpos and len, try to find the refcount record which contains cpos. 1051 * Given a cpos and len, try to find the refcount record which contains cpos.
973 * 1. If cpos can be found in one refcount record, return the record. 1052 * 1. If cpos can be found in one refcount record, return the record.
974 * 2. If cpos can't be found, return a fake record which start from cpos 1053 * 2. If cpos can't be found, return a fake record which start from cpos
@@ -983,10 +1062,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
983 struct buffer_head **ret_bh) 1062 struct buffer_head **ret_bh)
984{ 1063{
985 int ret = 0, i, found; 1064 int ret = 0, i, found;
986 u32 low_cpos; 1065 u32 low_cpos, uninitialized_var(cpos_end);
987 struct ocfs2_extent_list *el; 1066 struct ocfs2_extent_list *el;
988 struct ocfs2_extent_rec *tmp, *rec = NULL; 1067 struct ocfs2_extent_rec *rec = NULL;
989 struct ocfs2_extent_block *eb; 1068 struct ocfs2_extent_block *eb = NULL;
990 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; 1069 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
991 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1070 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
992 struct ocfs2_refcount_block *rb = 1071 struct ocfs2_refcount_block *rb =
@@ -1034,12 +1113,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1034 } 1113 }
1035 } 1114 }
1036 1115
1037 /* adjust len when we have ocfs2_extent_rec after it. */ 1116 if (found) {
1038 if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { 1117 ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1039 tmp = &el->l_recs[i+1]; 1118 eb, el, i, &cpos_end);
1119 if (ret) {
1120 mlog_errno(ret);
1121 goto out;
1122 }
1040 1123
1041 if (le32_to_cpu(tmp->e_cpos) < cpos + len) 1124 if (cpos_end < low_cpos + len)
1042 len = le32_to_cpu(tmp->e_cpos) - cpos; 1125 len = cpos_end - low_cpos;
1043 } 1126 }
1044 1127
1045 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), 1128 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
@@ -1246,7 +1329,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1246 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1329 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1247 1330
1248 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1249 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1250 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1251 new_rb->rf_blkno = cpu_to_le64(blkno); 1334 new_rb->rf_blkno = cpu_to_le64(blkno);
1252 new_rb->rf_cpos = cpu_to_le32(0); 1335 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1418,7 +1501,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1418 1501
1419 /* change old and new rl_used accordingly. */ 1502 /* change old and new rl_used accordingly. */
1420 le16_add_cpu(&rl->rl_used, -num_moved); 1503 le16_add_cpu(&rl->rl_used, -num_moved);
1421 new_rl->rl_used = cpu_to_le32(num_moved); 1504 new_rl->rl_used = cpu_to_le16(num_moved);
1422 1505
1423 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1506 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1424 sizeof(struct ocfs2_refcount_rec), 1507 sizeof(struct ocfs2_refcount_rec),
@@ -1492,7 +1575,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1492 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1575 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1493 memset(new_rb, 0, sb->s_blocksize); 1576 memset(new_rb, 0, sb->s_blocksize);
1494 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1495 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1496 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1497 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1498 new_rb->rf_blkno = cpu_to_le64(blkno); 1581 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1797,7 +1880,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1797 recs_need++; 1880 recs_need++;
1798 1881
1799 /* If the leaf block don't have enough record, expand it. */ 1882 /* If the leaf block don't have enough record, expand it. */
1800 if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { 1883 if (le16_to_cpu(rf_list->rl_used) + recs_need >
1884 le16_to_cpu(rf_list->rl_count)) {
1801 struct ocfs2_refcount_rec tmp_rec; 1885 struct ocfs2_refcount_rec tmp_rec;
1802 u64 cpos = le64_to_cpu(orig_rec->r_cpos); 1886 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1803 len = le32_to_cpu(orig_rec->r_clusters); 1887 len = le32_to_cpu(orig_rec->r_clusters);
@@ -1859,7 +1943,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1859 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); 1943 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1860 le64_add_cpu(&tail_rec->r_cpos, 1944 le64_add_cpu(&tail_rec->r_cpos,
1861 le32_to_cpu(tail_rec->r_clusters) - len); 1945 le32_to_cpu(tail_rec->r_clusters) - len);
1862 tail_rec->r_clusters = le32_to_cpu(len); 1946 tail_rec->r_clusters = cpu_to_le32(len);
1863 } 1947 }
1864 1948
1865 /* 1949 /*
@@ -2431,7 +2515,7 @@ out:
2431 * we gonna touch and whether we need to create new blocks. 2515 * we gonna touch and whether we need to create new blocks.
2432 * 2516 *
2433 * Normally the refcount blocks store these refcount should be 2517 * Normally the refcount blocks store these refcount should be
2434 * continguous also, so that we can get the number easily. 2518 * contiguous also, so that we can get the number easily.
2435 * As for meta_ac, we will at most add split 2 refcount record and 2519 * As for meta_ac, we will at most add split 2 refcount record and
2436 * 2 more refcount block, so just check it in a rough way. 2520 * 2 more refcount block, so just check it in a rough way.
2437 * 2521 *
@@ -2860,7 +2944,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2860 2944
2861 while (offset < end) { 2945 while (offset < end) {
2862 page_index = offset >> PAGE_CACHE_SHIFT; 2946 page_index = offset >> PAGE_CACHE_SHIFT;
2863 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 2947 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2864 if (map_end > end) 2948 if (map_end > end)
2865 map_end = end; 2949 map_end = end;
2866 2950
@@ -2872,8 +2956,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2872 2956
2873 page = grab_cache_page(mapping, page_index); 2957 page = grab_cache_page(mapping, page_index);
2874 2958
2875 /* This page can't be dirtied before we CoW it out. */ 2959 /*
2876 BUG_ON(PageDirty(page)); 2960 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
2961 * can't be dirtied before we CoW it out.
2962 */
2963 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2964 BUG_ON(PageDirty(page));
2877 2965
2878 if (!PageUptodate(page)) { 2966 if (!PageUptodate(page)) {
2879 ret = block_read_full_page(page, ocfs2_get_block); 2967 ret = block_read_full_page(page, ocfs2_get_block);
@@ -3085,7 +3173,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3085 3173
3086 while (offset < end) { 3174 while (offset < end) {
3087 page_index = offset >> PAGE_CACHE_SHIFT; 3175 page_index = offset >> PAGE_CACHE_SHIFT;
3088 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 3176 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3089 if (map_end > end) 3177 if (map_end > end)
3090 map_end = end; 3178 map_end = end;
3091 3179
@@ -3840,8 +3928,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode,
3840 } 3928 }
3841 3929
3842 ret = ocfs2_insert_extent(handle, et, cpos, 3930 ret = ocfs2_insert_extent(handle, et, cpos,
3843 cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 3931 ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3844 p_cluster)),
3845 num_clusters, ext_flags, meta_ac); 3932 num_clusters, ext_flags, meta_ac);
3846 if (ret) { 3933 if (ret) {
3847 mlog_errno(ret); 3934 mlog_errno(ret);
@@ -3987,6 +4074,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
3987 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4074 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
3988 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4075 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3989 i_size_write(t_inode, size); 4076 i_size_write(t_inode, size);
4077 t_inode->i_blocks = s_inode->i_blocks;
3990 4078
3991 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
3992 di->i_clusters = s_di->i_clusters; 4080 di->i_clusters = s_di->i_clusters;
@@ -3995,6 +4083,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
3995 di->i_attr = s_di->i_attr; 4083 di->i_attr = s_di->i_attr;
3996 4084
3997 if (preserve) { 4085 if (preserve) {
4086 t_inode->i_uid = s_inode->i_uid;
4087 t_inode->i_gid = s_inode->i_gid;
4088 t_inode->i_mode = s_inode->i_mode;
3998 di->i_uid = s_di->i_uid; 4089 di->i_uid = s_di->i_uid;
3999 di->i_gid = s_di->i_gid; 4090 di->i_gid = s_di->i_gid;
4000 di->i_mode = s_di->i_mode; 4091 di->i_mode = s_di->i_mode;
@@ -4253,8 +4344,8 @@ static int ocfs2_user_path_parent(const char __user *path,
4253 * @new_dentry: target dentry 4344 * @new_dentry: target dentry
4254 * @preserve: if true, preserve all file attributes 4345 * @preserve: if true, preserve all file attributes
4255 */ 4346 */
4256int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, 4347static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4257 struct dentry *new_dentry, bool preserve) 4348 struct dentry *new_dentry, bool preserve)
4258{ 4349{
4259 struct inode *inode = old_dentry->d_inode; 4350 struct inode *inode = old_dentry->d_inode;
4260 int error; 4351 int error;
@@ -4302,7 +4393,7 @@ int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4302 } 4393 }
4303 4394
4304 mutex_lock(&inode->i_mutex); 4395 mutex_lock(&inode->i_mutex);
4305 vfs_dq_init(dir); 4396 dquot_initialize(dir);
4306 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); 4397 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4307 mutex_unlock(&inode->i_mutex); 4398 mutex_unlock(&inode->i_mutex);
4308 if (!error) 4399 if (!error)
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index e49c41050264..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
@@ -161,24 +162,23 @@ static int dlm_status_to_errno(enum dlm_status status)
161 162
162static void o2dlm_lock_ast_wrapper(void *astarg) 163static void o2dlm_lock_ast_wrapper(void *astarg)
163{ 164{
164 BUG_ON(o2cb_stack.sp_proto == NULL); 165 struct ocfs2_dlm_lksb *lksb = astarg;
165 166
166 o2cb_stack.sp_proto->lp_lock_ast(astarg); 167 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
167} 168}
168 169
169static void o2dlm_blocking_ast_wrapper(void *astarg, int level) 170static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
170{ 171{
171 BUG_ON(o2cb_stack.sp_proto == NULL); 172 struct ocfs2_dlm_lksb *lksb = astarg;
172 173
173 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); 174 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
174} 175}
175 176
176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) 177static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
177{ 178{
179 struct ocfs2_dlm_lksb *lksb = astarg;
178 int error = dlm_status_to_errno(status); 180 int error = dlm_status_to_errno(status);
179 181
180 BUG_ON(o2cb_stack.sp_proto == NULL);
181
182 /* 182 /*
183 * In o2dlm, you can get both the lock_ast() for the lock being 183 * In o2dlm, you can get both the lock_ast() for the lock being
184 * granted and the unlock_ast() for the CANCEL failing. A 184 * granted and the unlock_ast() for the CANCEL failing. A
@@ -193,16 +193,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
193 if (status == DLM_CANCELGRANT) 193 if (status == DLM_CANCELGRANT)
194 return; 194 return;
195 195
196 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); 196 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
197} 197}
198 198
199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, 199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
200 int mode, 200 int mode,
201 union ocfs2_dlm_lksb *lksb, 201 struct ocfs2_dlm_lksb *lksb,
202 u32 flags, 202 u32 flags,
203 void *name, 203 void *name,
204 unsigned int namelen, 204 unsigned int namelen)
205 void *astarg)
206{ 205{
207 enum dlm_status status; 206 enum dlm_status status;
208 int o2dlm_mode = mode_to_o2dlm(mode); 207 int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +210,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
211 210
212 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, 211 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
213 o2dlm_flags, name, namelen, 212 o2dlm_flags, name, namelen,
214 o2dlm_lock_ast_wrapper, astarg, 213 o2dlm_lock_ast_wrapper, lksb,
215 o2dlm_blocking_ast_wrapper); 214 o2dlm_blocking_ast_wrapper);
216 ret = dlm_status_to_errno(status); 215 ret = dlm_status_to_errno(status);
217 return ret; 216 return ret;
218} 217}
219 218
220static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, 219static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
221 union ocfs2_dlm_lksb *lksb, 220 struct ocfs2_dlm_lksb *lksb,
222 u32 flags, 221 u32 flags)
223 void *astarg)
224{ 222{
225 enum dlm_status status; 223 enum dlm_status status;
226 int o2dlm_flags = flags_to_o2dlm(flags); 224 int o2dlm_flags = flags_to_o2dlm(flags);
227 int ret; 225 int ret;
228 226
229 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, 227 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
230 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); 228 o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
231 ret = dlm_status_to_errno(status); 229 ret = dlm_status_to_errno(status);
232 return ret; 230 return ret;
233} 231}
234 232
235static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 233static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
236{ 234{
237 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 235 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
238} 236}
@@ -242,17 +240,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
242 * contents, it will zero out the LVB. Thus the caller can always trust 240 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents. 241 * the contents.
244 */ 242 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 243static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
246{ 244{
247 return 1; 245 return 1;
248} 246}
249 247
250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 248static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
251{ 249{
252 return (void *)(lksb->lksb_o2dlm.lvb); 250 return (void *)(lksb->lksb_o2dlm.lvb);
253} 251}
254 252
255static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) 253static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256{ 254{
257 dlm_print_one_lock(lksb->lksb_o2dlm.lockid); 255 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
258} 256}
@@ -277,10 +275,10 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
277 u32 dlm_key; 275 u32 dlm_key;
278 struct dlm_ctxt *dlm; 276 struct dlm_ctxt *dlm;
279 struct o2dlm_private *priv; 277 struct o2dlm_private *priv;
280 struct dlm_protocol_version dlm_version; 278 struct dlm_protocol_version fs_version;
281 279
282 BUG_ON(conn == NULL); 280 BUG_ON(conn == NULL);
283 BUG_ON(o2cb_stack.sp_proto == NULL); 281 BUG_ON(conn->cc_proto == NULL);
284 282
285 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
286 * in the heartbeat universe */ 284 * in the heartbeat universe */
@@ -304,18 +302,18 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
304 /* used by the dlm code to make message headers unique, each 302 /* used by the dlm code to make message headers unique, each
305 * node in this domain must agree on this. */ 303 * node in this domain must agree on this. */
306 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); 304 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
307 dlm_version.pv_major = conn->cc_version.pv_major; 305 fs_version.pv_major = conn->cc_version.pv_major;
308 dlm_version.pv_minor = conn->cc_version.pv_minor; 306 fs_version.pv_minor = conn->cc_version.pv_minor;
309 307
310 dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); 308 dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
311 if (IS_ERR(dlm)) { 309 if (IS_ERR(dlm)) {
312 rc = PTR_ERR(dlm); 310 rc = PTR_ERR(dlm);
313 mlog_errno(rc); 311 mlog_errno(rc);
314 goto out_free; 312 goto out_free;
315 } 313 }
316 314
317 conn->cc_version.pv_major = dlm_version.pv_major; 315 conn->cc_version.pv_major = fs_version.pv_major;
318 conn->cc_version.pv_minor = dlm_version.pv_minor; 316 conn->cc_version.pv_minor = fs_version.pv_minor;
319 conn->cc_lockspace = dlm; 317 conn->cc_lockspace = dlm;
320 318
321 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); 319 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ff4c798a5635..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,11 +21,11 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 29#include "stackglue.h"
30 30
31#include <linux/dlm_plock.h> 31#include <linux/dlm_plock.h>
@@ -63,8 +63,8 @@
63 * negotiated by the client. The client negotiates based on the maximum 63 * negotiated by the client. The client negotiates based on the maximum
64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major 64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
65 * number from the "SETV" message must match 65 * number from the "SETV" message must match
66 * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number 66 * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
67 * must be less than or equal to ...->lp_max_version.pv_minor. 67 * must be less than or equal to ...sp_max_proto.pv_minor.
68 * 68 *
69 * Once this information has been set, mounts will be allowed. From this 69 * Once this information has been set, mounts will be allowed. From this
70 * point on, the "DOWN" message can be sent for node down notification. 70 * point on, the "DOWN" message can be sent for node down notification.
@@ -401,7 +401,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
401 char *ptr = NULL; 401 char *ptr = NULL;
402 struct ocfs2_control_private *p = file->private_data; 402 struct ocfs2_control_private *p = file->private_data;
403 struct ocfs2_protocol_version *max = 403 struct ocfs2_protocol_version *max =
404 &ocfs2_user_plugin.sp_proto->lp_max_version; 404 &ocfs2_user_plugin.sp_max_proto;
405 405
406 if (ocfs2_control_get_handshake_state(file) != 406 if (ocfs2_control_get_handshake_state(file) !=
407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL) 407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +664,10 @@ static void ocfs2_control_exit(void)
664 -rc); 664 -rc);
665} 665}
666 666
667static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
668{
669 struct ocfs2_lock_res *res = astarg;
670 return &res->l_lksb.lksb_fsdlm;
671}
672
673static void fsdlm_lock_ast_wrapper(void *astarg) 667static void fsdlm_lock_ast_wrapper(void *astarg)
674{ 668{
675 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); 669 struct ocfs2_dlm_lksb *lksb = astarg;
676 int status = lksb->sb_status; 670 int status = lksb->lksb_fsdlm.sb_status;
677
678 BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
679 671
680 /* 672 /*
681 * For now we're punting on the issue of other non-standard errors 673 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +680,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
688 */ 680 */
689 681
690 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) 682 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
691 ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); 683 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
692 else 684 else
693 ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); 685 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
694} 686}
695 687
696static void fsdlm_blocking_ast_wrapper(void *astarg, int level) 688static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
697{ 689{
698 BUG_ON(ocfs2_user_plugin.sp_proto == NULL); 690 struct ocfs2_dlm_lksb *lksb = astarg;
699 691
700 ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); 692 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
701} 693}
702 694
703static int user_dlm_lock(struct ocfs2_cluster_connection *conn, 695static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
704 int mode, 696 int mode,
705 union ocfs2_dlm_lksb *lksb, 697 struct ocfs2_dlm_lksb *lksb,
706 u32 flags, 698 u32 flags,
707 void *name, 699 void *name,
708 unsigned int namelen, 700 unsigned int namelen)
709 void *astarg)
710{ 701{
711 int ret; 702 int ret;
712 703
@@ -716,36 +707,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
716 707
717 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, 708 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
718 flags|DLM_LKF_NODLCKWT, name, namelen, 0, 709 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
719 fsdlm_lock_ast_wrapper, astarg, 710 fsdlm_lock_ast_wrapper, lksb,
720 fsdlm_blocking_ast_wrapper); 711 fsdlm_blocking_ast_wrapper);
721 return ret; 712 return ret;
722} 713}
723 714
724static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, 715static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
725 union ocfs2_dlm_lksb *lksb, 716 struct ocfs2_dlm_lksb *lksb,
726 u32 flags, 717 u32 flags)
727 void *astarg)
728{ 718{
729 int ret; 719 int ret;
730 720
731 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, 721 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
732 flags, &lksb->lksb_fsdlm, astarg); 722 flags, &lksb->lksb_fsdlm, lksb);
733 return ret; 723 return ret;
734} 724}
735 725
736static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 726static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
737{ 727{
738 return lksb->lksb_fsdlm.sb_status; 728 return lksb->lksb_fsdlm.sb_status;
739} 729}
740 730
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 731static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
742{ 732{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; 733 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744 734
745 return !invalid; 735 return !invalid;
746} 736}
747 737
748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 738static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
749{ 739{
750 if (!lksb->lksb_fsdlm.sb_lvbptr) 740 if (!lksb->lksb_fsdlm.sb_lvbptr)
751 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + 741 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +743,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
753 return (void *)(lksb->lksb_fsdlm.sb_lvbptr); 743 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
754} 744}
755 745
756static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 746static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
757{ 747{
758} 748}
759 749
@@ -814,7 +804,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
814static int user_cluster_connect(struct ocfs2_cluster_connection *conn) 804static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
815{ 805{
816 dlm_lockspace_t *fsdlm; 806 dlm_lockspace_t *fsdlm;
817 struct ocfs2_live_connection *control; 807 struct ocfs2_live_connection *uninitialized_var(control);
818 int rc = 0; 808 int rc = 0;
819 809
820 BUG_ON(conn == NULL); 810 BUG_ON(conn == NULL);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 3f2f1c45b7b6..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
36#define OCFS2_STACK_PLUGIN_USER "user" 36#define OCFS2_STACK_PLUGIN_USER "user"
37#define OCFS2_MAX_HB_CTL_PATH 256 37#define OCFS2_MAX_HB_CTL_PATH 256
38 38
39static struct ocfs2_locking_protocol *lproto; 39static struct ocfs2_protocol_version locking_max_version;
40static DEFINE_SPINLOCK(ocfs2_stack_lock); 40static DEFINE_SPINLOCK(ocfs2_stack_lock);
41static LIST_HEAD(ocfs2_stack_list); 41static LIST_HEAD(ocfs2_stack_list);
42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; 42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
176 spin_lock(&ocfs2_stack_lock); 176 spin_lock(&ocfs2_stack_lock);
177 if (!ocfs2_stack_lookup(plugin->sp_name)) { 177 if (!ocfs2_stack_lookup(plugin->sp_name)) {
178 plugin->sp_count = 0; 178 plugin->sp_count = 0;
179 plugin->sp_proto = lproto; 179 plugin->sp_max_proto = locking_max_version;
180 list_add(&plugin->sp_list, &ocfs2_stack_list); 180 list_add(&plugin->sp_list, &ocfs2_stack_list);
181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", 181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
182 plugin->sp_name); 182 plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
213} 213}
214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); 214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
215 215
216void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) 216void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
217{ 217{
218 struct ocfs2_stack_plugin *p; 218 struct ocfs2_stack_plugin *p;
219 219
220 BUG_ON(proto == NULL);
221
222 spin_lock(&ocfs2_stack_lock); 220 spin_lock(&ocfs2_stack_lock);
223 BUG_ON(active_stack != NULL); 221 if (memcmp(max_proto, &locking_max_version,
222 sizeof(struct ocfs2_protocol_version))) {
223 BUG_ON(locking_max_version.pv_major != 0);
224 224
225 lproto = proto; 225 locking_max_version = *max_proto;
226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) { 226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
227 p->sp_proto = lproto; 227 p->sp_max_proto = locking_max_version;
228 }
228 } 229 }
229
230 spin_unlock(&ocfs2_stack_lock); 230 spin_unlock(&ocfs2_stack_lock);
231} 231}
232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); 232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
233 233
234 234
235/* 235/*
236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take 236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
237 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the 237 * for the ast and bast functions. They will pass the lksb to the ast
238 * underlying stack plugins need to pilfer the lksb off of the lock_res. 238 * and bast. The caller can wrap the lksb with their own structure to
239 * If some other structure needs to be passed as an astarg, the plugins 239 * get more information.
240 * will need to be given a different avenue to the lksb.
241 */ 240 */
242int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 241int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
243 int mode, 242 int mode,
244 union ocfs2_dlm_lksb *lksb, 243 struct ocfs2_dlm_lksb *lksb,
245 u32 flags, 244 u32 flags,
246 void *name, 245 void *name,
247 unsigned int namelen, 246 unsigned int namelen)
248 struct ocfs2_lock_res *astarg)
249{ 247{
250 BUG_ON(lproto == NULL); 248 if (!lksb->lksb_conn)
251 249 lksb->lksb_conn = conn;
250 else
251 BUG_ON(lksb->lksb_conn != conn);
252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, 252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
253 name, namelen, astarg); 253 name, namelen);
254} 254}
255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); 255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
256 256
257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
258 union ocfs2_dlm_lksb *lksb, 258 struct ocfs2_dlm_lksb *lksb,
259 u32 flags, 259 u32 flags)
260 struct ocfs2_lock_res *astarg)
261{ 260{
262 BUG_ON(lproto == NULL); 261 BUG_ON(lksb->lksb_conn == NULL);
263 262
264 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); 263 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
265} 264}
266EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); 265EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
267 266
268int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 267int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
269{ 268{
270 return active_stack->sp_ops->lock_status(lksb); 269 return active_stack->sp_ops->lock_status(lksb);
271} 270}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 271EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 272
274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 273int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
275{ 274{
276 return active_stack->sp_ops->lvb_valid(lksb); 275 return active_stack->sp_ops->lvb_valid(lksb);
277} 276}
278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); 277EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279 278
280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 279void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
281{ 280{
282 return active_stack->sp_ops->lock_lvb(lksb); 281 return active_stack->sp_ops->lock_lvb(lksb);
283} 282}
284EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); 283EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
285 284
286void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 285void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
287{ 286{
288 active_stack->sp_ops->dump_lksb(lksb); 287 active_stack->sp_ops->dump_lksb(lksb);
289} 288}
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
312int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
313 const char *group, 312 const char *group,
314 int grouplen, 313 int grouplen,
314 struct ocfs2_locking_protocol *lproto,
315 void (*recovery_handler)(int node_num, 315 void (*recovery_handler)(int node_num,
316 void *recovery_data), 316 void *recovery_data),
317 void *recovery_data, 317 void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
329 goto out; 329 goto out;
330 } 330 }
331 331
332 if (memcmp(&lproto->lp_max_version, &locking_max_version,
333 sizeof(struct ocfs2_protocol_version))) {
334 rc = -EINVAL;
335 goto out;
336 }
337
332 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), 338 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
333 GFP_KERNEL); 339 GFP_KERNEL);
334 if (!new_conn) { 340 if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
341 new_conn->cc_recovery_handler = recovery_handler; 347 new_conn->cc_recovery_handler = recovery_handler;
342 new_conn->cc_recovery_data = recovery_data; 348 new_conn->cc_recovery_data = recovery_data;
343 349
350 new_conn->cc_proto = lproto;
344 /* Start the new connection at our maximum compatibility level */ 351 /* Start the new connection at our maximum compatibility level */
345 new_conn->cc_version = lproto->lp_max_version; 352 new_conn->cc_version = lproto->lp_max_version;
346 353
@@ -366,6 +373,24 @@ out:
366} 373}
367EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); 374EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
368 375
376/* The caller will ensure all nodes have the same cluster stack */
377int ocfs2_cluster_connect_agnostic(const char *group,
378 int grouplen,
379 struct ocfs2_locking_protocol *lproto,
380 void (*recovery_handler)(int node_num,
381 void *recovery_data),
382 void *recovery_data,
383 struct ocfs2_cluster_connection **conn)
384{
385 char *stack_name = NULL;
386
387 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
390 recovery_handler, recovery_data, conn);
391}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393
369/* If hangup_pending is 0, the stack driver will be dropped */ 394/* If hangup_pending is 0, the stack driver will be dropped */
370int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 395int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
371 int hangup_pending) 396 int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
453 ssize_t ret = 0; 478 ssize_t ret = 0;
454 479
455 spin_lock(&ocfs2_stack_lock); 480 spin_lock(&ocfs2_stack_lock);
456 if (lproto) 481 if (locking_max_version.pv_major)
457 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", 482 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
458 lproto->lp_max_version.pv_major, 483 locking_max_version.pv_major,
459 lproto->lp_max_version.pv_minor); 484 locking_max_version.pv_minor);
460 spin_unlock(&ocfs2_stack_lock); 485 spin_unlock(&ocfs2_stack_lock);
461 486
462 return ret; 487 return ret;
@@ -620,51 +645,46 @@ error:
620 645
621static ctl_table ocfs2_nm_table[] = { 646static ctl_table ocfs2_nm_table[] = {
622 { 647 {
623 .ctl_name = 1,
624 .procname = "hb_ctl_path", 648 .procname = "hb_ctl_path",
625 .data = ocfs2_hb_ctl_path, 649 .data = ocfs2_hb_ctl_path,
626 .maxlen = OCFS2_MAX_HB_CTL_PATH, 650 .maxlen = OCFS2_MAX_HB_CTL_PATH,
627 .mode = 0644, 651 .mode = 0644,
628 .proc_handler = &proc_dostring, 652 .proc_handler = proc_dostring,
629 .strategy = &sysctl_string,
630 }, 653 },
631 { .ctl_name = 0 } 654 { }
632}; 655};
633 656
634static ctl_table ocfs2_mod_table[] = { 657static ctl_table ocfs2_mod_table[] = {
635 { 658 {
636 .ctl_name = FS_OCFS2_NM,
637 .procname = "nm", 659 .procname = "nm",
638 .data = NULL, 660 .data = NULL,
639 .maxlen = 0, 661 .maxlen = 0,
640 .mode = 0555, 662 .mode = 0555,
641 .child = ocfs2_nm_table 663 .child = ocfs2_nm_table
642 }, 664 },
643 { .ctl_name = 0} 665 { }
644}; 666};
645 667
646static ctl_table ocfs2_kern_table[] = { 668static ctl_table ocfs2_kern_table[] = {
647 { 669 {
648 .ctl_name = FS_OCFS2,
649 .procname = "ocfs2", 670 .procname = "ocfs2",
650 .data = NULL, 671 .data = NULL,
651 .maxlen = 0, 672 .maxlen = 0,
652 .mode = 0555, 673 .mode = 0555,
653 .child = ocfs2_mod_table 674 .child = ocfs2_mod_table
654 }, 675 },
655 { .ctl_name = 0} 676 { }
656}; 677};
657 678
658static ctl_table ocfs2_root_table[] = { 679static ctl_table ocfs2_root_table[] = {
659 { 680 {
660 .ctl_name = CTL_FS,
661 .procname = "fs", 681 .procname = "fs",
662 .data = NULL, 682 .data = NULL,
663 .maxlen = 0, 683 .maxlen = 0,
664 .mode = 0555, 684 .mode = 0555,
665 .child = ocfs2_kern_table 685 .child = ocfs2_kern_table
666 }, 686 },
667 { .ctl_name = 0 } 687 { }
668}; 688};
669 689
670static struct ctl_table_header *ocfs2_table_header = NULL; 690static struct ctl_table_header *ocfs2_table_header = NULL;
@@ -690,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
690 710
691static void __exit ocfs2_stack_glue_exit(void) 711static void __exit ocfs2_stack_glue_exit(void)
692{ 712{
693 lproto = NULL; 713 memset(&locking_max_version, 0,
714 sizeof(struct ocfs2_protocol_version));
715 locking_max_version.pv_major = 0;
716 locking_max_version.pv_minor = 0;
694 ocfs2_sysfs_exit(); 717 ocfs2_sysfs_exit();
695 if (ocfs2_table_header) 718 if (ocfs2_table_header)
696 unregister_sysctl_table(ocfs2_table_header); 719 unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
56}; 56};
57 57
58/* 58/*
59 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
60 */
61struct ocfs2_locking_protocol {
62 struct ocfs2_protocol_version lp_max_version;
63 void (*lp_lock_ast)(void *astarg);
64 void (*lp_blocking_ast)(void *astarg, int level);
65 void (*lp_unlock_ast)(void *astarg, int error);
66};
67
68
69/*
70 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only 59 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
71 * has a pointer to separately allocated lvb space. This struct exists only to 60 * has a pointer to separately allocated lvb space. This struct exists only to
72 * include in the lksb union to make space for a combined dlm_lksb and lvb. 61 * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
81 * size of the union is known. Lock status structures are embedded in 70 * size of the union is known. Lock status structures are embedded in
82 * ocfs2 inodes. 71 * ocfs2 inodes.
83 */ 72 */
84union ocfs2_dlm_lksb { 73struct ocfs2_cluster_connection;
85 struct dlm_lockstatus lksb_o2dlm; 74struct ocfs2_dlm_lksb {
86 struct dlm_lksb lksb_fsdlm; 75 union {
87 struct fsdlm_lksb_plus_lvb padding; 76 struct dlm_lockstatus lksb_o2dlm;
77 struct dlm_lksb lksb_fsdlm;
78 struct fsdlm_lksb_plus_lvb padding;
79 };
80 struct ocfs2_cluster_connection *lksb_conn;
81};
82
83/*
84 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
85 */
86struct ocfs2_locking_protocol {
87 struct ocfs2_protocol_version lp_max_version;
88 void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
89 void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
90 void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
88}; 91};
89 92
93
90/* 94/*
91 * A cluster connection. Mostly opaque to ocfs2, the connection holds 95 * A cluster connection. Mostly opaque to ocfs2, the connection holds
92 * state for the underlying stack. ocfs2 does use cc_version to determine 96 * state for the underlying stack. ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
96 char cc_name[GROUP_NAME_MAX]; 100 char cc_name[GROUP_NAME_MAX];
97 int cc_namelen; 101 int cc_namelen;
98 struct ocfs2_protocol_version cc_version; 102 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto;
99 void (*cc_recovery_handler)(int node_num, void *recovery_data); 104 void (*cc_recovery_handler)(int node_num, void *recovery_data);
100 void *cc_recovery_data; 105 void *cc_recovery_data;
101 void *cc_lockspace; 106 void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
155 * 160 *
156 * ast and bast functions are not part of the call because the 161 * ast and bast functions are not part of the call because the
157 * stack will likely want to wrap ast and bast calls before passing 162 * stack will likely want to wrap ast and bast calls before passing
158 * them to stack->sp_proto. 163 * them to conn->cc_proto. There is no astarg. The lksb will
164 * be passed back to the ast and bast functions. The caller can
165 * use this to find their object.
159 */ 166 */
160 int (*dlm_lock)(struct ocfs2_cluster_connection *conn, 167 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
161 int mode, 168 int mode,
162 union ocfs2_dlm_lksb *lksb, 169 struct ocfs2_dlm_lksb *lksb,
163 u32 flags, 170 u32 flags,
164 void *name, 171 void *name,
165 unsigned int namelen, 172 unsigned int namelen);
166 void *astarg);
167 173
168 /* 174 /*
169 * Call the underlying dlm unlock function. The ->dlm_unlock() 175 * Call the underlying dlm unlock function. The ->dlm_unlock()
170 * function should convert the flags as appropriate. 176 * function should convert the flags as appropriate.
171 * 177 *
172 * The unlock ast is not passed, as the stack will want to wrap 178 * The unlock ast is not passed, as the stack will want to wrap
173 * it before calling stack->sp_proto->lp_unlock_ast(). 179 * it before calling conn->cc_proto->lp_unlock_ast(). There is
180 * no astarg. The lksb will be passed back to the unlock ast
181 * function. The caller can use this to find their object.
174 */ 182 */
175 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, 183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
176 union ocfs2_dlm_lksb *lksb, 184 struct ocfs2_dlm_lksb *lksb,
177 u32 flags, 185 u32 flags);
178 void *astarg);
179 186
180 /* 187 /*
181 * Return the status of the current lock status block. The fs 188 * Return the status of the current lock status block. The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
183 * callback pulls out the stack-specific lksb, converts the status 190 * callback pulls out the stack-specific lksb, converts the status
184 * to a proper errno, and returns it. 191 * to a proper errno, and returns it.
185 */ 192 */
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 193 int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
187 194
188 /* 195 /*
189 * Return non-zero if the LVB is valid. 196 * Return non-zero if the LVB is valid.
190 */ 197 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); 198 int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
192 199
193 /* 200 /*
194 * Pull the lvb pointer off of the stack-specific lksb. 201 * Pull the lvb pointer off of the stack-specific lksb.
195 */ 202 */
196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 203 void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
197 204
198 /* 205 /*
199 * Cluster-aware posix locks 206 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
210 * This is an optional debugging hook. If provided, the 217 * This is an optional debugging hook. If provided, the
211 * stack can dump debugging information about this lock. 218 * stack can dump debugging information about this lock.
212 */ 219 */
213 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); 220 void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
214}; 221};
215 222
216/* 223/*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
226 /* These are managed by the stackglue code. */ 233 /* These are managed by the stackglue code. */
227 struct list_head sp_list; 234 struct list_head sp_list;
228 unsigned int sp_count; 235 unsigned int sp_count;
229 struct ocfs2_locking_protocol *sp_proto; 236 struct ocfs2_protocol_version sp_max_proto;
230}; 237};
231 238
232 239
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
234int ocfs2_cluster_connect(const char *stack_name, 241int ocfs2_cluster_connect(const char *stack_name,
235 const char *group, 242 const char *group,
236 int grouplen, 243 int grouplen,
244 struct ocfs2_locking_protocol *lproto,
237 void (*recovery_handler)(int node_num, 245 void (*recovery_handler)(int node_num,
238 void *recovery_data), 246 void *recovery_data),
239 void *recovery_data, 247 void *recovery_data,
240 struct ocfs2_cluster_connection **conn); 248 struct ocfs2_cluster_connection **conn);
249/*
250 * Used by callers that don't store their stack name. They must ensure
251 * all nodes have the same stack.
252 */
253int ocfs2_cluster_connect_agnostic(const char *group,
254 int grouplen,
255 struct ocfs2_locking_protocol *lproto,
256 void (*recovery_handler)(int node_num,
257 void *recovery_data),
258 void *recovery_data,
259 struct ocfs2_cluster_connection **conn);
241int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
242 int hangup_pending); 261 int hangup_pending);
243void ocfs2_cluster_hangup(const char *group, int grouplen); 262void ocfs2_cluster_hangup(const char *group, int grouplen);
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
246struct ocfs2_lock_res; 265struct ocfs2_lock_res;
247int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
248 int mode, 267 int mode,
249 union ocfs2_dlm_lksb *lksb, 268 struct ocfs2_dlm_lksb *lksb,
250 u32 flags, 269 u32 flags,
251 void *name, 270 void *name,
252 unsigned int namelen, 271 unsigned int namelen);
253 struct ocfs2_lock_res *astarg);
254int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 272int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
255 union ocfs2_dlm_lksb *lksb, 273 struct ocfs2_dlm_lksb *lksb,
256 u32 flags, 274 u32 flags);
257 struct ocfs2_lock_res *astarg);
258 275
259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 276int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); 277int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 278void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 279void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
263 280
264int ocfs2_stack_supports_plocks(void); 281int ocfs2_stack_supports_plocks(void);
265int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, 282int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
266 struct file *file, int cmd, struct file_lock *fl); 283 struct file *file, int cmd, struct file_lock *fl);
267 284
268void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 285void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
269 286
270 287
271/* Used by stack plugins */ 288/* Used by stack plugins */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
51#define ALLOC_NEW_GROUP 0x1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2 52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53 53
54#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 95 struct buffer_head *group_bh,
96 unsigned int bit_off, 96 unsigned int bit_off,
97 unsigned int num_bits); 97 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 98static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 99 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 100 struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 145
153#define do_error(fmt, ...) \ 146#define do_error(fmt, ...) \
154 do{ \ 147 do{ \
155 if (clean_error) \ 148 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 149 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 150 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 151 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 153
161static int ocfs2_validate_gd_self(struct super_block *sb, 154static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 155 struct buffer_head *bh,
163 int clean_error) 156 int resize)
164{ 157{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 158 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 159
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 204static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 205 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 206 struct buffer_head *bh,
214 int clean_error) 207 int resize)
215{ 208{
216 unsigned int max_bits; 209 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 210 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 226 return -EINVAL;
234 } 227 }
235 228
236 if (le16_to_cpu(gd->bg_chain) >= 229 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 230 if ((le16_to_cpu(gd->bg_chain) >
231 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232 ((le16_to_cpu(gd->bg_chain) ==
233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 235 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
@@ -637,12 +633,113 @@ bail:
637 return status; 633 return status;
638} 634}
639 635
636static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
637{
638 spin_lock(&osb->osb_lock);
639 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
640 spin_unlock(&osb->osb_lock);
641 atomic_set(&osb->s_num_inodes_stolen, 0);
642}
643
644static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
645{
646 spin_lock(&osb->osb_lock);
647 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
648 spin_unlock(&osb->osb_lock);
649 atomic_set(&osb->s_num_meta_stolen, 0);
650}
651
652void ocfs2_init_steal_slots(struct ocfs2_super *osb)
653{
654 ocfs2_init_inode_steal_slot(osb);
655 ocfs2_init_meta_steal_slot(osb);
656}
657
658static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
659{
660 spin_lock(&osb->osb_lock);
661 if (type == INODE_ALLOC_SYSTEM_INODE)
662 osb->s_inode_steal_slot = slot;
663 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
664 osb->s_meta_steal_slot = slot;
665 spin_unlock(&osb->osb_lock);
666}
667
668static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
669{
670 int slot = OCFS2_INVALID_SLOT;
671
672 spin_lock(&osb->osb_lock);
673 if (type == INODE_ALLOC_SYSTEM_INODE)
674 slot = osb->s_inode_steal_slot;
675 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
676 slot = osb->s_meta_steal_slot;
677 spin_unlock(&osb->osb_lock);
678
679 return slot;
680}
681
682static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
683{
684 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
685}
686
687static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
688{
689 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
690}
691
692static int ocfs2_steal_resource(struct ocfs2_super *osb,
693 struct ocfs2_alloc_context *ac,
694 int type)
695{
696 int i, status = -ENOSPC;
697 int slot = __ocfs2_get_steal_slot(osb, type);
698
699 /* Start to steal resource from the first slot after ours. */
700 if (slot == OCFS2_INVALID_SLOT)
701 slot = osb->slot_num + 1;
702
703 for (i = 0; i < osb->max_slots; i++, slot++) {
704 if (slot == osb->max_slots)
705 slot = 0;
706
707 if (slot == osb->slot_num)
708 continue;
709
710 status = ocfs2_reserve_suballoc_bits(osb, ac,
711 type,
712 (u32)slot, NULL,
713 NOT_ALLOC_NEW_GROUP);
714 if (status >= 0) {
715 __ocfs2_set_steal_slot(osb, slot, type);
716 break;
717 }
718
719 ocfs2_free_ac_resource(ac);
720 }
721
722 return status;
723}
724
725static int ocfs2_steal_inode(struct ocfs2_super *osb,
726 struct ocfs2_alloc_context *ac)
727{
728 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
729}
730
731static int ocfs2_steal_meta(struct ocfs2_super *osb,
732 struct ocfs2_alloc_context *ac)
733{
734 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
735}
736
640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 737int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
641 int blocks, 738 int blocks,
642 struct ocfs2_alloc_context **ac) 739 struct ocfs2_alloc_context **ac)
643{ 740{
644 int status; 741 int status;
645 u32 slot; 742 int slot = ocfs2_get_meta_steal_slot(osb);
646 743
647 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 744 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
648 if (!(*ac)) { 745 if (!(*ac)) {
@@ -653,12 +750,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
653 750
654 (*ac)->ac_bits_wanted = blocks; 751 (*ac)->ac_bits_wanted = blocks;
655 (*ac)->ac_which = OCFS2_AC_USE_META; 752 (*ac)->ac_which = OCFS2_AC_USE_META;
656 slot = osb->slot_num;
657 (*ac)->ac_group_search = ocfs2_block_group_search; 753 (*ac)->ac_group_search = ocfs2_block_group_search;
658 754
755 if (slot != OCFS2_INVALID_SLOT &&
756 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
757 goto extent_steal;
758
759 atomic_set(&osb->s_num_meta_stolen, 0);
659 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 760 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
660 EXTENT_ALLOC_SYSTEM_INODE, 761 EXTENT_ALLOC_SYSTEM_INODE,
661 slot, NULL, ALLOC_NEW_GROUP); 762 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP);
764
765
766 if (status >= 0) {
767 status = 0;
768 if (slot != OCFS2_INVALID_SLOT)
769 ocfs2_init_meta_steal_slot(osb);
770 goto bail;
771 } else if (status < 0 && status != -ENOSPC) {
772 mlog_errno(status);
773 goto bail;
774 }
775
776 ocfs2_free_ac_resource(*ac);
777
778extent_steal:
779 status = ocfs2_steal_meta(osb, *ac);
780 atomic_inc(&osb->s_num_meta_stolen);
662 if (status < 0) { 781 if (status < 0) {
663 if (status != -ENOSPC) 782 if (status != -ENOSPC)
664 mlog_errno(status); 783 mlog_errno(status);
@@ -685,43 +804,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
685 ac); 804 ac);
686} 805}
687 806
688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
689 struct ocfs2_alloc_context *ac)
690{
691 int i, status = -ENOSPC;
692 s16 slot = ocfs2_get_inode_steal_slot(osb);
693
694 /* Start to steal inodes from the first slot after ours. */
695 if (slot == OCFS2_INVALID_SLOT)
696 slot = osb->slot_num + 1;
697
698 for (i = 0; i < osb->max_slots; i++, slot++) {
699 if (slot == osb->max_slots)
700 slot = 0;
701
702 if (slot == osb->slot_num)
703 continue;
704
705 status = ocfs2_reserve_suballoc_bits(osb, ac,
706 INODE_ALLOC_SYSTEM_INODE,
707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
709 if (status >= 0) {
710 ocfs2_set_inode_steal_slot(osb, slot);
711 break;
712 }
713
714 ocfs2_free_ac_resource(ac);
715 }
716
717 return status;
718}
719
720int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 807int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
721 struct ocfs2_alloc_context **ac) 808 struct ocfs2_alloc_context **ac)
722{ 809{
723 int status; 810 int status;
724 s16 slot = ocfs2_get_inode_steal_slot(osb); 811 int slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group; 812 u64 alloc_group;
726 813
727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 814 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +841,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
754 * need to check our slots to see whether there is some space for us. 841 * need to check our slots to see whether there is some space for us.
755 */ 842 */
756 if (slot != OCFS2_INVALID_SLOT && 843 if (slot != OCFS2_INVALID_SLOT &&
757 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) 844 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
758 goto inode_steal; 845 goto inode_steal;
759 846
760 atomic_set(&osb->s_num_inodes_stolen, 0); 847 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group; 848 alloc_group = osb->osb_inode_alloc_group;
762 status = ocfs2_reserve_suballoc_bits(osb, *ac, 849 status = ocfs2_reserve_suballoc_bits(osb, *ac,
763 INODE_ALLOC_SYSTEM_INODE, 850 INODE_ALLOC_SYSTEM_INODE,
764 osb->slot_num, 851 (u32)osb->slot_num,
765 &alloc_group, 852 &alloc_group,
766 ALLOC_NEW_GROUP | 853 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL); 854 ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +876,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
789 ocfs2_free_ac_resource(*ac); 876 ocfs2_free_ac_resource(*ac);
790 877
791inode_steal: 878inode_steal:
792 status = ocfs2_steal_inode_from_other_nodes(osb, *ac); 879 status = ocfs2_steal_inode(osb, *ac);
793 atomic_inc(&osb->s_num_inodes_stolen); 880 atomic_inc(&osb->s_num_inodes_stolen);
794 if (status < 0) { 881 if (status < 0) {
795 if (status != -ENOSPC) 882 if (status != -ENOSPC)
@@ -1884,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1884 bits_wanted, cluster_start, num_clusters); 1971 bits_wanted, cluster_start, num_clusters);
1885} 1972}
1886 1973
1887static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1974static int ocfs2_block_group_clear_bits(handle_t *handle,
1888 struct inode *alloc_inode, 1975 struct inode *alloc_inode,
1889 struct ocfs2_group_desc *bg, 1976 struct ocfs2_group_desc *bg,
1890 struct buffer_head *group_bh, 1977 struct buffer_head *group_bh,
1891 unsigned int bit_off, 1978 unsigned int bit_off,
1892 unsigned int num_bits) 1979 unsigned int num_bits,
1980 void (*undo_fn)(unsigned int bit,
1981 unsigned long *bmap))
1893{ 1982{
1894 int status; 1983 int status;
1895 unsigned int tmp; 1984 unsigned int tmp;
1896 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1897 struct ocfs2_group_desc *undo_bg = NULL; 1985 struct ocfs2_group_desc *undo_bg = NULL;
1898 int cluster_bitmap = 0;
1899 1986
1900 mlog_entry_void(); 1987 mlog_entry_void();
1901 1988
@@ -1905,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1905 1992
1906 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1993 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1907 1994
1908 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1995 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1909 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1910
1911 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1996 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1912 group_bh, journal_type); 1997 group_bh,
1998 undo_fn ?
1999 OCFS2_JOURNAL_ACCESS_UNDO :
2000 OCFS2_JOURNAL_ACCESS_WRITE);
1913 if (status < 0) { 2001 if (status < 0) {
1914 mlog_errno(status); 2002 mlog_errno(status);
1915 goto bail; 2003 goto bail;
1916 } 2004 }
1917 2005
1918 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2006 if (undo_fn) {
1919 cluster_bitmap = 1;
1920
1921 if (cluster_bitmap) {
1922 jbd_lock_bh_state(group_bh); 2007 jbd_lock_bh_state(group_bh);
1923 undo_bg = (struct ocfs2_group_desc *) 2008 undo_bg = (struct ocfs2_group_desc *)
1924 bh2jh(group_bh)->b_committed_data; 2009 bh2jh(group_bh)->b_committed_data;
@@ -1929,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1929 while(tmp--) { 2014 while(tmp--) {
1930 ocfs2_clear_bit((bit_off + tmp), 2015 ocfs2_clear_bit((bit_off + tmp),
1931 (unsigned long *) bg->bg_bitmap); 2016 (unsigned long *) bg->bg_bitmap);
1932 if (cluster_bitmap) 2017 if (undo_fn)
1933 ocfs2_set_bit(bit_off + tmp, 2018 undo_fn(bit_off + tmp,
1934 (unsigned long *) undo_bg->bg_bitmap); 2019 (unsigned long *) undo_bg->bg_bitmap);
1935 } 2020 }
1936 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2021 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1937 2022
1938 if (cluster_bitmap) 2023 if (undo_fn)
1939 jbd_unlock_bh_state(group_bh); 2024 jbd_unlock_bh_state(group_bh);
1940 2025
1941 status = ocfs2_journal_dirty(handle, group_bh); 2026 status = ocfs2_journal_dirty(handle, group_bh);
@@ -1948,12 +2033,14 @@ bail:
1948/* 2033/*
1949 * expects the suballoc inode to already be locked. 2034 * expects the suballoc inode to already be locked.
1950 */ 2035 */
1951int ocfs2_free_suballoc_bits(handle_t *handle, 2036static int _ocfs2_free_suballoc_bits(handle_t *handle,
1952 struct inode *alloc_inode, 2037 struct inode *alloc_inode,
1953 struct buffer_head *alloc_bh, 2038 struct buffer_head *alloc_bh,
1954 unsigned int start_bit, 2039 unsigned int start_bit,
1955 u64 bg_blkno, 2040 u64 bg_blkno,
1956 unsigned int count) 2041 unsigned int count,
2042 void (*undo_fn)(unsigned int bit,
2043 unsigned long *bitmap))
1957{ 2044{
1958 int status = 0; 2045 int status = 0;
1959 u32 tmp_used; 2046 u32 tmp_used;
@@ -1988,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1988 2075
1989 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2076 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1990 group, group_bh, 2077 group, group_bh,
1991 start_bit, count); 2078 start_bit, count, undo_fn);
1992 if (status < 0) { 2079 if (status < 0) {
1993 mlog_errno(status); 2080 mlog_errno(status);
1994 goto bail; 2081 goto bail;
@@ -2019,6 +2106,17 @@ bail:
2019 return status; 2106 return status;
2020} 2107}
2021 2108
2109int ocfs2_free_suballoc_bits(handle_t *handle,
2110 struct inode *alloc_inode,
2111 struct buffer_head *alloc_bh,
2112 unsigned int start_bit,
2113 u64 bg_blkno,
2114 unsigned int count)
2115{
2116 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117 start_bit, bg_blkno, count, NULL);
2118}
2119
2022int ocfs2_free_dinode(handle_t *handle, 2120int ocfs2_free_dinode(handle_t *handle,
2023 struct inode *inode_alloc_inode, 2121 struct inode *inode_alloc_inode,
2024 struct buffer_head *inode_alloc_bh, 2122 struct buffer_head *inode_alloc_bh,
@@ -2032,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
2032 inode_alloc_bh, bit, bg_blkno, 1); 2130 inode_alloc_bh, bit, bg_blkno, 1);
2033} 2131}
2034 2132
2035int ocfs2_free_clusters(handle_t *handle, 2133static int _ocfs2_free_clusters(handle_t *handle,
2036 struct inode *bitmap_inode, 2134 struct inode *bitmap_inode,
2037 struct buffer_head *bitmap_bh, 2135 struct buffer_head *bitmap_bh,
2038 u64 start_blk, 2136 u64 start_blk,
2039 unsigned int num_clusters) 2137 unsigned int num_clusters,
2138 void (*undo_fn)(unsigned int bit,
2139 unsigned long *bitmap))
2040{ 2140{
2041 int status; 2141 int status;
2042 u16 bg_start_bit; 2142 u16 bg_start_bit;
@@ -2063,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
2063 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2163 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2064 (unsigned long long)bg_blkno, bg_start_bit); 2164 (unsigned long long)bg_blkno, bg_start_bit);
2065 2165
2066 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2166 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2067 bg_start_bit, bg_blkno, 2167 bg_start_bit, bg_blkno,
2068 num_clusters); 2168 num_clusters, undo_fn);
2069 if (status < 0) { 2169 if (status < 0) {
2070 mlog_errno(status); 2170 mlog_errno(status);
2071 goto out; 2171 goto out;
@@ -2079,6 +2179,32 @@ out:
2079 return status; 2179 return status;
2080} 2180}
2081 2181
2182int ocfs2_free_clusters(handle_t *handle,
2183 struct inode *bitmap_inode,
2184 struct buffer_head *bitmap_bh,
2185 u64 start_blk,
2186 unsigned int num_clusters)
2187{
2188 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189 start_blk, num_clusters,
2190 _ocfs2_set_bit);
2191}
2192
2193/*
2194 * Give never-used clusters back to the global bitmap. We don't need
2195 * to protect these bits in the undo buffer.
2196 */
2197int ocfs2_release_clusters(handle_t *handle,
2198 struct inode *bitmap_inode,
2199 struct buffer_head *bitmap_bh,
2200 u64 start_blk,
2201 unsigned int num_clusters)
2202{
2203 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204 start_blk, num_clusters,
2205 _ocfs2_clear_bit);
2206}
2207
2082static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2208static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2083{ 2209{
2084 printk("Block Group:\n"); 2210 printk("Block Group:\n");
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
56 is the same as ~0 - unlimited */ 56 is the same as ~0 - unlimited */
57}; 57};
58 58
59void ocfs2_init_steal_slots(struct ocfs2_super *osb);
59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 60void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
60static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) 61static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
61{ 62{
@@ -126,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
126 struct buffer_head *bitmap_bh, 127 struct buffer_head *bitmap_bh,
127 u64 start_blk, 128 u64 start_blk,
128 unsigned int num_clusters); 129 unsigned int num_clusters);
130int ocfs2_release_clusters(handle_t *handle,
131 struct inode *bitmap_inode,
132 struct buffer_head *bitmap_bh,
133 u64 start_blk,
134 unsigned int num_clusters);
129 135
130static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 136static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
131{ 137{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 14f47d2bfe02..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
69#include "xattr.h" 69#include "xattr.h"
70#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h" 71#include "refcounttree.h"
72#include "suballoc.h"
72 73
73#include "buffer_head_io.h" 74#include "buffer_head_io.h"
74 75
@@ -100,6 +101,8 @@ struct mount_options
100static int ocfs2_parse_options(struct super_block *sb, char *options, 101static int ocfs2_parse_options(struct super_block *sb, char *options,
101 struct mount_options *mopt, 102 struct mount_options *mopt,
102 int is_remount); 103 int is_remount);
104static int ocfs2_check_set_options(struct super_block *sb,
105 struct mount_options *options);
103static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); 106static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
104static void ocfs2_put_super(struct super_block *sb); 107static void ocfs2_put_super(struct super_block *sb);
105static int ocfs2_mount_volume(struct super_block *sb); 108static int ocfs2_mount_volume(struct super_block *sb);
@@ -299,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
299 302
300 spin_lock(&osb->osb_lock); 303 spin_lock(&osb->osb_lock);
301 out += snprintf(buf + out, len - out, 304 out += snprintf(buf + out, len - out,
302 "%10s => Slot: %d NumStolen: %d\n", "Steal", 305 "%10s => InodeSlot: %d StolenInodes: %d, "
306 "MetaSlot: %d StolenMeta: %d\n", "Steal",
303 osb->s_inode_steal_slot, 307 osb->s_inode_steal_slot,
304 atomic_read(&osb->s_num_inodes_stolen)); 308 atomic_read(&osb->s_num_inodes_stolen),
309 osb->s_meta_steal_slot,
310 atomic_read(&osb->s_num_meta_stolen));
305 spin_unlock(&osb->osb_lock); 311 spin_unlock(&osb->osb_lock);
306 312
307 out += snprintf(buf + out, len - out, "OrphanScan => "); 313 out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -600,7 +606,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
600 606
601 lock_kernel(); 607 lock_kernel();
602 608
603 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 609 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
610 !ocfs2_check_set_options(sb, &parsed_options)) {
604 ret = -EINVAL; 611 ret = -EINVAL;
605 goto out; 612 goto out;
606 } 613 }
@@ -691,8 +698,6 @@ unlock_osb:
691 if (!ret) { 698 if (!ret) {
692 /* Only save off the new mount options in case of a successful 699 /* Only save off the new mount options in case of a successful
693 * remount. */ 700 * remount. */
694 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
695 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
696 osb->s_mount_opt = parsed_options.mount_opt; 701 osb->s_mount_opt = parsed_options.mount_opt;
697 osb->s_atime_quantum = parsed_options.atime_quantum; 702 osb->s_atime_quantum = parsed_options.atime_quantum;
698 osb->preferred_slot = parsed_options.slot; 703 osb->preferred_slot = parsed_options.slot;
@@ -701,6 +706,10 @@ unlock_osb:
701 706
702 if (!ocfs2_is_hard_readonly(osb)) 707 if (!ocfs2_is_hard_readonly(osb))
703 ocfs2_set_journal_params(osb); 708 ocfs2_set_journal_params(osb);
709
710 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
711 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
712 MS_POSIXACL : 0);
704 } 713 }
705out: 714out:
706 unlock_kernel(); 715 unlock_kernel();
@@ -1011,31 +1020,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1011 brelse(bh); 1020 brelse(bh);
1012 bh = NULL; 1021 bh = NULL;
1013 1022
1014 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) 1023 if (!ocfs2_check_set_options(sb, &parsed_options)) {
1015 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1024 status = -EINVAL;
1016 1025 goto read_super_error;
1026 }
1017 osb->s_mount_opt = parsed_options.mount_opt; 1027 osb->s_mount_opt = parsed_options.mount_opt;
1018 osb->s_atime_quantum = parsed_options.atime_quantum; 1028 osb->s_atime_quantum = parsed_options.atime_quantum;
1019 osb->preferred_slot = parsed_options.slot; 1029 osb->preferred_slot = parsed_options.slot;
1020 osb->osb_commit_interval = parsed_options.commit_interval; 1030 osb->osb_commit_interval = parsed_options.commit_interval;
1021 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
1022 osb->local_alloc_bits = osb->local_alloc_default_bits; 1032 osb->local_alloc_bits = osb->local_alloc_default_bits;
1023 if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
1024 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1025 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1026 status = -EINVAL;
1027 mlog(ML_ERROR, "User quotas were requested, but this "
1028 "filesystem does not have the feature enabled.\n");
1029 goto read_super_error;
1030 }
1031 if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1032 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1033 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1034 status = -EINVAL;
1035 mlog(ML_ERROR, "Group quotas were requested, but this "
1036 "filesystem does not have the feature enabled.\n");
1037 goto read_super_error;
1038 }
1039 1033
1040 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1041 if (status) 1035 if (status)
@@ -1072,7 +1066,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1072 "file system, but write access is " 1066 "file system, but write access is "
1073 "unavailable.\n"); 1067 "unavailable.\n");
1074 else 1068 else
1075 mlog_errno(status); 1069 mlog_errno(status);
1076 goto read_super_error; 1070 goto read_super_error;
1077 } 1071 }
1078 1072
@@ -1245,6 +1239,40 @@ static struct file_system_type ocfs2_fs_type = {
1245 .next = NULL 1239 .next = NULL
1246}; 1240};
1247 1241
1242static int ocfs2_check_set_options(struct super_block *sb,
1243 struct mount_options *options)
1244{
1245 if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
1246 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1247 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1248 mlog(ML_ERROR, "User quotas were requested, but this "
1249 "filesystem does not have the feature enabled.\n");
1250 return 0;
1251 }
1252 if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1253 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1254 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1255 mlog(ML_ERROR, "Group quotas were requested, but this "
1256 "filesystem does not have the feature enabled.\n");
1257 return 0;
1258 }
1259 if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
1260 !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
1261 mlog(ML_ERROR, "ACL support requested but extended attributes "
1262 "feature is not enabled\n");
1263 return 0;
1264 }
1265 /* No ACL setting specified? Use XATTR feature... */
1266 if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
1267 OCFS2_MOUNT_NO_POSIX_ACL))) {
1268 if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
1269 options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1270 else
1271 options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1272 }
1273 return 1;
1274}
1275
1248static int ocfs2_parse_options(struct super_block *sb, 1276static int ocfs2_parse_options(struct super_block *sb,
1249 char *options, 1277 char *options,
1250 struct mount_options *mopt, 1278 struct mount_options *mopt,
@@ -1392,40 +1420,19 @@ static int ocfs2_parse_options(struct super_block *sb,
1392 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1420 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
1393 break; 1421 break;
1394 case Opt_usrquota: 1422 case Opt_usrquota:
1395 /* We check only on remount, otherwise features
1396 * aren't yet initialized. */
1397 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1398 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1399 mlog(ML_ERROR, "User quota requested but "
1400 "filesystem feature is not set\n");
1401 status = 0;
1402 goto bail;
1403 }
1404 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; 1423 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1405 break; 1424 break;
1406 case Opt_grpquota: 1425 case Opt_grpquota:
1407 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1408 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1409 mlog(ML_ERROR, "Group quota requested but "
1410 "filesystem feature is not set\n");
1411 status = 0;
1412 goto bail;
1413 }
1414 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1426 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1415 break; 1427 break;
1416#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1417 case Opt_acl: 1428 case Opt_acl:
1418 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1429 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1430 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
1419 break; 1431 break;
1420 case Opt_noacl: 1432 case Opt_noacl:
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1421 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1422 break; 1435 break;
1423#else
1424 case Opt_acl:
1425 case Opt_noacl:
1426 printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
1427 break;
1428#endif
1429 default: 1436 default:
1430 mlog(ML_ERROR, 1437 mlog(ML_ERROR,
1431 "Unrecognized mount option \"%s\" " 1438 "Unrecognized mount option \"%s\" "
@@ -1502,12 +1509,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1502 if (opts & OCFS2_MOUNT_INODE64) 1509 if (opts & OCFS2_MOUNT_INODE64)
1503 seq_printf(s, ",inode64"); 1510 seq_printf(s, ",inode64");
1504 1511
1505#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1506 if (opts & OCFS2_MOUNT_POSIX_ACL) 1512 if (opts & OCFS2_MOUNT_POSIX_ACL)
1507 seq_printf(s, ",acl"); 1513 seq_printf(s, ",acl");
1508 else 1514 else
1509 seq_printf(s, ",noacl"); 1515 seq_printf(s, ",noacl");
1510#endif
1511 1516
1512 return 0; 1517 return 0;
1513} 1518}
@@ -1996,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1996 osb->blocked_lock_count = 0; 2001 osb->blocked_lock_count = 0;
1997 spin_lock_init(&osb->osb_lock); 2002 spin_lock_init(&osb->osb_lock);
1998 spin_lock_init(&osb->osb_xattr_lock); 2003 spin_lock_init(&osb->osb_xattr_lock);
1999 ocfs2_init_inode_steal_slot(osb); 2004 ocfs2_init_steal_slots(osb);
2000 2005
2001 atomic_set(&osb->alloc_stats.moves, 0); 2006 atomic_set(&osb->alloc_stats.moves, 0);
2002 atomic_set(&osb->alloc_stats.local_data, 0); 2007 atomic_set(&osb->alloc_stats.local_data, 0);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index e3421030a69f..32499d213fc4 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -137,20 +137,20 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
137 } 137 }
138 138
139 memcpy(link, target, len); 139 memcpy(link, target, len);
140 nd_set_link(nd, link);
141 140
142bail: 141bail:
142 nd_set_link(nd, status ? ERR_PTR(status) : link);
143 brelse(bh); 143 brelse(bh);
144 144
145 mlog_exit(status); 145 mlog_exit(status);
146 return status ? ERR_PTR(status) : link; 146 return NULL;
147} 147}
148 148
149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
150{ 150{
151 char *link = cookie; 151 char *link = nd_get_link(nd);
152 152 if (!IS_ERR(link))
153 kfree(link); 153 kfree(link);
154} 154}
155 155
156const struct inode_operations ocfs2_symlink_inode_operations = { 156const struct inode_operations ocfs2_symlink_inode_operations = {
@@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
163 .getxattr = generic_getxattr, 163 .getxattr = generic_getxattr,
164 .listxattr = ocfs2_listxattr, 164 .listxattr = ocfs2_listxattr,
165 .removexattr = generic_removexattr, 165 .removexattr = generic_removexattr,
166 .fiemap = ocfs2_fiemap,
166}; 167};
167const struct inode_operations ocfs2_fast_symlink_inode_operations = { 168const struct inode_operations ocfs2_fast_symlink_inode_operations = {
168 .readlink = ocfs2_readlink, 169 .readlink = ocfs2_readlink,
@@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = {
174 .getxattr = generic_getxattr, 175 .getxattr = generic_getxattr,
175 .listxattr = ocfs2_listxattr, 176 .listxattr = ocfs2_listxattr,
176 .removexattr = generic_removexattr, 177 .removexattr = generic_removexattr,
178 .fiemap = ocfs2_fiemap,
177}; 179};
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index c61369342a27..a0a120e82b97 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -267,8 +267,8 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
267} 267}
268 268
269/* Warning: even if it returns true, this does *not* guarantee that 269/* Warning: even if it returns true, this does *not* guarantee that
270 * the block is stored in our inode metadata cache. 270 * the block is stored in our inode metadata cache.
271 * 271 *
272 * This can be called under lock_buffer() 272 * This can be called under lock_buffer()
273 */ 273 */
274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, 274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fe3419068df2..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
98 98
99struct xattr_handler *ocfs2_xattr_handlers[] = { 99struct xattr_handler *ocfs2_xattr_handlers[] = {
100 &ocfs2_xattr_user_handler, 100 &ocfs2_xattr_user_handler,
101#ifdef CONFIG_OCFS2_FS_POSIX_ACL
102 &ocfs2_xattr_acl_access_handler, 101 &ocfs2_xattr_acl_access_handler,
103 &ocfs2_xattr_acl_default_handler, 102 &ocfs2_xattr_acl_default_handler,
104#endif
105 &ocfs2_xattr_trusted_handler, 103 &ocfs2_xattr_trusted_handler,
106 &ocfs2_xattr_security_handler, 104 &ocfs2_xattr_security_handler,
107 NULL 105 NULL
@@ -109,21 +107,20 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
109 107
110static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
111 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
112#ifdef CONFIG_OCFS2_FS_POSIX_ACL
113 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 110 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
114 = &ocfs2_xattr_acl_access_handler, 111 = &ocfs2_xattr_acl_access_handler,
115 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] 112 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
116 = &ocfs2_xattr_acl_default_handler, 113 = &ocfs2_xattr_acl_default_handler,
117#endif
118 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 114 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
119 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, 115 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
120}; 116};
121 117
122struct ocfs2_xattr_info { 118struct ocfs2_xattr_info {
123 int name_index; 119 int xi_name_index;
124 const char *name; 120 const char *xi_name;
125 const void *value; 121 int xi_name_len;
126 size_t value_len; 122 const void *xi_value;
123 size_t xi_value_len;
127}; 124};
128 125
129struct ocfs2_xattr_search { 126struct ocfs2_xattr_search {
@@ -141,6 +138,115 @@ struct ocfs2_xattr_search {
141 int not_found; 138 int not_found;
142}; 139};
143 140
141/* Operations on struct ocfs2_xa_entry */
142struct ocfs2_xa_loc;
143struct ocfs2_xa_loc_operations {
144 /*
145 * Journal functions
146 */
147 int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
148 int type);
149 void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
150
151 /*
152 * Return a pointer to the appropriate buffer in loc->xl_storage
153 * at the given offset from loc->xl_header.
154 */
155 void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
156
157 /* Can we reuse the existing entry for the new value? */
158 int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
159 struct ocfs2_xattr_info *xi);
160
161 /* How much space is needed for the new value? */
162 int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
163 struct ocfs2_xattr_info *xi);
164
165 /*
166 * Return the offset of the first name+value pair. This is
167 * the start of our downward-filling free space.
168 */
169 int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
170
171 /*
172 * Remove the name+value at this location. Do whatever is
173 * appropriate with the remaining name+value pairs.
174 */
175 void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
176
177 /* Fill xl_entry with a new entry */
178 void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
179
180 /* Add name+value storage to an entry */
181 void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
182
183 /*
184 * Initialize the value buf's access and bh fields for this entry.
185 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
186 */
187 void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
188 struct ocfs2_xattr_value_buf *vb);
189};
190
191/*
192 * Describes an xattr entry location. This is a memory structure
193 * tracking the on-disk structure.
194 */
195struct ocfs2_xa_loc {
196 /* This xattr belongs to this inode */
197 struct inode *xl_inode;
198
199 /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
200 struct ocfs2_xattr_header *xl_header;
201
202 /* Bytes from xl_header to the end of the storage */
203 int xl_size;
204
205 /*
206 * The ocfs2_xattr_entry this location describes. If this is
207 * NULL, this location describes the on-disk structure where it
208 * would have been.
209 */
210 struct ocfs2_xattr_entry *xl_entry;
211
212 /*
213 * Internal housekeeping
214 */
215
216 /* Buffer(s) containing this entry */
217 void *xl_storage;
218
219 /* Operations on the storage backing this location */
220 const struct ocfs2_xa_loc_operations *xl_ops;
221};
222
223/*
224 * Convenience functions to calculate how much space is needed for a
225 * given name+value pair
226 */
227static int namevalue_size(int name_len, uint64_t value_len)
228{
229 if (value_len > OCFS2_XATTR_INLINE_SIZE)
230 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
231 else
232 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
233}
234
235static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
236{
237 return namevalue_size(xi->xi_name_len, xi->xi_value_len);
238}
239
240static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
241{
242 u64 value_len = le64_to_cpu(xe->xe_value_size);
243
244 BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
245 ocfs2_xattr_is_local(xe));
246 return namevalue_size(xe->xe_name_len, value_len);
247}
248
249
144static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, 250static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
145 struct ocfs2_xattr_header *xh, 251 struct ocfs2_xattr_header *xh,
146 int index, 252 int index,
@@ -205,8 +311,6 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
205 int offset, 311 int offset,
206 struct ocfs2_xattr_value_root **xv, 312 struct ocfs2_xattr_value_root **xv,
207 struct buffer_head **bh); 313 struct buffer_head **bh);
208static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
209 const void *value, size_t size, int flags);
210 314
211static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 315static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
212{ 316{
@@ -218,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
218 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); 322 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
219} 323}
220 324
221static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
222{
223 u16 len = sb->s_blocksize -
224 offsetof(struct ocfs2_xattr_header, xh_entries);
225
226 return len / sizeof(struct ocfs2_xattr_entry);
227}
228
229#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) 325#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
230#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) 326#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
231#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) 327#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -469,35 +565,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
469 return hash; 565 return hash;
470} 566}
471 567
472/* 568static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
473 * ocfs2_xattr_hash_entry()
474 *
475 * Compute the hash of an extended attribute.
476 */
477static void ocfs2_xattr_hash_entry(struct inode *inode,
478 struct ocfs2_xattr_header *header,
479 struct ocfs2_xattr_entry *entry)
480{ 569{
481 u32 hash = 0; 570 return namevalue_size(name_len, value_len) +
482 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); 571 sizeof(struct ocfs2_xattr_entry);
483
484 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
485 entry->xe_name_hash = cpu_to_le32(hash);
486
487 return;
488} 572}
489 573
490static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) 574static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
491{ 575{
492 int size = 0; 576 return namevalue_size_xi(xi) +
493 577 sizeof(struct ocfs2_xattr_entry);
494 if (value_len <= OCFS2_XATTR_INLINE_SIZE) 578}
495 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
496 else
497 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
498 size += sizeof(struct ocfs2_xattr_entry);
499 579
500 return size; 580static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
581{
582 return namevalue_size_xe(xe) +
583 sizeof(struct ocfs2_xattr_entry);
501} 584}
502 585
503int ocfs2_calc_security_init(struct inode *dir, 586int ocfs2_calc_security_init(struct inode *dir,
@@ -1314,452 +1397,897 @@ out:
1314 return ret; 1397 return ret;
1315} 1398}
1316 1399
1317static int ocfs2_xattr_cleanup(struct inode *inode, 1400static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
1318 handle_t *handle, 1401 int num_entries)
1319 struct ocfs2_xattr_info *xi,
1320 struct ocfs2_xattr_search *xs,
1321 struct ocfs2_xattr_value_buf *vb,
1322 size_t offs)
1323{ 1402{
1324 int ret = 0; 1403 int free_space;
1325 size_t name_len = strlen(xi->name);
1326 void *val = xs->base + offs;
1327 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1328 1404
1329 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1405 if (!needed_space)
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1406 return 0;
1331 if (ret) {
1332 mlog_errno(ret);
1333 goto out;
1334 }
1335 /* Decrease xattr count */
1336 le16_add_cpu(&xs->header->xh_count, -1);
1337 /* Remove the xattr entry and tree root which has already be set*/
1338 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
1339 memset(val, 0, size);
1340 1407
1341 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1408 free_space = free_start -
1342 if (ret < 0) 1409 sizeof(struct ocfs2_xattr_header) -
1343 mlog_errno(ret); 1410 (num_entries * sizeof(struct ocfs2_xattr_entry)) -
1344out: 1411 OCFS2_XATTR_HEADER_GAP;
1345 return ret; 1412 if (free_space < 0)
1413 return -EIO;
1414 if (free_space < needed_space)
1415 return -ENOSPC;
1416
1417 return 0;
1346} 1418}
1347 1419
1348static int ocfs2_xattr_update_entry(struct inode *inode, 1420static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
1349 handle_t *handle, 1421 int type)
1350 struct ocfs2_xattr_info *xi,
1351 struct ocfs2_xattr_search *xs,
1352 struct ocfs2_xattr_value_buf *vb,
1353 size_t offs)
1354{ 1422{
1355 int ret; 1423 return loc->xl_ops->xlo_journal_access(handle, loc, type);
1424}
1356 1425
1357 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1426static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
1358 OCFS2_JOURNAL_ACCESS_WRITE); 1427{
1359 if (ret) { 1428 loc->xl_ops->xlo_journal_dirty(handle, loc);
1360 mlog_errno(ret); 1429}
1361 goto out;
1362 }
1363 1430
1364 xs->here->xe_name_offset = cpu_to_le16(offs); 1431/* Give a pointer into the storage for the given offset */
1365 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1432static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
1366 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) 1433{
1367 ocfs2_xattr_set_local(xs->here, 1); 1434 BUG_ON(offset >= loc->xl_size);
1368 else 1435 return loc->xl_ops->xlo_offset_pointer(loc, offset);
1369 ocfs2_xattr_set_local(xs->here, 0); 1436}
1370 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1371 1437
1372 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1438/*
1373 if (ret < 0) 1439 * Wipe the name+value pair and allow the storage to reclaim it. This
1374 mlog_errno(ret); 1440 * must be followed by either removal of the entry or a call to
1375out: 1441 * ocfs2_xa_add_namevalue().
1376 return ret; 1442 */
1443static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
1444{
1445 loc->xl_ops->xlo_wipe_namevalue(loc);
1377} 1446}
1378 1447
1379/* 1448/*
1380 * ocfs2_xattr_set_value_outside() 1449 * Find lowest offset to a name+value pair. This is the start of our
1381 * 1450 * downward-growing free space.
1382 * Set large size value in B tree.
1383 */ 1451 */
1384static int ocfs2_xattr_set_value_outside(struct inode *inode, 1452static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
1385 struct ocfs2_xattr_info *xi,
1386 struct ocfs2_xattr_search *xs,
1387 struct ocfs2_xattr_set_ctxt *ctxt,
1388 struct ocfs2_xattr_value_buf *vb,
1389 size_t offs)
1390{ 1453{
1391 size_t name_len = strlen(xi->name); 1454 return loc->xl_ops->xlo_get_free_start(loc);
1392 void *val = xs->base + offs; 1455}
1393 struct ocfs2_xattr_value_root *xv = NULL;
1394 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1395 int ret = 0;
1396 1456
1397 memset(val, 0, size); 1457/* Can we reuse loc->xl_entry for xi? */
1398 memcpy(val, xi->name, name_len); 1458static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
1399 xv = (struct ocfs2_xattr_value_root *) 1459 struct ocfs2_xattr_info *xi)
1400 (val + OCFS2_XATTR_SIZE(name_len)); 1460{
1401 xv->xr_clusters = 0; 1461 return loc->xl_ops->xlo_can_reuse(loc, xi);
1402 xv->xr_last_eb_blk = 0; 1462}
1403 xv->xr_list.l_tree_depth = 0; 1463
1404 xv->xr_list.l_count = cpu_to_le16(1); 1464/* How much free space is needed to set the new value */
1405 xv->xr_list.l_next_free_rec = 0; 1465static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
1406 vb->vb_xv = xv; 1466 struct ocfs2_xattr_info *xi)
1407 1467{
1408 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); 1468 return loc->xl_ops->xlo_check_space(loc, xi);
1409 if (ret < 0) { 1469}
1410 mlog_errno(ret); 1470
1411 return ret; 1471static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1472{
1473 loc->xl_ops->xlo_add_entry(loc, name_hash);
1474 loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
1475 /*
1476 * We can't leave the new entry's xe_name_offset at zero or
1477 * add_namevalue() will go nuts. We set it to the size of our
1478 * storage so that it can never be less than any other entry.
1479 */
1480 loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
1481}
1482
1483static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
1484 struct ocfs2_xattr_info *xi)
1485{
1486 int size = namevalue_size_xi(xi);
1487 int nameval_offset;
1488 char *nameval_buf;
1489
1490 loc->xl_ops->xlo_add_namevalue(loc, size);
1491 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1492 loc->xl_entry->xe_name_len = xi->xi_name_len;
1493 ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
1494 ocfs2_xattr_set_local(loc->xl_entry,
1495 xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
1496
1497 nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1498 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
1499 memset(nameval_buf, 0, size);
1500 memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
1501}
1502
1503static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
1504 struct ocfs2_xattr_value_buf *vb)
1505{
1506 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1507 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1508
1509 /* Value bufs are for value trees */
1510 BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
1511 BUG_ON(namevalue_size_xe(loc->xl_entry) !=
1512 (name_size + OCFS2_XATTR_ROOT_SIZE));
1513
1514 loc->xl_ops->xlo_fill_value_buf(loc, vb);
1515 vb->vb_xv =
1516 (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
1517 nameval_offset +
1518 name_size);
1519}
1520
1521static int ocfs2_xa_block_journal_access(handle_t *handle,
1522 struct ocfs2_xa_loc *loc, int type)
1523{
1524 struct buffer_head *bh = loc->xl_storage;
1525 ocfs2_journal_access_func access;
1526
1527 if (loc->xl_size == (bh->b_size -
1528 offsetof(struct ocfs2_xattr_block,
1529 xb_attrs.xb_header)))
1530 access = ocfs2_journal_access_xb;
1531 else
1532 access = ocfs2_journal_access_di;
1533 return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
1534}
1535
1536static void ocfs2_xa_block_journal_dirty(handle_t *handle,
1537 struct ocfs2_xa_loc *loc)
1538{
1539 struct buffer_head *bh = loc->xl_storage;
1540
1541 ocfs2_journal_dirty(handle, bh);
1542}
1543
1544static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
1545 int offset)
1546{
1547 return (char *)loc->xl_header + offset;
1548}
1549
1550static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
1551 struct ocfs2_xattr_info *xi)
1552{
1553 /*
1554 * Block storage is strict. If the sizes aren't exact, we will
1555 * remove the old one and reinsert the new.
1556 */
1557 return namevalue_size_xe(loc->xl_entry) ==
1558 namevalue_size_xi(xi);
1559}
1560
1561static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
1562{
1563 struct ocfs2_xattr_header *xh = loc->xl_header;
1564 int i, count = le16_to_cpu(xh->xh_count);
1565 int offset, free_start = loc->xl_size;
1566
1567 for (i = 0; i < count; i++) {
1568 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1569 if (offset < free_start)
1570 free_start = offset;
1412 } 1571 }
1413 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); 1572
1414 if (ret < 0) { 1573 return free_start;
1415 mlog_errno(ret); 1574}
1416 return ret; 1575
1576static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
1577 struct ocfs2_xattr_info *xi)
1578{
1579 int count = le16_to_cpu(loc->xl_header->xh_count);
1580 int free_start = ocfs2_xa_get_free_start(loc);
1581 int needed_space = ocfs2_xi_entry_usage(xi);
1582
1583 /*
1584 * Block storage will reclaim the original entry before inserting
1585 * the new value, so we only need the difference. If the new
1586 * entry is smaller than the old one, we don't need anything.
1587 */
1588 if (loc->xl_entry) {
1589 /* Don't need space if we're reusing! */
1590 if (ocfs2_xa_can_reuse_entry(loc, xi))
1591 needed_space = 0;
1592 else
1593 needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
1417 } 1594 }
1418 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, 1595 if (needed_space < 0)
1419 xi->value, xi->value_len); 1596 needed_space = 0;
1420 if (ret < 0) 1597 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1421 mlog_errno(ret); 1598}
1422 1599
1423 return ret; 1600/*
1601 * Block storage for xattrs keeps the name+value pairs compacted. When
1602 * we remove one, we have to shift any that preceded it towards the end.
1603 */
1604static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1605{
1606 int i, offset;
1607 int namevalue_offset, first_namevalue_offset, namevalue_size;
1608 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1609 struct ocfs2_xattr_header *xh = loc->xl_header;
1610 int count = le16_to_cpu(xh->xh_count);
1611
1612 namevalue_offset = le16_to_cpu(entry->xe_name_offset);
1613 namevalue_size = namevalue_size_xe(entry);
1614 first_namevalue_offset = ocfs2_xa_get_free_start(loc);
1615
1616 /* Shift the name+value pairs */
1617 memmove((char *)xh + first_namevalue_offset + namevalue_size,
1618 (char *)xh + first_namevalue_offset,
1619 namevalue_offset - first_namevalue_offset);
1620 memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
1621
1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size);
1628 }
1629
1630 /*
1631 * Note that we don't update xh_free_start or xh_name_value_len
1632 * because they're not used in block-stored xattrs.
1633 */
1634}
1635
1636static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1637{
1638 int count = le16_to_cpu(loc->xl_header->xh_count);
1639 loc->xl_entry = &(loc->xl_header->xh_entries[count]);
1640 le16_add_cpu(&loc->xl_header->xh_count, 1);
1641 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1642}
1643
1644static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1645{
1646 int free_start = ocfs2_xa_get_free_start(loc);
1647
1648 loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
1649}
1650
1651static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
1652 struct ocfs2_xattr_value_buf *vb)
1653{
1654 struct buffer_head *bh = loc->xl_storage;
1655
1656 if (loc->xl_size == (bh->b_size -
1657 offsetof(struct ocfs2_xattr_block,
1658 xb_attrs.xb_header)))
1659 vb->vb_access = ocfs2_journal_access_xb;
1660 else
1661 vb->vb_access = ocfs2_journal_access_di;
1662 vb->vb_bh = bh;
1424} 1663}
1425 1664
1426/* 1665/*
1427 * ocfs2_xattr_set_entry_local() 1666 * Operations for xattrs stored in blocks. This includes inline inode
1428 * 1667 * storage and unindexed ocfs2_xattr_blocks.
1429 * Set, replace or remove extended attribute in local.
1430 */ 1668 */
1431static void ocfs2_xattr_set_entry_local(struct inode *inode, 1669static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
1432 struct ocfs2_xattr_info *xi, 1670 .xlo_journal_access = ocfs2_xa_block_journal_access,
1433 struct ocfs2_xattr_search *xs, 1671 .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
1434 struct ocfs2_xattr_entry *last, 1672 .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
1435 size_t min_offs) 1673 .xlo_check_space = ocfs2_xa_block_check_space,
1674 .xlo_can_reuse = ocfs2_xa_block_can_reuse,
1675 .xlo_get_free_start = ocfs2_xa_block_get_free_start,
1676 .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
1677 .xlo_add_entry = ocfs2_xa_block_add_entry,
1678 .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
1679 .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
1680};
1681
1682static int ocfs2_xa_bucket_journal_access(handle_t *handle,
1683 struct ocfs2_xa_loc *loc, int type)
1436{ 1684{
1437 size_t name_len = strlen(xi->name); 1685 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1438 int i;
1439 1686
1440 if (xi->value && xs->not_found) { 1687 return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
1441 /* Insert the new xattr entry. */ 1688}
1442 le16_add_cpu(&xs->header->xh_count, 1); 1689
1443 ocfs2_xattr_set_type(last, xi->name_index); 1690static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
1444 ocfs2_xattr_set_local(last, 1); 1691 struct ocfs2_xa_loc *loc)
1445 last->xe_name_len = name_len; 1692{
1446 } else { 1693 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1447 void *first_val; 1694
1448 void *val; 1695 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
1449 size_t offs, size; 1696}
1450 1697
1451 first_val = xs->base + min_offs; 1698static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
1452 offs = le16_to_cpu(xs->here->xe_name_offset); 1699 int offset)
1453 val = xs->base + offs; 1700{
1454 1701 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1455 if (le64_to_cpu(xs->here->xe_value_size) > 1702 int block, block_offset;
1456 OCFS2_XATTR_INLINE_SIZE) 1703
1457 size = OCFS2_XATTR_SIZE(name_len) + 1704 /* The header is at the front of the bucket */
1458 OCFS2_XATTR_ROOT_SIZE; 1705 block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
1706 block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
1707
1708 return bucket_block(bucket, block) + block_offset;
1709}
1710
1711static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
1712 struct ocfs2_xattr_info *xi)
1713{
1714 return namevalue_size_xe(loc->xl_entry) >=
1715 namevalue_size_xi(xi);
1716}
1717
1718static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
1719{
1720 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1721 return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
1722}
1723
1724static int ocfs2_bucket_align_free_start(struct super_block *sb,
1725 int free_start, int size)
1726{
1727 /*
1728 * We need to make sure that the name+value pair fits within
1729 * one block.
1730 */
1731 if (((free_start - size) >> sb->s_blocksize_bits) !=
1732 ((free_start - 1) >> sb->s_blocksize_bits))
1733 free_start -= free_start % sb->s_blocksize;
1734
1735 return free_start;
1736}
1737
1738static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
1739 struct ocfs2_xattr_info *xi)
1740{
1741 int rc;
1742 int count = le16_to_cpu(loc->xl_header->xh_count);
1743 int free_start = ocfs2_xa_get_free_start(loc);
1744 int needed_space = ocfs2_xi_entry_usage(xi);
1745 int size = namevalue_size_xi(xi);
1746 struct super_block *sb = loc->xl_inode->i_sb;
1747
1748 /*
1749 * Bucket storage does not reclaim name+value pairs it cannot
1750 * reuse. They live as holes until the bucket fills, and then
1751 * the bucket is defragmented. However, the bucket can reclaim
1752 * the ocfs2_xattr_entry.
1753 */
1754 if (loc->xl_entry) {
1755 /* Don't need space if we're reusing! */
1756 if (ocfs2_xa_can_reuse_entry(loc, xi))
1757 needed_space = 0;
1459 else 1758 else
1460 size = OCFS2_XATTR_SIZE(name_len) + 1759 needed_space -= sizeof(struct ocfs2_xattr_entry);
1461 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 1760 }
1462 1761 BUG_ON(needed_space < 0);
1463 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1464 OCFS2_XATTR_SIZE(xi->value_len)) {
1465 /* The old and the new value have the
1466 same size. Just replace the value. */
1467 ocfs2_xattr_set_local(xs->here, 1);
1468 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1469 /* Clear value bytes. */
1470 memset(val + OCFS2_XATTR_SIZE(name_len),
1471 0,
1472 OCFS2_XATTR_SIZE(xi->value_len));
1473 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1474 xi->value,
1475 xi->value_len);
1476 return;
1477 }
1478 /* Remove the old name+value. */
1479 memmove(first_val + size, first_val, val - first_val);
1480 memset(first_val, 0, size);
1481 xs->here->xe_name_hash = 0;
1482 xs->here->xe_name_offset = 0;
1483 ocfs2_xattr_set_local(xs->here, 1);
1484 xs->here->xe_value_size = 0;
1485
1486 min_offs += size;
1487
1488 /* Adjust all value offsets. */
1489 last = xs->header->xh_entries;
1490 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1491 size_t o = le16_to_cpu(last->xe_name_offset);
1492
1493 if (o < offs)
1494 last->xe_name_offset = cpu_to_le16(o + size);
1495 last += 1;
1496 }
1497 1762
1498 if (!xi->value) { 1763 if (free_start < size) {
1499 /* Remove the old entry. */ 1764 if (needed_space)
1500 last -= 1; 1765 return -ENOSPC;
1501 memmove(xs->here, xs->here + 1, 1766 } else {
1502 (void *)last - (void *)xs->here); 1767 /*
1503 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 1768 * First we check if it would fit in the first place.
1504 le16_add_cpu(&xs->header->xh_count, -1); 1769 * Below, we align the free start to a block. This may
1505 } 1770 * slide us below the minimum gap. By checking unaligned
1771 * first, we avoid that error.
1772 */
1773 rc = ocfs2_xa_check_space_helper(needed_space, free_start,
1774 count);
1775 if (rc)
1776 return rc;
1777 free_start = ocfs2_bucket_align_free_start(sb, free_start,
1778 size);
1506 } 1779 }
1507 if (xi->value) { 1780 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1508 /* Insert the new name+value. */ 1781}
1509 size_t size = OCFS2_XATTR_SIZE(name_len) + 1782
1510 OCFS2_XATTR_SIZE(xi->value_len); 1783static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
1511 void *val = xs->base + min_offs - size; 1784{
1785 le16_add_cpu(&loc->xl_header->xh_name_value_len,
1786 -namevalue_size_xe(loc->xl_entry));
1787}
1788
1789static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1790{
1791 struct ocfs2_xattr_header *xh = loc->xl_header;
1792 int count = le16_to_cpu(xh->xh_count);
1793 int low = 0, high = count - 1, tmp;
1794 struct ocfs2_xattr_entry *tmp_xe;
1512 1795
1513 xs->here->xe_name_offset = cpu_to_le16(min_offs - size); 1796 /*
1514 memset(val, 0, size); 1797 * We keep buckets sorted by name_hash, so we need to find
1515 memcpy(val, xi->name, name_len); 1798 * our insert place.
1516 memcpy(val + OCFS2_XATTR_SIZE(name_len), 1799 */
1517 xi->value, 1800 while (low <= high && count) {
1518 xi->value_len); 1801 tmp = (low + high) / 2;
1519 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1802 tmp_xe = &xh->xh_entries[tmp];
1520 ocfs2_xattr_set_local(xs->here, 1); 1803
1521 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1804 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
1805 low = tmp + 1;
1806 else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
1807 high = tmp - 1;
1808 else {
1809 low = tmp;
1810 break;
1811 }
1522 } 1812 }
1523 1813
1524 return; 1814 if (low != count)
1815 memmove(&xh->xh_entries[low + 1],
1816 &xh->xh_entries[low],
1817 ((count - low) * sizeof(struct ocfs2_xattr_entry)));
1818
1819 le16_add_cpu(&xh->xh_count, 1);
1820 loc->xl_entry = &xh->xh_entries[low];
1821 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1822}
1823
1824static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1825{
1826 int free_start = ocfs2_xa_get_free_start(loc);
1827 struct ocfs2_xattr_header *xh = loc->xl_header;
1828 struct super_block *sb = loc->xl_inode->i_sb;
1829 int nameval_offset;
1830
1831 free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
1832 nameval_offset = free_start - size;
1833 loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
1834 xh->xh_free_start = cpu_to_le16(nameval_offset);
1835 le16_add_cpu(&xh->xh_name_value_len, size);
1836
1837}
1838
1839static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
1840 struct ocfs2_xattr_value_buf *vb)
1841{
1842 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1843 struct super_block *sb = loc->xl_inode->i_sb;
1844 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1845 int size = namevalue_size_xe(loc->xl_entry);
1846 int block_offset = nameval_offset >> sb->s_blocksize_bits;
1847
1848 /* Values are not allowed to straddle block boundaries */
1849 BUG_ON(block_offset !=
1850 ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
1851 /* We expect the bucket to be filled in */
1852 BUG_ON(!bucket->bu_bhs[block_offset]);
1853
1854 vb->vb_access = ocfs2_journal_access;
1855 vb->vb_bh = bucket->bu_bhs[block_offset];
1856}
1857
1858/* Operations for xattrs stored in buckets. */
1859static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
1860 .xlo_journal_access = ocfs2_xa_bucket_journal_access,
1861 .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
1862 .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
1863 .xlo_check_space = ocfs2_xa_bucket_check_space,
1864 .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
1865 .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
1866 .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
1867 .xlo_add_entry = ocfs2_xa_bucket_add_entry,
1868 .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
1869 .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
1870};
1871
1872static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
1873{
1874 struct ocfs2_xattr_value_buf vb;
1875
1876 if (ocfs2_xattr_is_local(loc->xl_entry))
1877 return 0;
1878
1879 ocfs2_xa_fill_value_buf(loc, &vb);
1880 return le32_to_cpu(vb.vb_xv->xr_clusters);
1881}
1882
1883static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
1884 struct ocfs2_xattr_set_ctxt *ctxt)
1885{
1886 int trunc_rc, access_rc;
1887 struct ocfs2_xattr_value_buf vb;
1888
1889 ocfs2_xa_fill_value_buf(loc, &vb);
1890 trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
1891 ctxt);
1892
1893 /*
1894 * The caller of ocfs2_xa_value_truncate() has already called
1895 * ocfs2_xa_journal_access on the loc. However, the truncate code
1896 * calls ocfs2_extend_trans(). This may commit the previous
1897 * transaction and open a new one. If this is a bucket, truncate
1898 * could leave only vb->vb_bh set up for journaling. Meanwhile,
1899 * the caller is expecting to dirty the entire bucket. So we must
1900 * reset the journal work. We do this even if truncate has failed,
1901 * as it could have failed after committing the extend.
1902 */
1903 access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
1904 OCFS2_JOURNAL_ACCESS_WRITE);
1905
1906 /* Errors in truncate take precedence */
1907 return trunc_rc ? trunc_rc : access_rc;
1908}
1909
1910static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
1911{
1912 int index, count;
1913 struct ocfs2_xattr_header *xh = loc->xl_header;
1914 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1915
1916 ocfs2_xa_wipe_namevalue(loc);
1917 loc->xl_entry = NULL;
1918
1919 le16_add_cpu(&xh->xh_count, -1);
1920 count = le16_to_cpu(xh->xh_count);
1921
1922 /*
1923 * Only zero out the entry if there are more remaining. This is
1924 * important for an empty bucket, as it keeps track of the
1925 * bucket's hash value. It doesn't hurt empty block storage.
1926 */
1927 if (count) {
1928 index = ((char *)entry - (char *)&xh->xh_entries) /
1929 sizeof(struct ocfs2_xattr_entry);
1930 memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
1931 (count - index) * sizeof(struct ocfs2_xattr_entry));
1932 memset(&xh->xh_entries[count], 0,
1933 sizeof(struct ocfs2_xattr_entry));
1934 }
1525} 1935}
1526 1936
1527/* 1937/*
1528 * ocfs2_xattr_set_entry() 1938 * If we have a problem adjusting the size of an external value during
1939 * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
1940 * in an intermediate state. For example, the value may be partially
1941 * truncated.
1529 * 1942 *
1530 * Set extended attribute entry into inode or block. 1943 * If the value tree hasn't changed, the extend/truncate went nowhere.
1944 * We have nothing to do. The caller can treat it as a straight error.
1531 * 1945 *
1532 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, 1946 * If the value tree got partially truncated, we now have a corrupted
1533 * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), 1947 * extended attribute. We're going to wipe its entry and leak the
1534 * then set value in B tree with set_value_outside(). 1948 * clusters. Better to leak some storage than leave a corrupt entry.
1949 *
1950 * If the value tree grew, it obviously didn't grow enough for the
1951 * new entry. We're not going to try and reclaim those clusters either.
1952 * If there was already an external value there (orig_clusters != 0),
1953 * the new clusters are attached safely and we can just leave the old
1954 * value in place. If there was no external value there, we remove
1955 * the entry.
1956 *
1957 * This way, the xattr block we store in the journal will be consistent.
1958 * If the size change broke because of the journal, no changes will hit
1959 * disk anyway.
1535 */ 1960 */
1536static int ocfs2_xattr_set_entry(struct inode *inode, 1961static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
1537 struct ocfs2_xattr_info *xi, 1962 const char *what,
1538 struct ocfs2_xattr_search *xs, 1963 unsigned int orig_clusters)
1539 struct ocfs2_xattr_set_ctxt *ctxt, 1964{
1540 int flag) 1965 unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
1541{ 1966 char *nameval_buf = ocfs2_xa_offset_pointer(loc,
1542 struct ocfs2_xattr_entry *last; 1967 le16_to_cpu(loc->xl_entry->xe_name_offset));
1543 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1968
1544 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1969 if (new_clusters < orig_clusters) {
1545 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1970 mlog(ML_ERROR,
1546 size_t size_l = 0; 1971 "Partial truncate while %s xattr %.*s. Leaking "
1547 handle_t *handle = ctxt->handle; 1972 "%u clusters and removing the entry\n",
1548 int free, i, ret; 1973 what, loc->xl_entry->xe_name_len, nameval_buf,
1549 struct ocfs2_xattr_info xi_l = { 1974 orig_clusters - new_clusters);
1550 .name_index = xi->name_index, 1975 ocfs2_xa_remove_entry(loc);
1551 .name = xi->name, 1976 } else if (!orig_clusters) {
1552 .value = xi->value, 1977 mlog(ML_ERROR,
1553 .value_len = xi->value_len, 1978 "Unable to allocate an external value for xattr "
1554 }; 1979 "%.*s safely. Leaking %u clusters and removing the "
1555 struct ocfs2_xattr_value_buf vb = { 1980 "entry\n",
1556 .vb_bh = xs->xattr_bh, 1981 loc->xl_entry->xe_name_len, nameval_buf,
1557 .vb_access = ocfs2_journal_access_di, 1982 new_clusters - orig_clusters);
1558 }; 1983 ocfs2_xa_remove_entry(loc);
1984 } else if (new_clusters > orig_clusters)
1985 mlog(ML_ERROR,
1986 "Unable to grow xattr %.*s safely. %u new clusters "
1987 "have been added, but the value will not be "
1988 "modified\n",
1989 loc->xl_entry->xe_name_len, nameval_buf,
1990 new_clusters - orig_clusters);
1991}
1992
1993static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
1994 struct ocfs2_xattr_set_ctxt *ctxt)
1995{
1996 int rc = 0;
1997 unsigned int orig_clusters;
1998
1999 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2000 orig_clusters = ocfs2_xa_value_clusters(loc);
2001 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2002 if (rc) {
2003 mlog_errno(rc);
2004 /*
2005 * Since this is remove, we can return 0 if
2006 * ocfs2_xa_cleanup_value_truncate() is going to
2007 * wipe the entry anyway. So we check the
2008 * cluster count as well.
2009 */
2010 if (orig_clusters != ocfs2_xa_value_clusters(loc))
2011 rc = 0;
2012 ocfs2_xa_cleanup_value_truncate(loc, "removing",
2013 orig_clusters);
2014 if (rc)
2015 goto out;
2016 }
2017 }
1559 2018
1560 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2019 ocfs2_xa_remove_entry(loc);
1561 BUG_ON(xs->xattr_bh == xs->inode_bh);
1562 vb.vb_access = ocfs2_journal_access_xb;
1563 } else
1564 BUG_ON(xs->xattr_bh != xs->inode_bh);
1565 2020
1566 /* Compute min_offs, last and free space. */ 2021out:
1567 last = xs->header->xh_entries; 2022 return rc;
2023}
1568 2024
1569 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { 2025static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
1570 size_t offs = le16_to_cpu(last->xe_name_offset); 2026{
1571 if (offs < min_offs) 2027 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1572 min_offs = offs; 2028 char *nameval_buf;
1573 last += 1;
1574 }
1575 2029
1576 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; 2030 nameval_buf = ocfs2_xa_offset_pointer(loc,
1577 if (free < 0) 2031 le16_to_cpu(loc->xl_entry->xe_name_offset));
1578 return -EIO; 2032 memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
2033}
1579 2034
1580 if (!xs->not_found) { 2035/*
1581 size_t size = 0; 2036 * Take an existing entry and make it ready for the new value. This
1582 if (ocfs2_xattr_is_local(xs->here)) 2037 * won't allocate space, but it may free space. It should be ready for
1583 size = OCFS2_XATTR_SIZE(name_len) + 2038 * ocfs2_xa_prepare_entry() to finish the work.
1584 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 2039 */
1585 else 2040static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
1586 size = OCFS2_XATTR_SIZE(name_len) + 2041 struct ocfs2_xattr_info *xi,
1587 OCFS2_XATTR_ROOT_SIZE; 2042 struct ocfs2_xattr_set_ctxt *ctxt)
1588 free += (size + sizeof(struct ocfs2_xattr_entry)); 2043{
1589 } 2044 int rc = 0;
1590 /* Check free space in inode or block */ 2045 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
1591 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2046 unsigned int orig_clusters;
1592 if (free < sizeof(struct ocfs2_xattr_entry) + 2047 char *nameval_buf;
1593 OCFS2_XATTR_SIZE(name_len) + 2048 int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
1594 OCFS2_XATTR_ROOT_SIZE) { 2049 int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
1595 ret = -ENOSPC; 2050
1596 goto out; 2051 BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
2052 name_size);
2053
2054 nameval_buf = ocfs2_xa_offset_pointer(loc,
2055 le16_to_cpu(loc->xl_entry->xe_name_offset));
2056 if (xe_local) {
2057 memset(nameval_buf + name_size, 0,
2058 namevalue_size_xe(loc->xl_entry) - name_size);
2059 if (!xi_local)
2060 ocfs2_xa_install_value_root(loc);
2061 } else {
2062 orig_clusters = ocfs2_xa_value_clusters(loc);
2063 if (xi_local) {
2064 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2065 if (rc < 0)
2066 mlog_errno(rc);
2067 else
2068 memset(nameval_buf + name_size, 0,
2069 namevalue_size_xe(loc->xl_entry) -
2070 name_size);
2071 } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
2072 xi->xi_value_len) {
2073 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
2074 ctxt);
2075 if (rc < 0)
2076 mlog_errno(rc);
1597 } 2077 }
1598 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 2078
1599 xi_l.value = (void *)&def_xv; 2079 if (rc) {
1600 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; 2080 ocfs2_xa_cleanup_value_truncate(loc, "reusing",
1601 } else if (xi->value) { 2081 orig_clusters);
1602 if (free < sizeof(struct ocfs2_xattr_entry) +
1603 OCFS2_XATTR_SIZE(name_len) +
1604 OCFS2_XATTR_SIZE(xi->value_len)) {
1605 ret = -ENOSPC;
1606 goto out; 2082 goto out;
1607 } 2083 }
1608 } 2084 }
1609 2085
1610 if (!xs->not_found) { 2086 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1611 /* For existing extended attribute */ 2087 ocfs2_xattr_set_local(loc->xl_entry, xi_local);
1612 size_t size = OCFS2_XATTR_SIZE(name_len) +
1613 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1614 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1615 void *val = xs->base + offs;
1616 2088
1617 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 2089out:
1618 /* Replace existing local xattr with tree root */ 2090 return rc;
1619 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 2091}
1620 ctxt, &vb, offs);
1621 if (ret < 0)
1622 mlog_errno(ret);
1623 goto out;
1624 } else if (!ocfs2_xattr_is_local(xs->here)) {
1625 /* For existing xattr which has value outside */
1626 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1627 (val + OCFS2_XATTR_SIZE(name_len));
1628 2092
1629 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2093/*
1630 /* 2094 * Prepares loc->xl_entry to receive the new xattr. This includes
1631 * If new value need set outside also, 2095 * properly setting up the name+value pair region. If loc->xl_entry
1632 * first truncate old value to new value, 2096 * already exists, it will take care of modifying it appropriately.
1633 * then set new value with set_value_outside(). 2097 *
1634 */ 2098 * Note that this modifies the data. You did journal_access already,
1635 ret = ocfs2_xattr_value_truncate(inode, 2099 * right?
1636 &vb, 2100 */
1637 xi->value_len, 2101static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
1638 ctxt); 2102 struct ocfs2_xattr_info *xi,
1639 if (ret < 0) { 2103 u32 name_hash,
1640 mlog_errno(ret); 2104 struct ocfs2_xattr_set_ctxt *ctxt)
1641 goto out; 2105{
1642 } 2106 int rc = 0;
2107 unsigned int orig_clusters;
2108 __le64 orig_value_size = 0;
1643 2109
1644 ret = ocfs2_xattr_update_entry(inode, 2110 rc = ocfs2_xa_check_space(loc, xi);
1645 handle, 2111 if (rc)
1646 xi, 2112 goto out;
1647 xs,
1648 &vb,
1649 offs);
1650 if (ret < 0) {
1651 mlog_errno(ret);
1652 goto out;
1653 }
1654 2113
1655 ret = __ocfs2_xattr_set_value_outside(inode, 2114 if (loc->xl_entry) {
1656 handle, 2115 if (ocfs2_xa_can_reuse_entry(loc, xi)) {
1657 &vb, 2116 orig_value_size = loc->xl_entry->xe_value_size;
1658 xi->value, 2117 rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
1659 xi->value_len); 2118 if (rc)
1660 if (ret < 0) 2119 goto out;
1661 mlog_errno(ret); 2120 goto alloc_value;
2121 }
2122
2123 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2124 orig_clusters = ocfs2_xa_value_clusters(loc);
2125 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2126 if (rc) {
2127 mlog_errno(rc);
2128 ocfs2_xa_cleanup_value_truncate(loc,
2129 "overwriting",
2130 orig_clusters);
1662 goto out; 2131 goto out;
1663 } else {
1664 /*
1665 * If the new value is to be stored locally,
1666 * just truncate the old value to zero.
1667 */
1668 ret = ocfs2_xattr_value_truncate(inode,
1669 &vb,
1670 0,
1671 ctxt);
1672 if (ret < 0)
1673 mlog_errno(ret);
1674 } 2132 }
1675 } 2133 }
2134 ocfs2_xa_wipe_namevalue(loc);
2135 } else
2136 ocfs2_xa_add_entry(loc, name_hash);
2137
2138 /*
2139 * If we get here, we have a blank entry. Fill it. We grow our
2140 * name+value pair back from the end.
2141 */
2142 ocfs2_xa_add_namevalue(loc, xi);
2143 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2144 ocfs2_xa_install_value_root(loc);
2145
2146alloc_value:
2147 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2148 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) {
2151 /*
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanup_value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters);
2160 mlog_errno(rc);
2161 }
1676 } 2162 }
1677 2163
1678 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, 2164out:
2165 return rc;
2166}
2167
2168/*
2169 * Store the value portion of the name+value pair. This will skip
2170 * values that are stored externally. Their tree roots were set up
2171 * by ocfs2_xa_prepare_entry().
2172 */
2173static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
2174 struct ocfs2_xattr_info *xi,
2175 struct ocfs2_xattr_set_ctxt *ctxt)
2176{
2177 int rc = 0;
2178 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
2179 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
2180 char *nameval_buf;
2181 struct ocfs2_xattr_value_buf vb;
2182
2183 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
2184 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2185 ocfs2_xa_fill_value_buf(loc, &vb);
2186 rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
2187 ctxt->handle, &vb,
2188 xi->xi_value,
2189 xi->xi_value_len);
2190 } else
2191 memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
2192
2193 return rc;
2194}
2195
2196static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
2197 struct ocfs2_xattr_info *xi,
2198 struct ocfs2_xattr_set_ctxt *ctxt)
2199{
2200 int ret;
2201 u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
2202 xi->xi_name_len);
2203
2204 ret = ocfs2_xa_journal_access(ctxt->handle, loc,
1679 OCFS2_JOURNAL_ACCESS_WRITE); 2205 OCFS2_JOURNAL_ACCESS_WRITE);
1680 if (ret) { 2206 if (ret) {
1681 mlog_errno(ret); 2207 mlog_errno(ret);
1682 goto out; 2208 goto out;
1683 } 2209 }
1684 2210
1685 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1686 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1687 OCFS2_JOURNAL_ACCESS_WRITE);
1688 if (ret) {
1689 mlog_errno(ret);
1690 goto out;
1691 }
1692 }
1693
1694 /* 2211 /*
1695 * Set value in local, include set tree root in local. 2212 * From here on out, everything is going to modify the buffer a
1696 * This is the first step for value size >INLINE_SIZE. 2213 * little. Errors are going to leave the xattr header in a
2214 * sane state. Thus, even with errors we dirty the sucker.
1697 */ 2215 */
1698 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1699 2216
1700 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2217 /* Don't worry, we are never called with !xi_value and !xl_entry */
1701 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 2218 if (!xi->xi_value) {
1702 if (ret < 0) { 2219 ret = ocfs2_xa_remove(loc, ctxt);
1703 mlog_errno(ret); 2220 goto out_dirty;
1704 goto out;
1705 }
1706 } 2221 }
1707 2222
1708 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && 2223 ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
1709 (flag & OCFS2_INLINE_XATTR_FL)) { 2224 if (ret) {
1710 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 if (ret != -ENOSPC)
1711 unsigned int xattrsize = osb->s_xattr_inline_size; 2226 mlog_errno(ret);
1712 2227 goto out_dirty;
1713 /*
1714 * Adjust extent record count or inline data size
1715 * to reserve space for extended attribute.
1716 */
1717 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1718 struct ocfs2_inline_data *idata = &di->id2.i_data;
1719 le16_add_cpu(&idata->id_count, -xattrsize);
1720 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1721 struct ocfs2_extent_list *el = &di->id2.i_list;
1722 le16_add_cpu(&el->l_count, -(xattrsize /
1723 sizeof(struct ocfs2_extent_rec)));
1724 }
1725 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1726 } 2228 }
1727 /* Update xattr flag */
1728 spin_lock(&oi->ip_lock);
1729 oi->ip_dyn_features |= flag;
1730 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1731 spin_unlock(&oi->ip_lock);
1732 2229
1733 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2230 ret = ocfs2_xa_store_value(loc, xi, ctxt);
1734 if (ret < 0) 2231 if (ret)
1735 mlog_errno(ret); 2232 mlog_errno(ret);
1736 2233
1737 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2234out_dirty:
1738 /* 2235 ocfs2_xa_journal_dirty(ctxt->handle, loc);
1739 * Set value outside in B tree.
1740 * This is the second step for value size > INLINE_SIZE.
1741 */
1742 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1743 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1744 &vb, offs);
1745 if (ret < 0) {
1746 int ret2;
1747 2236
1748 mlog_errno(ret);
1749 /*
1750 * If set value outside failed, we have to clean
1751 * the junk tree root we have already set in local.
1752 */
1753 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1754 xi, xs, &vb, offs);
1755 if (ret2 < 0)
1756 mlog_errno(ret2);
1757 }
1758 }
1759out: 2237out:
1760 return ret; 2238 return ret;
1761} 2239}
1762 2240
2241static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
2242 struct inode *inode,
2243 struct buffer_head *bh,
2244 struct ocfs2_xattr_entry *entry)
2245{
2246 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2247
2248 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
2249
2250 loc->xl_inode = inode;
2251 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2252 loc->xl_storage = bh;
2253 loc->xl_entry = entry;
2254 loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
2255 loc->xl_header =
2256 (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
2257 loc->xl_size);
2258}
2259
2260static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
2261 struct inode *inode,
2262 struct buffer_head *bh,
2263 struct ocfs2_xattr_entry *entry)
2264{
2265 struct ocfs2_xattr_block *xb =
2266 (struct ocfs2_xattr_block *)bh->b_data;
2267
2268 BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
2269
2270 loc->xl_inode = inode;
2271 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2272 loc->xl_storage = bh;
2273 loc->xl_header = &(xb->xb_attrs.xb_header);
2274 loc->xl_entry = entry;
2275 loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
2276 xb_attrs.xb_header);
2277}
2278
2279static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
2280 struct ocfs2_xattr_bucket *bucket,
2281 struct ocfs2_xattr_entry *entry)
2282{
2283 loc->xl_inode = bucket->bu_inode;
2284 loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
2285 loc->xl_storage = bucket;
2286 loc->xl_header = bucket_xh(bucket);
2287 loc->xl_entry = entry;
2288 loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
2289}
2290
1763/* 2291/*
1764 * In xattr remove, if it is stored outside and refcounted, we may have 2292 * In xattr remove, if it is stored outside and refcounted, we may have
1765 * the chance to split the refcount tree. So need the allocators. 2293 * the chance to split the refcount tree. So need the allocators.
@@ -2155,6 +2683,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
2155 return 0; 2683 return 0;
2156} 2684}
2157 2685
2686static int ocfs2_xattr_ibody_init(struct inode *inode,
2687 struct buffer_head *di_bh,
2688 struct ocfs2_xattr_set_ctxt *ctxt)
2689{
2690 int ret;
2691 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2694 unsigned int xattrsize = osb->s_xattr_inline_size;
2695
2696 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2697 ret = -ENOSPC;
2698 goto out;
2699 }
2700
2701 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
2702 OCFS2_JOURNAL_ACCESS_WRITE);
2703 if (ret) {
2704 mlog_errno(ret);
2705 goto out;
2706 }
2707
2708 /*
2709 * Adjust extent record count or inline data size
2710 * to reserve space for extended attribute.
2711 */
2712 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2713 struct ocfs2_inline_data *idata = &di->id2.i_data;
2714 le16_add_cpu(&idata->id_count, -xattrsize);
2715 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
2716 struct ocfs2_extent_list *el = &di->id2.i_list;
2717 le16_add_cpu(&el->l_count, -(xattrsize /
2718 sizeof(struct ocfs2_extent_rec)));
2719 }
2720 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
2721
2722 spin_lock(&oi->ip_lock);
2723 oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock);
2726
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730
2731out:
2732 return ret;
2733}
2734
2158/* 2735/*
2159 * ocfs2_xattr_ibody_set() 2736 * ocfs2_xattr_ibody_set()
2160 * 2737 *
@@ -2166,9 +2743,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2166 struct ocfs2_xattr_search *xs, 2743 struct ocfs2_xattr_search *xs,
2167 struct ocfs2_xattr_set_ctxt *ctxt) 2744 struct ocfs2_xattr_set_ctxt *ctxt)
2168{ 2745{
2746 int ret;
2169 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2747 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2170 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2748 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2171 int ret; 2749 struct ocfs2_xa_loc loc;
2172 2750
2173 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2751 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
2174 return -ENOSPC; 2752 return -ENOSPC;
@@ -2181,8 +2759,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2181 } 2759 }
2182 } 2760 }
2183 2761
2184 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2762 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2185 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2763 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2764 if (ret) {
2765 if (ret != -ENOSPC)
2766 mlog_errno(ret);
2767 goto out;
2768 }
2769 }
2770
2771 ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
2772 xs->not_found ? NULL : xs->here);
2773 ret = ocfs2_xa_set(&loc, xi, ctxt);
2774 if (ret) {
2775 if (ret != -ENOSPC)
2776 mlog_errno(ret);
2777 goto out;
2778 }
2779 xs->here = loc.xl_entry;
2780
2186out: 2781out:
2187 up_write(&oi->ip_alloc_sem); 2782 up_write(&oi->ip_alloc_sem);
2188 2783
@@ -2242,12 +2837,11 @@ cleanup:
2242 return ret; 2837 return ret;
2243} 2838}
2244 2839
2245static int ocfs2_create_xattr_block(handle_t *handle, 2840static int ocfs2_create_xattr_block(struct inode *inode,
2246 struct inode *inode,
2247 struct buffer_head *inode_bh, 2841 struct buffer_head *inode_bh,
2248 struct ocfs2_alloc_context *meta_ac, 2842 struct ocfs2_xattr_set_ctxt *ctxt,
2249 struct buffer_head **ret_bh, 2843 int indexed,
2250 int indexed) 2844 struct buffer_head **ret_bh)
2251{ 2845{
2252 int ret; 2846 int ret;
2253 u16 suballoc_bit_start; 2847 u16 suballoc_bit_start;
@@ -2258,14 +2852,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2258 struct buffer_head *new_bh = NULL; 2852 struct buffer_head *new_bh = NULL;
2259 struct ocfs2_xattr_block *xblk; 2853 struct ocfs2_xattr_block *xblk;
2260 2854
2261 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, 2855 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2262 OCFS2_JOURNAL_ACCESS_CREATE); 2856 inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
2263 if (ret < 0) { 2857 if (ret < 0) {
2264 mlog_errno(ret); 2858 mlog_errno(ret);
2265 goto end; 2859 goto end;
2266 } 2860 }
2267 2861
2268 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2269 &suballoc_bit_start, &num_got, 2863 &suballoc_bit_start, &num_got,
2270 &first_blkno); 2864 &first_blkno);
2271 if (ret < 0) { 2865 if (ret < 0) {
@@ -2276,7 +2870,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2276 new_bh = sb_getblk(inode->i_sb, first_blkno); 2870 new_bh = sb_getblk(inode->i_sb, first_blkno);
2277 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2871 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2278 2872
2279 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), 2873 ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
2280 new_bh, 2874 new_bh,
2281 OCFS2_JOURNAL_ACCESS_CREATE); 2875 OCFS2_JOURNAL_ACCESS_CREATE);
2282 if (ret < 0) { 2876 if (ret < 0) {
@@ -2288,11 +2882,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2288 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2882 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2289 memset(xblk, 0, inode->i_sb->s_blocksize); 2883 memset(xblk, 0, inode->i_sb->s_blocksize);
2290 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2291 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); 2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2292 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2293 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2294 xblk->xb_blkno = cpu_to_le64(first_blkno); 2888 xblk->xb_blkno = cpu_to_le64(first_blkno);
2295
2296 if (indexed) { 2889 if (indexed) {
2297 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2298 xr->xt_clusters = cpu_to_le32(1); 2891 xr->xt_clusters = cpu_to_le32(1);
@@ -2303,14 +2896,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2303 xr->xt_list.l_next_free_rec = cpu_to_le16(1); 2896 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2304 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); 2897 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2305 } 2898 }
2899 ocfs2_journal_dirty(ctxt->handle, new_bh);
2306 2900
2307 ret = ocfs2_journal_dirty(handle, new_bh); 2901 /* Add it to the inode */
2308 if (ret < 0) {
2309 mlog_errno(ret);
2310 goto end;
2311 }
2312 di->i_xattr_loc = cpu_to_le64(first_blkno); 2902 di->i_xattr_loc = cpu_to_le64(first_blkno);
2313 ocfs2_journal_dirty(handle, inode_bh); 2903
2904 spin_lock(&OCFS2_I(inode)->ip_lock);
2905 OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
2906 di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
2907 spin_unlock(&OCFS2_I(inode)->ip_lock);
2908
2909 ocfs2_journal_dirty(ctxt->handle, inode_bh);
2314 2910
2315 *ret_bh = new_bh; 2911 *ret_bh = new_bh;
2316 new_bh = NULL; 2912 new_bh = NULL;
@@ -2332,13 +2928,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2332 struct ocfs2_xattr_set_ctxt *ctxt) 2928 struct ocfs2_xattr_set_ctxt *ctxt)
2333{ 2929{
2334 struct buffer_head *new_bh = NULL; 2930 struct buffer_head *new_bh = NULL;
2335 handle_t *handle = ctxt->handle;
2336 struct ocfs2_xattr_block *xblk = NULL; 2931 struct ocfs2_xattr_block *xblk = NULL;
2337 int ret; 2932 int ret;
2933 struct ocfs2_xa_loc loc;
2338 2934
2339 if (!xs->xattr_bh) { 2935 if (!xs->xattr_bh) {
2340 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, 2936 ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
2341 ctxt->meta_ac, &new_bh, 0); 2937 0, &new_bh);
2342 if (ret) { 2938 if (ret) {
2343 mlog_errno(ret); 2939 mlog_errno(ret);
2344 goto end; 2940 goto end;
@@ -2354,21 +2950,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2354 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2355 2951
2356 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
2357 /* Set extended attribute into external block */ 2953 ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
2358 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2954 xs->not_found ? NULL : xs->here);
2359 OCFS2_HAS_XATTR_FL);
2360 if (!ret || ret != -ENOSPC)
2361 goto end;
2362 2955
2363 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2956 ret = ocfs2_xa_set(&loc, xi, ctxt);
2364 if (ret) 2957 if (!ret)
2958 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC)
2365 goto end; 2960 goto end;
2961 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
2963 if (ret)
2964 goto end;
2965 }
2366 } 2966 }
2367 2967
2368 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); 2968 if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
2969 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
2369 2970
2370end: 2971end:
2371
2372 return ret; 2972 return ret;
2373} 2973}
2374 2974
@@ -2377,7 +2977,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2377 struct ocfs2_xattr_info *xi, 2977 struct ocfs2_xattr_info *xi,
2378 struct ocfs2_xattr_search *xs) 2978 struct ocfs2_xattr_search *xs)
2379{ 2979{
2380 u64 value_size;
2381 struct ocfs2_xattr_entry *last; 2980 struct ocfs2_xattr_entry *last;
2382 int free, i; 2981 int free, i;
2383 size_t min_offs = xs->end - xs->base; 2982 size_t min_offs = xs->end - xs->base;
@@ -2400,13 +2999,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2400 2999
2401 BUG_ON(!xs->not_found); 3000 BUG_ON(!xs->not_found);
2402 3001
2403 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3002 if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
2404 value_size = OCFS2_XATTR_ROOT_SIZE;
2405 else
2406 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2407
2408 if (free >= sizeof(struct ocfs2_xattr_entry) +
2409 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2410 return 1; 3003 return 1;
2411 3004
2412 return 0; 3005 return 0;
@@ -2430,7 +3023,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2430 char *base = NULL; 3023 char *base = NULL;
2431 int name_offset, name_len = 0; 3024 int name_offset, name_len = 0;
2432 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 3025 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2433 xi->value_len); 3026 xi->xi_value_len);
2434 u64 value_size; 3027 u64 value_size;
2435 3028
2436 /* 3029 /*
@@ -2438,14 +3031,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2438 * No matter whether we replace an old one or add a new one, 3031 * No matter whether we replace an old one or add a new one,
2439 * we need this for writing. 3032 * we need this for writing.
2440 */ 3033 */
2441 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3034 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2442 credits += new_clusters * 3035 credits += new_clusters *
2443 ocfs2_clusters_to_blocks(inode->i_sb, 1); 3036 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2444 3037
2445 if (xis->not_found && xbs->not_found) { 3038 if (xis->not_found && xbs->not_found) {
2446 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3039 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2447 3040
2448 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3041 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2449 clusters_add += new_clusters; 3042 clusters_add += new_clusters;
2450 credits += ocfs2_calc_extend_credits(inode->i_sb, 3043 credits += ocfs2_calc_extend_credits(inode->i_sb,
2451 &def_xv.xv.xr_list, 3044 &def_xv.xv.xr_list,
@@ -2490,7 +3083,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2490 * The credits for removing the value tree will be extended 3083 * The credits for removing the value tree will be extended
2491 * by ocfs2_remove_extent itself. 3084 * by ocfs2_remove_extent itself.
2492 */ 3085 */
2493 if (!xi->value) { 3086 if (!xi->xi_value) {
2494 if (!ocfs2_xattr_is_local(xe)) 3087 if (!ocfs2_xattr_is_local(xe))
2495 credits += ocfs2_remove_extent_credits(inode->i_sb); 3088 credits += ocfs2_remove_extent_credits(inode->i_sb);
2496 3089
@@ -2520,7 +3113,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2520 } 3113 }
2521 } 3114 }
2522 3115
2523 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3116 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2524 /* the new values will be stored outside. */ 3117 /* the new values will be stored outside. */
2525 u32 old_clusters = 0; 3118 u32 old_clusters = 0;
2526 3119
@@ -2553,9 +3146,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2553 * value, we don't need any allocation, otherwise we have 3146 * value, we don't need any allocation, otherwise we have
2554 * to guess metadata allocation. 3147 * to guess metadata allocation.
2555 */ 3148 */
2556 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || 3149 if ((ocfs2_xattr_is_local(xe) &&
3150 (value_size >= xi->xi_value_len)) ||
2557 (!ocfs2_xattr_is_local(xe) && 3151 (!ocfs2_xattr_is_local(xe) &&
2558 OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) 3152 OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
2559 goto out; 3153 goto out;
2560 } 3154 }
2561 3155
@@ -2645,7 +3239,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2645 3239
2646 meta_add += extra_meta; 3240 meta_add += extra_meta;
2647 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3241 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2648 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 3242 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
2649 3243
2650 if (meta_add) { 3244 if (meta_add) {
2651 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3245 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2685,7 +3279,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2685{ 3279{
2686 int ret = 0, credits, old_found; 3280 int ret = 0, credits, old_found;
2687 3281
2688 if (!xi->value) { 3282 if (!xi->xi_value) {
2689 /* Remove existing extended attribute */ 3283 /* Remove existing extended attribute */
2690 if (!xis->not_found) 3284 if (!xis->not_found)
2691 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); 3285 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2699,8 +3293,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2699 * If succeed and that extended attribute existing in 3293 * If succeed and that extended attribute existing in
2700 * external block, then we will remove it. 3294 * external block, then we will remove it.
2701 */ 3295 */
2702 xi->value = NULL; 3296 xi->xi_value = NULL;
2703 xi->value_len = 0; 3297 xi->xi_value_len = 0;
2704 3298
2705 old_found = xis->not_found; 3299 old_found = xis->not_found;
2706 xis->not_found = -ENODATA; 3300 xis->not_found = -ENODATA;
@@ -2728,8 +3322,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2728 } else if (ret == -ENOSPC) { 3322 } else if (ret == -ENOSPC) {
2729 if (di->i_xattr_loc && !xbs->xattr_bh) { 3323 if (di->i_xattr_loc && !xbs->xattr_bh) {
2730 ret = ocfs2_xattr_block_find(inode, 3324 ret = ocfs2_xattr_block_find(inode,
2731 xi->name_index, 3325 xi->xi_name_index,
2732 xi->name, xbs); 3326 xi->xi_name, xbs);
2733 if (ret) 3327 if (ret)
2734 goto out; 3328 goto out;
2735 3329
@@ -2768,8 +3362,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2768 * If succeed and that extended attribute 3362 * If succeed and that extended attribute
2769 * existing in inode, we will remove it. 3363 * existing in inode, we will remove it.
2770 */ 3364 */
2771 xi->value = NULL; 3365 xi->xi_value = NULL;
2772 xi->value_len = 0; 3366 xi->xi_value_len = 0;
2773 xbs->not_found = -ENODATA; 3367 xbs->not_found = -ENODATA;
2774 ret = ocfs2_calc_xattr_set_need(inode, 3368 ret = ocfs2_calc_xattr_set_need(inode,
2775 di, 3369 di,
@@ -2835,10 +3429,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
2835 int ret; 3429 int ret;
2836 3430
2837 struct ocfs2_xattr_info xi = { 3431 struct ocfs2_xattr_info xi = {
2838 .name_index = name_index, 3432 .xi_name_index = name_index,
2839 .name = name, 3433 .xi_name = name,
2840 .value = value, 3434 .xi_name_len = strlen(name),
2841 .value_len = value_len, 3435 .xi_value = value,
3436 .xi_value_len = value_len,
2842 }; 3437 };
2843 3438
2844 struct ocfs2_xattr_search xis = { 3439 struct ocfs2_xattr_search xis = {
@@ -2918,10 +3513,11 @@ int ocfs2_xattr_set(struct inode *inode,
2918 struct ocfs2_refcount_tree *ref_tree = NULL; 3513 struct ocfs2_refcount_tree *ref_tree = NULL;
2919 3514
2920 struct ocfs2_xattr_info xi = { 3515 struct ocfs2_xattr_info xi = {
2921 .name_index = name_index, 3516 .xi_name_index = name_index,
2922 .name = name, 3517 .xi_name = name,
2923 .value = value, 3518 .xi_name_len = strlen(name),
2924 .value_len = value_len, 3519 .xi_value = value,
3520 .xi_value_len = value_len,
2925 }; 3521 };
2926 3522
2927 struct ocfs2_xattr_search xis = { 3523 struct ocfs2_xattr_search xis = {
@@ -3765,7 +4361,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3765 struct ocfs2_xattr_bucket *bucket) 4361 struct ocfs2_xattr_bucket *bucket)
3766{ 4362{
3767 int ret, i; 4363 int ret, i;
3768 size_t end, offset, len, value_len; 4364 size_t end, offset, len;
3769 struct ocfs2_xattr_header *xh; 4365 struct ocfs2_xattr_header *xh;
3770 char *entries, *buf, *bucket_buf = NULL; 4366 char *entries, *buf, *bucket_buf = NULL;
3771 u64 blkno = bucket_blkno(bucket); 4367 u64 blkno = bucket_blkno(bucket);
@@ -3819,12 +4415,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3819 end = OCFS2_XATTR_BUCKET_SIZE; 4415 end = OCFS2_XATTR_BUCKET_SIZE;
3820 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { 4416 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
3821 offset = le16_to_cpu(xe->xe_name_offset); 4417 offset = le16_to_cpu(xe->xe_name_offset);
3822 if (ocfs2_xattr_is_local(xe)) 4418 len = namevalue_size_xe(xe);
3823 value_len = OCFS2_XATTR_SIZE(
3824 le64_to_cpu(xe->xe_value_size));
3825 else
3826 value_len = OCFS2_XATTR_ROOT_SIZE;
3827 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
3828 4419
3829 /* 4420 /*
3830 * We must make sure that the name/value pair 4421 * We must make sure that the name/value pair
@@ -4013,7 +4604,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4013 int new_bucket_head) 4604 int new_bucket_head)
4014{ 4605{
4015 int ret, i; 4606 int ret, i;
4016 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 4607 int count, start, len, name_value_len = 0, name_offset = 0;
4017 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; 4608 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
4018 struct ocfs2_xattr_header *xh; 4609 struct ocfs2_xattr_header *xh;
4019 struct ocfs2_xattr_entry *xe; 4610 struct ocfs2_xattr_entry *xe;
@@ -4104,13 +4695,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4104 name_value_len = 0; 4695 name_value_len = 0;
4105 for (i = 0; i < start; i++) { 4696 for (i = 0; i < start; i++) {
4106 xe = &xh->xh_entries[i]; 4697 xe = &xh->xh_entries[i];
4107 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); 4698 name_value_len += namevalue_size_xe(xe);
4108 if (ocfs2_xattr_is_local(xe))
4109 xe_len +=
4110 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4111 else
4112 xe_len += OCFS2_XATTR_ROOT_SIZE;
4113 name_value_len += xe_len;
4114 if (le16_to_cpu(xe->xe_name_offset) < name_offset) 4699 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
4115 name_offset = le16_to_cpu(xe->xe_name_offset); 4700 name_offset = le16_to_cpu(xe->xe_name_offset);
4116 } 4701 }
@@ -4140,12 +4725,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4140 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); 4725 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4141 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 4726 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4142 xe = &xh->xh_entries[i]; 4727 xe = &xh->xh_entries[i];
4143 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
4144 if (ocfs2_xattr_is_local(xe))
4145 xe_len +=
4146 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4147 else
4148 xe_len += OCFS2_XATTR_ROOT_SIZE;
4149 if (le16_to_cpu(xe->xe_name_offset) < 4728 if (le16_to_cpu(xe->xe_name_offset) <
4150 le16_to_cpu(xh->xh_free_start)) 4729 le16_to_cpu(xh->xh_free_start))
4151 xh->xh_free_start = xe->xe_name_offset; 4730 xh->xh_free_start = xe->xe_name_offset;
@@ -4757,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
4757} 5336}
4758 5337
4759/* 5338/*
4760 * Handle the normal xattr set, including replace, delete and new.
4761 *
4762 * Note: "local" indicates the real data's locality. So we can't
4763 * just its bucket locality by its length.
4764 */
4765static void ocfs2_xattr_set_entry_normal(struct inode *inode,
4766 struct ocfs2_xattr_info *xi,
4767 struct ocfs2_xattr_search *xs,
4768 u32 name_hash,
4769 int local)
4770{
4771 struct ocfs2_xattr_entry *last, *xe;
4772 int name_len = strlen(xi->name);
4773 struct ocfs2_xattr_header *xh = xs->header;
4774 u16 count = le16_to_cpu(xh->xh_count), start;
4775 size_t blocksize = inode->i_sb->s_blocksize;
4776 char *val;
4777 size_t offs, size, new_size;
4778
4779 last = &xh->xh_entries[count];
4780 if (!xs->not_found) {
4781 xe = xs->here;
4782 offs = le16_to_cpu(xe->xe_name_offset);
4783 if (ocfs2_xattr_is_local(xe))
4784 size = OCFS2_XATTR_SIZE(name_len) +
4785 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4786 else
4787 size = OCFS2_XATTR_SIZE(name_len) +
4788 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4789
4790 /*
4791 * If the new value will be stored outside, xi->value has been
4792 * initalized as an empty ocfs2_xattr_value_root, and the same
4793 * goes with xi->value_len, so we can set new_size safely here.
4794 * See ocfs2_xattr_set_in_bucket.
4795 */
4796 new_size = OCFS2_XATTR_SIZE(name_len) +
4797 OCFS2_XATTR_SIZE(xi->value_len);
4798
4799 le16_add_cpu(&xh->xh_name_value_len, -size);
4800 if (xi->value) {
4801 if (new_size > size)
4802 goto set_new_name_value;
4803
4804 /* Now replace the old value with new one. */
4805 if (local)
4806 xe->xe_value_size = cpu_to_le64(xi->value_len);
4807 else
4808 xe->xe_value_size = 0;
4809
4810 val = ocfs2_xattr_bucket_get_val(inode,
4811 xs->bucket, offs);
4812 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
4813 size - OCFS2_XATTR_SIZE(name_len));
4814 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
4815 memcpy(val + OCFS2_XATTR_SIZE(name_len),
4816 xi->value, xi->value_len);
4817
4818 le16_add_cpu(&xh->xh_name_value_len, new_size);
4819 ocfs2_xattr_set_local(xe, local);
4820 return;
4821 } else {
4822 /*
4823 * Remove the old entry if there is more than one.
4824 * We don't remove the last entry so that we can
4825 * use it to indicate the hash value of the empty
4826 * bucket.
4827 */
4828 last -= 1;
4829 le16_add_cpu(&xh->xh_count, -1);
4830 if (xh->xh_count) {
4831 memmove(xe, xe + 1,
4832 (void *)last - (void *)xe);
4833 memset(last, 0,
4834 sizeof(struct ocfs2_xattr_entry));
4835 } else
4836 xh->xh_free_start =
4837 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4838
4839 return;
4840 }
4841 } else {
4842 /* find a new entry for insert. */
4843 int low = 0, high = count - 1, tmp;
4844 struct ocfs2_xattr_entry *tmp_xe;
4845
4846 while (low <= high && count) {
4847 tmp = (low + high) / 2;
4848 tmp_xe = &xh->xh_entries[tmp];
4849
4850 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
4851 low = tmp + 1;
4852 else if (name_hash <
4853 le32_to_cpu(tmp_xe->xe_name_hash))
4854 high = tmp - 1;
4855 else {
4856 low = tmp;
4857 break;
4858 }
4859 }
4860
4861 xe = &xh->xh_entries[low];
4862 if (low != count)
4863 memmove(xe + 1, xe, (void *)last - (void *)xe);
4864
4865 le16_add_cpu(&xh->xh_count, 1);
4866 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4867 xe->xe_name_hash = cpu_to_le32(name_hash);
4868 xe->xe_name_len = name_len;
4869 ocfs2_xattr_set_type(xe, xi->name_index);
4870 }
4871
4872set_new_name_value:
4873 /* Insert the new name+value. */
4874 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4875
4876 /*
4877 * We must make sure that the name/value pair
4878 * exists in the same block.
4879 */
4880 offs = le16_to_cpu(xh->xh_free_start);
4881 start = offs - size;
4882
4883 if (start >> inode->i_sb->s_blocksize_bits !=
4884 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4885 offs = offs - offs % blocksize;
4886 xh->xh_free_start = cpu_to_le16(offs);
4887 }
4888
4889 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4890 xe->xe_name_offset = cpu_to_le16(offs - size);
4891
4892 memset(val, 0, size);
4893 memcpy(val, xi->name, name_len);
4894 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4895
4896 xe->xe_value_size = cpu_to_le64(xi->value_len);
4897 ocfs2_xattr_set_local(xe, local);
4898 xs->here = xe;
4899 le16_add_cpu(&xh->xh_free_start, -size);
4900 le16_add_cpu(&xh->xh_name_value_len, size);
4901
4902 return;
4903}
4904
4905/*
4906 * Set the xattr entry in the specified bucket.
4907 * The bucket is indicated by xs->bucket and it should have the enough
4908 * space for the xattr insertion.
4909 */
4910static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4911 handle_t *handle,
4912 struct ocfs2_xattr_info *xi,
4913 struct ocfs2_xattr_search *xs,
4914 u32 name_hash,
4915 int local)
4916{
4917 int ret;
4918 u64 blkno;
4919
4920 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4921 (unsigned long)xi->value_len, xi->name_index,
4922 (unsigned long long)bucket_blkno(xs->bucket));
4923
4924 if (!xs->bucket->bu_bhs[1]) {
4925 blkno = bucket_blkno(xs->bucket);
4926 ocfs2_xattr_bucket_relse(xs->bucket);
4927 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4928 if (ret) {
4929 mlog_errno(ret);
4930 goto out;
4931 }
4932 }
4933
4934 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4935 OCFS2_JOURNAL_ACCESS_WRITE);
4936 if (ret < 0) {
4937 mlog_errno(ret);
4938 goto out;
4939 }
4940
4941 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4942 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4943
4944out:
4945 return ret;
4946}
4947
4948/*
4949 * Truncate the specified xe_off entry in xattr bucket. 5339 * Truncate the specified xe_off entry in xattr bucket.
4950 * bucket is indicated by header_bh and len is the new length. 5340 * bucket is indicated by header_bh and len is the new length.
4951 * Both the ocfs2_xattr_value_root and the entry will be updated here. 5341 * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5015,66 +5405,6 @@ out:
5015 return ret; 5405 return ret;
5016} 5406}
5017 5407
5018static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
5019 struct ocfs2_xattr_search *xs,
5020 int len,
5021 struct ocfs2_xattr_set_ctxt *ctxt)
5022{
5023 int ret, offset;
5024 struct ocfs2_xattr_entry *xe = xs->here;
5025 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
5026
5027 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
5028
5029 offset = xe - xh->xh_entries;
5030 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
5031 offset, len, ctxt);
5032 if (ret)
5033 mlog_errno(ret);
5034
5035 return ret;
5036}
5037
5038static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
5039 handle_t *handle,
5040 struct ocfs2_xattr_search *xs,
5041 char *val,
5042 int value_len)
5043{
5044 int ret, offset, block_off;
5045 struct ocfs2_xattr_value_root *xv;
5046 struct ocfs2_xattr_entry *xe = xs->here;
5047 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5048 void *base;
5049 struct ocfs2_xattr_value_buf vb = {
5050 .vb_access = ocfs2_journal_access,
5051 };
5052
5053 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
5054
5055 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
5056 xe - xh->xh_entries,
5057 &block_off,
5058 &offset);
5059 if (ret) {
5060 mlog_errno(ret);
5061 goto out;
5062 }
5063
5064 base = bucket_block(xs->bucket, block_off);
5065 xv = (struct ocfs2_xattr_value_root *)(base + offset +
5066 OCFS2_XATTR_SIZE(xe->xe_name_len));
5067
5068 vb.vb_xv = xv;
5069 vb.vb_bh = xs->bucket->bu_bhs[block_off];
5070 ret = __ocfs2_xattr_set_value_outside(inode, handle,
5071 &vb, val, value_len);
5072 if (ret)
5073 mlog_errno(ret);
5074out:
5075 return ret;
5076}
5077
5078static int ocfs2_rm_xattr_cluster(struct inode *inode, 5408static int ocfs2_rm_xattr_cluster(struct inode *inode,
5079 struct buffer_head *root_bh, 5409 struct buffer_head *root_bh,
5080 u64 blkno, 5410 u64 blkno,
@@ -5173,128 +5503,6 @@ out:
5173 return ret; 5503 return ret;
5174} 5504}
5175 5505
5176static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
5177 handle_t *handle,
5178 struct ocfs2_xattr_search *xs)
5179{
5180 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5181 struct ocfs2_xattr_entry *last = &xh->xh_entries[
5182 le16_to_cpu(xh->xh_count) - 1];
5183 int ret = 0;
5184
5185 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
5186 OCFS2_JOURNAL_ACCESS_WRITE);
5187 if (ret) {
5188 mlog_errno(ret);
5189 return;
5190 }
5191
5192 /* Remove the old entry. */
5193 memmove(xs->here, xs->here + 1,
5194 (void *)last - (void *)xs->here);
5195 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
5196 le16_add_cpu(&xh->xh_count, -1);
5197
5198 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
5199}
5200
5201/*
5202 * Set the xattr name/value in the bucket specified in xs.
5203 *
5204 * As the new value in xi may be stored in the bucket or in an outside cluster,
5205 * we divide the whole process into 3 steps:
5206 * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
5207 * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
5208 * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
5209 * 4. If the clusters for the new outside value can't be allocated, we need
5210 * to free the xattr we allocated in set.
5211 */
5212static int ocfs2_xattr_set_in_bucket(struct inode *inode,
5213 struct ocfs2_xattr_info *xi,
5214 struct ocfs2_xattr_search *xs,
5215 struct ocfs2_xattr_set_ctxt *ctxt)
5216{
5217 int ret, local = 1;
5218 size_t value_len;
5219 char *val = (char *)xi->value;
5220 struct ocfs2_xattr_entry *xe = xs->here;
5221 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
5222 strlen(xi->name));
5223
5224 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
5225 /*
5226 * We need to truncate the xattr storage first.
5227 *
5228 * If both the old and new value are stored to
5229 * outside block, we only need to truncate
5230 * the storage and then set the value outside.
5231 *
5232 * If the new value should be stored within block,
5233 * we should free all the outside block first and
5234 * the modification to the xattr block will be done
5235 * by following steps.
5236 */
5237 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
5238 value_len = xi->value_len;
5239 else
5240 value_len = 0;
5241
5242 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5243 value_len,
5244 ctxt);
5245 if (ret)
5246 goto out;
5247
5248 if (value_len)
5249 goto set_value_outside;
5250 }
5251
5252 value_len = xi->value_len;
5253 /* So we have to handle the inside block change now. */
5254 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
5255 /*
5256 * If the new value will be stored outside of block,
5257 * initalize a new empty value root and insert it first.
5258 */
5259 local = 0;
5260 xi->value = &def_xv;
5261 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
5262 }
5263
5264 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
5265 name_hash, local);
5266 if (ret) {
5267 mlog_errno(ret);
5268 goto out;
5269 }
5270
5271 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
5272 goto out;
5273
5274 /* allocate the space now for the outside block storage. */
5275 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5276 value_len, ctxt);
5277 if (ret) {
5278 mlog_errno(ret);
5279
5280 if (xs->not_found) {
5281 /*
5282 * We can't allocate enough clusters for outside
5283 * storage and we have allocated xattr already,
5284 * so need to remove it.
5285 */
5286 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
5287 }
5288 goto out;
5289 }
5290
5291set_value_outside:
5292 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5293 xs, val, value_len);
5294out:
5295 return ret;
5296}
5297
5298/* 5506/*
5299 * check whether the xattr bucket is filled up with the same hash value. 5507 * check whether the xattr bucket is filled up with the same hash value.
5300 * If we want to insert the xattr with the same hash, return -ENOSPC. 5508 * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5323,156 +5531,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
5323 return 0; 5531 return 0;
5324} 5532}
5325 5533
5326static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5534/*
5327 struct ocfs2_xattr_info *xi, 5535 * Try to set the entry in the current bucket. If we fail, the caller
5328 struct ocfs2_xattr_search *xs, 5536 * will handle getting us another bucket.
5329 struct ocfs2_xattr_set_ctxt *ctxt) 5537 */
5538static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5539 struct ocfs2_xattr_info *xi,
5540 struct ocfs2_xattr_search *xs,
5541 struct ocfs2_xattr_set_ctxt *ctxt)
5330{ 5542{
5331 struct ocfs2_xattr_header *xh; 5543 int ret;
5332 struct ocfs2_xattr_entry *xe; 5544 struct ocfs2_xa_loc loc;
5333 u16 count, header_size, xh_free_start;
5334 int free, max_free, need, old;
5335 size_t value_size = 0, name_len = strlen(xi->name);
5336 size_t blocksize = inode->i_sb->s_blocksize;
5337 int ret, allocation = 0;
5338
5339 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
5340
5341try_again:
5342 xh = xs->header;
5343 count = le16_to_cpu(xh->xh_count);
5344 xh_free_start = le16_to_cpu(xh->xh_free_start);
5345 header_size = sizeof(struct ocfs2_xattr_header) +
5346 count * sizeof(struct ocfs2_xattr_entry);
5347 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5348 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5349
5350 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5351 "of %u which exceed block size\n",
5352 (unsigned long long)bucket_blkno(xs->bucket),
5353 header_size);
5354 5545
5355 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5546 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
5356 value_size = OCFS2_XATTR_ROOT_SIZE;
5357 else if (xi->value)
5358 value_size = OCFS2_XATTR_SIZE(xi->value_len);
5359 5547
5360 if (xs->not_found) 5548 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5361 need = sizeof(struct ocfs2_xattr_entry) + 5549 xs->not_found ? NULL : xs->here);
5362 OCFS2_XATTR_SIZE(name_len) + value_size; 5550 ret = ocfs2_xa_set(&loc, xi, ctxt);
5363 else { 5551 if (!ret) {
5364 need = value_size + OCFS2_XATTR_SIZE(name_len); 5552 xs->here = loc.xl_entry;
5553 goto out;
5554 }
5555 if (ret != -ENOSPC) {
5556 mlog_errno(ret);
5557 goto out;
5558 }
5365 5559
5366 /* 5560 /* Ok, we need space. Let's try defragmenting the bucket. */
5367 * We only replace the old value if the new length is smaller 5561 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5368 * than the old one. Otherwise we will allocate new space in the 5562 xs->bucket);
5369 * bucket to store it. 5563 if (ret) {
5370 */ 5564 mlog_errno(ret);
5371 xe = xs->here; 5565 goto out;
5372 if (ocfs2_xattr_is_local(xe)) 5566 }
5373 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
5374 else
5375 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
5376 5567
5377 if (old >= value_size) 5568 ret = ocfs2_xa_set(&loc, xi, ctxt);
5378 need = 0; 5569 if (!ret) {
5570 xs->here = loc.xl_entry;
5571 goto out;
5379 } 5572 }
5573 if (ret != -ENOSPC)
5574 mlog_errno(ret);
5380 5575
5381 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5382 /*
5383 * We need to make sure the new name/value pair
5384 * can exist in the same block.
5385 */
5386 if (xh_free_start % blocksize < need)
5387 free -= xh_free_start % blocksize;
5388
5389 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
5390 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
5391 " %u\n", xs->not_found,
5392 (unsigned long long)bucket_blkno(xs->bucket),
5393 free, need, max_free, le16_to_cpu(xh->xh_free_start),
5394 le16_to_cpu(xh->xh_name_value_len));
5395
5396 if (free < need ||
5397 (xs->not_found &&
5398 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
5399 if (need <= max_free &&
5400 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
5401 /*
5402 * We can create the space by defragment. Since only the
5403 * name/value will be moved, the xe shouldn't be changed
5404 * in xs.
5405 */
5406 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5407 xs->bucket);
5408 if (ret) {
5409 mlog_errno(ret);
5410 goto out;
5411 }
5412 5576
5413 xh_free_start = le16_to_cpu(xh->xh_free_start); 5577out:
5414 free = xh_free_start - header_size 5578 mlog_exit(ret);
5415 - OCFS2_XATTR_HEADER_GAP; 5579 return ret;
5416 if (xh_free_start % blocksize < need) 5580}
5417 free -= xh_free_start % blocksize;
5418 5581
5419 if (free >= need) 5582static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5420 goto xattr_set; 5583 struct ocfs2_xattr_info *xi,
5584 struct ocfs2_xattr_search *xs,
5585 struct ocfs2_xattr_set_ctxt *ctxt)
5586{
5587 int ret;
5421 5588
5422 mlog(0, "Can't get enough space for xattr insert by " 5589 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
5423 "defragment. Need %u bytes, but we have %d, so "
5424 "allocate new bucket for it.\n", need, free);
5425 }
5426 5590
5427 /* 5591 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5428 * We have to add new buckets or clusters and one 5592 if (!ret)
5429 * allocation should leave us enough space for insert. 5593 goto out;
5430 */ 5594 if (ret != -ENOSPC) {
5431 BUG_ON(allocation); 5595 mlog_errno(ret);
5596 goto out;
5597 }
5432 5598
5433 /* 5599 /* Ack, need more space. Let's try to get another bucket! */
5434 * We do not allow for overlapping ranges between buckets. And
5435 * the maximum number of collisions we will allow for then is
5436 * one bucket's worth, so check it here whether we need to
5437 * add a new bucket for the insert.
5438 */
5439 ret = ocfs2_check_xattr_bucket_collision(inode,
5440 xs->bucket,
5441 xi->name);
5442 if (ret) {
5443 mlog_errno(ret);
5444 goto out;
5445 }
5446 5600
5447 ret = ocfs2_add_new_xattr_bucket(inode, 5601 /*
5448 xs->xattr_bh, 5602 * We do not allow for overlapping ranges between buckets. And
5603 * the maximum number of collisions we will allow for then is
5604 * one bucket's worth, so check it here whether we need to
5605 * add a new bucket for the insert.
5606 */
5607 ret = ocfs2_check_xattr_bucket_collision(inode,
5449 xs->bucket, 5608 xs->bucket,
5450 ctxt); 5609 xi->xi_name);
5451 if (ret) { 5610 if (ret) {
5452 mlog_errno(ret); 5611 mlog_errno(ret);
5453 goto out; 5612 goto out;
5454 } 5613 }
5455 5614
5456 /* 5615 ret = ocfs2_add_new_xattr_bucket(inode,
5457 * ocfs2_add_new_xattr_bucket() will have updated 5616 xs->xattr_bh,
5458 * xs->bucket if it moved, but it will not have updated 5617 xs->bucket,
5459 * any of the other search fields. Thus, we drop it and 5618 ctxt);
5460 * re-search. Everything should be cached, so it'll be 5619 if (ret) {
5461 * quick. 5620 mlog_errno(ret);
5462 */ 5621 goto out;
5463 ocfs2_xattr_bucket_relse(xs->bucket);
5464 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5465 xi->name_index,
5466 xi->name, xs);
5467 if (ret && ret != -ENODATA)
5468 goto out;
5469 xs->not_found = ret;
5470 allocation = 1;
5471 goto try_again;
5472 } 5622 }
5473 5623
5474xattr_set: 5624 /*
5475 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); 5625 * ocfs2_add_new_xattr_bucket() will have updated
5626 * xs->bucket if it moved, but it will not have updated
5627 * any of the other search fields. Thus, we drop it and
5628 * re-search. Everything should be cached, so it'll be
5629 * quick.
5630 */
5631 ocfs2_xattr_bucket_relse(xs->bucket);
5632 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5633 xi->xi_name_index,
5634 xi->xi_name, xs);
5635 if (ret && ret != -ENODATA)
5636 goto out;
5637 xs->not_found = ret;
5638
5639 /* Ok, we have a new bucket, let's try again */
5640 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5641 if (ret && (ret != -ENOSPC))
5642 mlog_errno(ret);
5643
5476out: 5644out:
5477 mlog_exit(ret); 5645 mlog_exit(ret);
5478 return ret; 5646 return ret;
@@ -5684,7 +5852,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5684 * refcount tree, and make the original extent become 3. So we will need 5852 * refcount tree, and make the original extent become 3. So we will need
5685 * 2 * cluster more extent recs at most. 5853 * 2 * cluster more extent recs at most.
5686 */ 5854 */
5687 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { 5855 if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
5688 5856
5689 ret = ocfs2_refcounted_xattr_delete_need(inode, 5857 ret = ocfs2_refcounted_xattr_delete_need(inode,
5690 &(*ref_tree)->rf_ci, 5858 &(*ref_tree)->rf_ci,
@@ -6066,7 +6234,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
6066 * to the extent block, so just calculate a maximum record num. 6234 * to the extent block, so just calculate a maximum record num.
6067 */ 6235 */
6068 if (!xv->xr_list.l_tree_depth) 6236 if (!xv->xr_list.l_tree_depth)
6069 *num_recs += xv->xr_list.l_next_free_rec; 6237 *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec);
6070 else 6238 else
6071 *num_recs += ocfs2_clusters_for_bytes(sb, 6239 *num_recs += ocfs2_clusters_for_bytes(sb,
6072 XATTR_SIZE_MAX); 6240 XATTR_SIZE_MAX);
@@ -6360,33 +6528,33 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6360 int indexed) 6528 int indexed)
6361{ 6529{
6362 int ret; 6530 int ret;
6363 handle_t *handle;
6364 struct ocfs2_alloc_context *meta_ac;
6365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6531 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6532 struct ocfs2_xattr_set_ctxt ctxt;
6366 6533
6367 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6534 memset(&ctxt, 0, sizeof(ctxt));
6535 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6368 if (ret < 0) { 6536 if (ret < 0) {
6369 mlog_errno(ret); 6537 mlog_errno(ret);
6370 return ret; 6538 return ret;
6371 } 6539 }
6372 6540
6373 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); 6541 ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6374 if (IS_ERR(handle)) { 6542 if (IS_ERR(ctxt.handle)) {
6375 ret = PTR_ERR(handle); 6543 ret = PTR_ERR(ctxt.handle);
6376 mlog_errno(ret); 6544 mlog_errno(ret);
6377 goto out; 6545 goto out;
6378 } 6546 }
6379 6547
6380 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6548 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6381 (unsigned long long)fe_bh->b_blocknr, indexed); 6549 (unsigned long long)fe_bh->b_blocknr, indexed);
6382 ret = ocfs2_create_xattr_block(handle, inode, fe_bh, 6550 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6383 meta_ac, ret_bh, indexed); 6551 ret_bh);
6384 if (ret) 6552 if (ret)
6385 mlog_errno(ret); 6553 mlog_errno(ret);
6386 6554
6387 ocfs2_commit_trans(osb, handle); 6555 ocfs2_commit_trans(osb, ctxt.handle);
6388out: 6556out:
6389 ocfs2_free_alloc_context(meta_ac); 6557 ocfs2_free_alloc_context(ctxt.meta_ac);
6390 return ret; 6558 return ret;
6391} 6559}
6392 6560
@@ -6978,9 +7146,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
6978 7146
6979 ret = ocfs2_init_security_get(inode, dir, &si); 7147 ret = ocfs2_init_security_get(inode, dir, &si);
6980 if (!ret) { 7148 if (!ret) {
6981 ret = ocfs2_xattr_security_set(inode, si.name, 7149 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
6982 si.value, si.value_len, 7150 si.name, si.value, si.value_len,
6983 XATTR_CREATE); 7151 XATTR_CREATE);
6984 if (ret) { 7152 if (ret) {
6985 mlog_errno(ret); 7153 mlog_errno(ret);
6986 goto leave; 7154 goto leave;
@@ -7008,9 +7176,9 @@ leave:
7008/* 7176/*
7009 * 'security' attributes support 7177 * 'security' attributes support
7010 */ 7178 */
7011static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, 7179static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
7012 size_t list_size, const char *name, 7180 size_t list_size, const char *name,
7013 size_t name_len) 7181 size_t name_len, int type)
7014{ 7182{
7015 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; 7183 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
7016 const size_t total_len = prefix_len + name_len + 1; 7184 const size_t total_len = prefix_len + name_len + 1;
@@ -7023,23 +7191,23 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
7023 return total_len; 7191 return total_len;
7024} 7192}
7025 7193
7026static int ocfs2_xattr_security_get(struct inode *inode, const char *name, 7194static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
7027 void *buffer, size_t size) 7195 void *buffer, size_t size, int type)
7028{ 7196{
7029 if (strcmp(name, "") == 0) 7197 if (strcmp(name, "") == 0)
7030 return -EINVAL; 7198 return -EINVAL;
7031 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, 7199 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
7032 buffer, size); 7200 name, buffer, size);
7033} 7201}
7034 7202
7035static int ocfs2_xattr_security_set(struct inode *inode, const char *name, 7203static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7036 const void *value, size_t size, int flags) 7204 const void *value, size_t size, int flags, int type)
7037{ 7205{
7038 if (strcmp(name, "") == 0) 7206 if (strcmp(name, "") == 0)
7039 return -EINVAL; 7207 return -EINVAL;
7040 7208
7041 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, 7209 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
7042 size, flags); 7210 name, value, size, flags);
7043} 7211}
7044 7212
7045int ocfs2_init_security_get(struct inode *inode, 7213int ocfs2_init_security_get(struct inode *inode,
@@ -7076,9 +7244,9 @@ struct xattr_handler ocfs2_xattr_security_handler = {
7076/* 7244/*
7077 * 'trusted' attributes support 7245 * 'trusted' attributes support
7078 */ 7246 */
7079static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, 7247static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
7080 size_t list_size, const char *name, 7248 size_t list_size, const char *name,
7081 size_t name_len) 7249 size_t name_len, int type)
7082{ 7250{
7083 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; 7251 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7084 const size_t total_len = prefix_len + name_len + 1; 7252 const size_t total_len = prefix_len + name_len + 1;
@@ -7091,23 +7259,23 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
7091 return total_len; 7259 return total_len;
7092} 7260}
7093 7261
7094static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, 7262static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
7095 void *buffer, size_t size) 7263 void *buffer, size_t size, int type)
7096{ 7264{
7097 if (strcmp(name, "") == 0) 7265 if (strcmp(name, "") == 0)
7098 return -EINVAL; 7266 return -EINVAL;
7099 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, 7267 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
7100 buffer, size); 7268 name, buffer, size);
7101} 7269}
7102 7270
7103static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, 7271static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
7104 const void *value, size_t size, int flags) 7272 const void *value, size_t size, int flags, int type)
7105{ 7273{
7106 if (strcmp(name, "") == 0) 7274 if (strcmp(name, "") == 0)
7107 return -EINVAL; 7275 return -EINVAL;
7108 7276
7109 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, 7277 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
7110 size, flags); 7278 name, value, size, flags);
7111} 7279}
7112 7280
7113struct xattr_handler ocfs2_xattr_trusted_handler = { 7281struct xattr_handler ocfs2_xattr_trusted_handler = {
@@ -7120,13 +7288,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
7120/* 7288/*
7121 * 'user' attributes support 7289 * 'user' attributes support
7122 */ 7290 */
7123static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, 7291static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
7124 size_t list_size, const char *name, 7292 size_t list_size, const char *name,
7125 size_t name_len) 7293 size_t name_len, int type)
7126{ 7294{
7127 const size_t prefix_len = XATTR_USER_PREFIX_LEN; 7295 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
7128 const size_t total_len = prefix_len + name_len + 1; 7296 const size_t total_len = prefix_len + name_len + 1;
7129 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7297 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7130 7298
7131 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7299 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7132 return 0; 7300 return 0;
@@ -7139,31 +7307,31 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
7139 return total_len; 7307 return total_len;
7140} 7308}
7141 7309
7142static int ocfs2_xattr_user_get(struct inode *inode, const char *name, 7310static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
7143 void *buffer, size_t size) 7311 void *buffer, size_t size, int type)
7144{ 7312{
7145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7313 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7146 7314
7147 if (strcmp(name, "") == 0) 7315 if (strcmp(name, "") == 0)
7148 return -EINVAL; 7316 return -EINVAL;
7149 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7317 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7150 return -EOPNOTSUPP; 7318 return -EOPNOTSUPP;
7151 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, 7319 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
7152 buffer, size); 7320 buffer, size);
7153} 7321}
7154 7322
7155static int ocfs2_xattr_user_set(struct inode *inode, const char *name, 7323static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
7156 const void *value, size_t size, int flags) 7324 const void *value, size_t size, int flags, int type)
7157{ 7325{
7158 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7326 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7159 7327
7160 if (strcmp(name, "") == 0) 7328 if (strcmp(name, "") == 0)
7161 return -EINVAL; 7329 return -EINVAL;
7162 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7330 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7163 return -EOPNOTSUPP; 7331 return -EOPNOTSUPP;
7164 7332
7165 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, 7333 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
7166 size, flags); 7334 name, value, size, flags);
7167} 7335}
7168 7336
7169struct xattr_handler ocfs2_xattr_user_handler = { 7337struct xattr_handler ocfs2_xattr_user_handler = {
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 08e36389f56d..abd72a47f520 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info {
40extern struct xattr_handler ocfs2_xattr_user_handler; 40extern struct xattr_handler ocfs2_xattr_user_handler;
41extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler; 42extern struct xattr_handler ocfs2_xattr_security_handler;
43#ifdef CONFIG_OCFS2_FS_POSIX_ACL
44extern struct xattr_handler ocfs2_xattr_acl_access_handler; 43extern struct xattr_handler ocfs2_xattr_acl_access_handler;
45extern struct xattr_handler ocfs2_xattr_acl_default_handler; 44extern struct xattr_handler ocfs2_xattr_acl_default_handler;
46#endif
47extern struct xattr_handler *ocfs2_xattr_handlers[]; 45extern struct xattr_handler *ocfs2_xattr_handlers[];
48 46
49ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);