aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig35
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c75
-rw-r--r--fs/autofs4/expire.c4
-rw-r--r--fs/autofs4/inode.c18
-rw-r--r--fs/autofs4/waitq.c8
-rw-r--r--fs/bfs/inode.c45
-rw-r--r--fs/binfmt_elf.c12
-rw-r--r--fs/binfmt_misc.c5
-rw-r--r--fs/bio.c36
-rw-r--r--fs/block_dev.c16
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/coda/file.c12
-rw-r--r--fs/coda/sysctl.c5
-rw-r--r--fs/compat.c6
-rw-r--r--fs/configfs/inode.c3
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/dcache.c14
-rw-r--r--fs/debugfs/file.c32
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/ast.c56
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/debug_fs.c310
-rw-r--r--fs/dlm/dir.c18
-rw-r--r--fs/dlm/dlm_internal.h4
-rw-r--r--fs/dlm/lock.c31
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/dlm/memory.c6
-rw-r--r--fs/dlm/midcomms.c2
-rw-r--r--fs/dlm/netlink.c1
-rw-r--r--fs/dlm/user.c4
-rw-r--r--fs/dlm/user.h2
-rw-r--r--fs/dquot.c438
-rw-r--r--fs/ecryptfs/crypto.c514
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h105
-rw-r--r--fs/ecryptfs/file.c45
-rw-r--r--fs/ecryptfs/inode.c300
-rw-r--r--fs/ecryptfs/keystore.c651
-rw-r--r--fs/ecryptfs/main.c126
-rw-r--r--fs/ecryptfs/messaging.c4
-rw-r--r--fs/ecryptfs/miscdev.c18
-rw-r--r--fs/exec.c34
-rw-r--r--fs/ext2/ialloc.c8
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/ioctl.c3
-rw-r--r--fs/ext2/super.c10
-rw-r--r--fs/ext3/ialloc.c8
-rw-r--r--fs/ext3/ioctl.c3
-rw-r--r--fs/ext3/namei.c4
-rw-r--r--fs/ext3/super.c26
-rw-r--r--fs/ext4/ext4.h6
-rw-r--r--fs/ext4/extents.c2
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/ext4/super.c15
-rw-r--r--fs/filesystems.c23
-rw-r--r--fs/fs-writeback.c92
-rw-r--r--fs/fuse/control.c6
-rw-r--r--fs/fuse/dev.c113
-rw-r--r--fs/fuse/dir.c48
-rw-r--r--fs/fuse/file.c457
-rw-r--r--fs/fuse/fuse_i.h83
-rw-r--r--fs/fuse/inode.c157
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/bmap.c77
-rw-r--r--fs/gfs2/bmap.h34
-rw-r--r--fs/gfs2/daemon.c136
-rw-r--r--fs/gfs2/daemon.h17
-rw-r--r--fs/gfs2/dir.c62
-rw-r--r--fs/gfs2/dir.h1
-rw-r--r--fs/gfs2/eattr.c40
-rw-r--r--fs/gfs2/glock.c303
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c56
-rw-r--r--fs/gfs2/incore.h55
-rw-r--r--fs/gfs2/inode.c53
-rw-r--r--fs/gfs2/inode.h13
-rw-r--r--fs/gfs2/locking/dlm/mount.c12
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c16
-rw-r--r--fs/gfs2/main.c15
-rw-r--r--fs/gfs2/mount.c29
-rw-r--r--fs/gfs2/ops_address.c33
-rw-r--r--fs/gfs2/ops_dentry.c2
-rw-r--r--fs/gfs2/ops_dentry.h17
-rw-r--r--fs/gfs2/ops_export.c5
-rw-r--r--fs/gfs2/ops_file.c24
-rw-r--r--fs/gfs2/ops_fstype.c125
-rw-r--r--fs/gfs2/ops_fstype.h19
-rw-r--r--fs/gfs2/ops_inode.c75
-rw-r--r--fs/gfs2/ops_inode.h25
-rw-r--r--fs/gfs2/ops_super.c149
-rw-r--r--fs/gfs2/ops_super.h17
-rw-r--r--fs/gfs2/quota.c113
-rw-r--r--fs/gfs2/quota.h24
-rw-r--r--fs/gfs2/recovery.c48
-rw-r--r--fs/gfs2/recovery.h14
-rw-r--r--fs/gfs2/rgrp.c58
-rw-r--r--fs/gfs2/super.c246
-rw-r--r--fs/gfs2/super.h13
-rw-r--r--fs/gfs2/sys.c66
-rw-r--r--fs/gfs2/sys.h4
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/hugetlbfs/inode.c13
-rw-r--r--fs/inode.c34
-rw-r--r--fs/ioctl.c44
-rw-r--r--fs/isofs/inode.c6
-rw-r--r--fs/jbd/commit.c15
-rw-r--r--fs/jbd/transaction.c39
-rw-r--r--fs/jbd2/commit.c9
-rw-r--r--fs/jbd2/journal.c19
-rw-r--r--fs/jbd2/transaction.c47
-rw-r--r--fs/jfs/jfs_imap.c10
-rw-r--r--fs/libfs.c5
-rw-r--r--fs/lockd/clntproc.c7
-rw-r--r--fs/lockd/host.c170
-rw-r--r--fs/lockd/mon.c569
-rw-r--r--fs/lockd/svc.c72
-rw-r--r--fs/lockd/svc4proc.c13
-rw-r--r--fs/lockd/svcproc.c13
-rw-r--r--fs/lockd/svcsubs.c1
-rw-r--r--fs/lockd/xdr.c5
-rw-r--r--fs/lockd/xdr4.c5
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/mpage.c6
-rw-r--r--fs/namei.c32
-rw-r--r--fs/ncpfs/getopt.c1
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/nfs4callback.c3
-rw-r--r--fs/nfsd/nfs4proc.c5
-rw-r--r--fs/nfsd/nfs4recover.c2
-rw-r--r--fs/nfsd/nfs4state.c79
-rw-r--r--fs/nfsd/nfs4xdr.c2
-rw-r--r--fs/nfsd/nfsctl.c479
-rw-r--r--fs/nfsd/nfsfh.c36
-rw-r--r--fs/nfsd/nfsproc.c1
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ocfs2/Makefile7
-rw-r--r--fs/ocfs2/acl.c479
-rw-r--r--fs/ocfs2/acl.h58
-rw-r--r--fs/ocfs2/alloc.c712
-rw-r--r--fs/ocfs2/alloc.h30
-rw-r--r--fs/ocfs2/aops.c59
-rw-r--r--fs/ocfs2/blockcheck.c477
-rw-r--r--fs/ocfs2/blockcheck.h82
-rw-r--r--fs/ocfs2/buffer_head_io.c32
-rw-r--r--fs/ocfs2/buffer_head_io.h27
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/dir.c399
-rw-r--r--fs/ocfs2/dir.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c52
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c53
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c42
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlmglue.c172
-rw-r--r--fs/ocfs2/dlmglue.h19
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h24
-rw-r--r--fs/ocfs2/file.c211
-rw-r--r--fs/ocfs2/file.h3
-rw-r--r--fs/ocfs2/inode.c175
-rw-r--r--fs/ocfs2/inode.h18
-rw-r--r--fs/ocfs2/journal.c364
-rw-r--r--fs/ocfs2/journal.h128
-rw-r--r--fs/ocfs2/localalloc.c26
-rw-r--r--fs/ocfs2/namei.c318
-rw-r--r--fs/ocfs2/ocfs2.h46
-rw-r--r--fs/ocfs2/ocfs2_fs.h213
-rw-r--r--fs/ocfs2/ocfs2_jbd_compat.h82
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/quota.h119
-rw-r--r--fs/ocfs2/quota_global.c1025
-rw-r--r--fs/ocfs2/quota_local.c1253
-rw-r--r--fs/ocfs2/resize.c76
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/suballoc.c363
-rw-r--r--fs/ocfs2/suballoc.h18
-rw-r--r--fs/ocfs2/super.c328
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/xattr.c2984
-rw-r--r--fs/ocfs2/xattr.h45
-rw-r--r--fs/omfs/inode.c1
-rw-r--r--fs/open.c2
-rw-r--r--fs/openpromfs/inode.c3
-rw-r--r--fs/partitions/check.c11
-rw-r--r--fs/proc/base.c235
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c1
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/quota.c11
-rw-r--r--fs/quota_tree.c645
-rw-r--r--fs/quota_tree.h25
-rw-r--r--fs/quota_v1.c28
-rw-r--r--fs/quota_v2.c631
-rw-r--r--fs/quotaio_v1.h33
-rw-r--r--fs/quotaio_v2.h60
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c13
-rw-r--r--fs/reiserfs/inode.c13
-rw-r--r--fs/reiserfs/super.c10
-rw-r--r--fs/romfs/inode.c13
-rw-r--r--fs/select.c76
-rw-r--r--fs/splice.c1
-rw-r--r--fs/stat.c2
-rw-r--r--fs/super.c10
-rw-r--r--fs/sync.c50
-rw-r--r--fs/sysfs/inode.c3
-rw-r--r--fs/ubifs/Kconfig2
-rw-r--r--fs/ubifs/budget.c4
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/journal.c2
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/super.c9
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c2
237 files changed, 14750 insertions, 5711 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index ff0e81980207..32883589ee54 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
189 select CONFIGFS_FS 189 select CONFIGFS_FS
190 select JBD2 190 select JBD2
191 select CRC32 191 select CRC32
192 select QUOTA
193 select QUOTA_TREE
192 help 194 help
193 OCFS2 is a general purpose extent based shared disk cluster file 195 OCFS2 is a general purpose extent based shared disk cluster file
194 system with many similarities to ext3. It supports 64 bit inode 196 system with many similarities to ext3. It supports 64 bit inode
@@ -258,15 +260,14 @@ config OCFS2_DEBUG_FS
258 this option for debugging only as it is likely to decrease 260 this option for debugging only as it is likely to decrease
259 performance of the filesystem. 261 performance of the filesystem.
260 262
261config OCFS2_COMPAT_JBD 263config OCFS2_FS_POSIX_ACL
262 bool "Use JBD for compatibility" 264 bool "OCFS2 POSIX Access Control Lists"
263 depends on OCFS2_FS 265 depends on OCFS2_FS
266 select FS_POSIX_ACL
264 default n 267 default n
265 select JBD
266 help 268 help
267 The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 269 Posix Access Control Lists (ACLs) support permissions for users and
268 is backwards compatible with JBD. It is safe to say N here. 270 groups beyond the owner/group/world scheme.
269 However, if you really want to use the original JBD, say Y here.
270 271
271endif # BLOCK 272endif # BLOCK
272 273
@@ -303,6 +304,10 @@ config PRINT_QUOTA_WARNING
303 Note that this behavior is currently deprecated and may go away in 304 Note that this behavior is currently deprecated and may go away in
304 future. Please use notification via netlink socket instead. 305 future. Please use notification via netlink socket instead.
305 306
307# Generic support for tree structured quota files. Seleted when needed.
308config QUOTA_TREE
309 tristate
310
306config QFMT_V1 311config QFMT_V1
307 tristate "Old quota format support" 312 tristate "Old quota format support"
308 depends on QUOTA 313 depends on QUOTA
@@ -314,6 +319,7 @@ config QFMT_V1
314config QFMT_V2 319config QFMT_V2
315 tristate "Quota format v2 support" 320 tristate "Quota format v2 support"
316 depends on QUOTA 321 depends on QUOTA
322 select QUOTA_TREE
317 help 323 help
318 This quota format allows using quotas with 32-bit UIDs/GIDs. If you 324 This quota format allows using quotas with 32-bit UIDs/GIDs. If you
319 need this functionality say Y here. 325 need this functionality say Y here.
@@ -715,7 +721,20 @@ config CONFIGFS_FS
715 721
716endmenu 722endmenu
717 723
718menu "Miscellaneous filesystems" 724menuconfig MISC_FILESYSTEMS
725 bool "Miscellaneous filesystems"
726 default y
727 ---help---
728 Say Y here to get to see options for various miscellaneous
729 filesystems, such as filesystems that came from other
730 operating systems.
731
732 This option alone does not add any kernel code.
733
734 If you say N, all options in this submenu will be skipped and
735 disabled; if unsure, say Y here.
736
737if MISC_FILESYSTEMS
719 738
720config ADFS_FS 739config ADFS_FS
721 tristate "ADFS file system support (EXPERIMENTAL)" 740 tristate "ADFS file system support (EXPERIMENTAL)"
@@ -1085,7 +1104,7 @@ config UFS_DEBUG
1085 Y here. This will result in _many_ additional debugging messages to be 1104 Y here. This will result in _many_ additional debugging messages to be
1086 written to the system log. 1105 written to the system log.
1087 1106
1088endmenu 1107endif # MISC_FILESYSTEMS
1089 1108
1090menuconfig NETWORK_FILESYSTEMS 1109menuconfig NETWORK_FILESYSTEMS
1091 bool "Network File Systems" 1110 bool "Network File Systems"
diff --git a/fs/Makefile b/fs/Makefile
index e6f423d1d228..c830611550d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
54obj-$(CONFIG_QUOTA) += dquot.o 54obj-$(CONFIG_QUOTA) += dquot.o
55obj-$(CONFIG_QFMT_V1) += quota_v1.o 55obj-$(CONFIG_QFMT_V1) += quota_v1.o
56obj-$(CONFIG_QFMT_V2) += quota_v2.o 56obj-$(CONFIG_QFMT_V2) += quota_v2.o
57obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
57obj-$(CONFIG_QUOTACTL) += quota.o 58obj-$(CONFIG_QUOTACTL) += quota.o
58 59
59obj-$(CONFIG_PROC_FS) += proc/ 60obj-$(CONFIG_PROC_FS) += proc/
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
119 goto bad_inode; 119 goto bad_inode;
120#else 120#else
121 inode->i_mode |= S_IFDIR; 121 inode->i_mode |= S_IFDIR;
122 inode->i_op = NULL; 122 /* ... and leave ->i_op and ->i_fop pointing to empty */
123 inode->i_fop = NULL;
124 break; 123 break;
125#endif 124#endif
126 case ST_LINKFILE: 125 case ST_LINKFILE:
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
251 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 251 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
252 inode->i_nlink = 2; 252 inode->i_nlink = 2;
253 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 253 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
254 inode->i_blocks = 0;
255 254
256 if (ino == AUTOFS_ROOT_INO) { 255 if (ino == AUTOFS_ROOT_INO) {
257 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 256 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
258 inode->i_op = &autofs_root_inode_operations; 257 inode->i_op = &autofs_root_inode_operations;
259 inode->i_fop = &autofs_root_operations; 258 inode->i_fop = &autofs_root_operations;
260 inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
261 goto done; 259 goto done;
262 } 260 }
263 261
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
25#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) 25#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
26#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) 26#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
27 27
28#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
29
30#include <linux/kernel.h> 28#include <linux/kernel.h>
31#include <linux/slab.h> 29#include <linux/slab.h>
32#include <linux/time.h> 30#include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
124 124
125/* 125/*
126 * Check sanity of parameter control fields and if a path is present 126 * Check sanity of parameter control fields and if a path is present
127 * check that it has a "/" and is terminated. 127 * check that it is terminated and contains at least one "/".
128 */ 128 */
129static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) 129static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
130{ 130{
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
138 } 138 }
139 139
140 if (param->size > sizeof(*param)) { 140 if (param->size > sizeof(*param)) {
141 err = check_name(param->path); 141 err = invalid_str(param->path,
142 (void *) ((size_t) param + param->size));
142 if (err) { 143 if (err) {
143 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", 144 AUTOFS_WARN(
144 cmd); 145 "path string terminator missing for cmd(0x%08x)",
146 cmd);
145 goto out; 147 goto out;
146 } 148 }
147 149
148 err = invalid_str(param->path, 150 err = check_name(param->path);
149 (void *) ((size_t) param + param->size));
150 if (err) { 151 if (err) {
151 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", 152 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
152 cmd); 153 cmd);
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
180 struct autofs_sb_info *sbi, 181 struct autofs_sb_info *sbi,
181 struct autofs_dev_ioctl *param) 182 struct autofs_dev_ioctl *param)
182{ 183{
183 param->arg1 = sbi->version; 184 param->protover.version = sbi->version;
184 return 0; 185 return 0;
185} 186}
186 187
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
189 struct autofs_sb_info *sbi, 190 struct autofs_sb_info *sbi,
190 struct autofs_dev_ioctl *param) 191 struct autofs_dev_ioctl *param)
191{ 192{
192 param->arg1 = sbi->sub_version; 193 param->protosubver.sub_version = sbi->sub_version;
193 return 0; 194 return 0;
194} 195}
195 196
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
335 int err, fd; 336 int err, fd;
336 337
337 /* param->path has already been checked */ 338 /* param->path has already been checked */
338 if (!param->arg1) 339 if (!param->openmount.devid)
339 return -EINVAL; 340 return -EINVAL;
340 341
341 param->ioctlfd = -1; 342 param->ioctlfd = -1;
342 343
343 path = param->path; 344 path = param->path;
344 devid = param->arg1; 345 devid = param->openmount.devid;
345 346
346 err = 0; 347 err = 0;
347 fd = autofs_dev_ioctl_open_mountpoint(path, devid); 348 fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
373{ 374{
374 autofs_wqt_t token; 375 autofs_wqt_t token;
375 376
376 token = (autofs_wqt_t) param->arg1; 377 token = (autofs_wqt_t) param->ready.token;
377 return autofs4_wait_release(sbi, token, 0); 378 return autofs4_wait_release(sbi, token, 0);
378} 379}
379 380
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
388 autofs_wqt_t token; 389 autofs_wqt_t token;
389 int status; 390 int status;
390 391
391 token = (autofs_wqt_t) param->arg1; 392 token = (autofs_wqt_t) param->fail.token;
392 status = param->arg2 ? param->arg2 : -ENOENT; 393 status = param->fail.status ? param->fail.status : -ENOENT;
393 return autofs4_wait_release(sbi, token, status); 394 return autofs4_wait_release(sbi, token, status);
394} 395}
395 396
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
412 int pipefd; 413 int pipefd;
413 int err = 0; 414 int err = 0;
414 415
415 if (param->arg1 == -1) 416 if (param->setpipefd.pipefd == -1)
416 return -EINVAL; 417 return -EINVAL;
417 418
418 pipefd = param->arg1; 419 pipefd = param->setpipefd.pipefd;
419 420
420 mutex_lock(&sbi->wq_mutex); 421 mutex_lock(&sbi->wq_mutex);
421 if (!sbi->catatonic) { 422 if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
457{ 458{
458 unsigned long timeout; 459 unsigned long timeout;
459 460
460 timeout = param->arg1; 461 timeout = param->timeout.timeout;
461 param->arg1 = sbi->exp_timeout / HZ; 462 param->timeout.timeout = sbi->exp_timeout / HZ;
462 sbi->exp_timeout = timeout * HZ; 463 sbi->exp_timeout = timeout * HZ;
463 return 0; 464 return 0;
464} 465}
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
489 path = param->path; 490 path = param->path;
490 devid = sbi->sb->s_dev; 491 devid = sbi->sb->s_dev;
491 492
492 param->arg1 = param->arg2 = -1; 493 param->requester.uid = param->requester.gid = -1;
493 494
494 /* Get nameidata of the parent directory */ 495 /* Get nameidata of the parent directory */
495 err = path_lookup(path, LOOKUP_PARENT, &nd); 496 err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
505 err = 0; 506 err = 0;
506 autofs4_expire_wait(nd.path.dentry); 507 autofs4_expire_wait(nd.path.dentry);
507 spin_lock(&sbi->fs_lock); 508 spin_lock(&sbi->fs_lock);
508 param->arg1 = ino->uid; 509 param->requester.uid = ino->uid;
509 param->arg2 = ino->gid; 510 param->requester.gid = ino->gid;
510 spin_unlock(&sbi->fs_lock); 511 spin_unlock(&sbi->fs_lock);
511 } 512 }
512 513
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
529 int err = -EAGAIN; 530 int err = -EAGAIN;
530 int how; 531 int how;
531 532
532 how = param->arg1; 533 how = param->expire.how;
533 mnt = fp->f_path.mnt; 534 mnt = fp->f_path.mnt;
534 535
535 if (sbi->type & AUTOFS_TYPE_TRIGGER) 536 if (autofs_type_trigger(sbi->type))
536 dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); 537 dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
537 else 538 else
538 dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); 539 dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
565 struct autofs_sb_info *sbi, 566 struct autofs_sb_info *sbi,
566 struct autofs_dev_ioctl *param) 567 struct autofs_dev_ioctl *param)
567{ 568{
568 param->arg1 = 0; 569 param->askumount.may_umount = 0;
569 if (may_umount(fp->f_path.mnt)) 570 if (may_umount(fp->f_path.mnt))
570 param->arg1 = 1; 571 param->askumount.may_umount = 1;
571 return 0; 572 return 0;
572} 573}
573 574
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
600 struct nameidata nd; 601 struct nameidata nd;
601 const char *path; 602 const char *path;
602 unsigned int type; 603 unsigned int type;
604 unsigned int devid, magic;
603 int err = -ENOENT; 605 int err = -ENOENT;
604 606
605 if (param->size <= sizeof(*param)) { 607 if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
608 } 610 }
609 611
610 path = param->path; 612 path = param->path;
611 type = param->arg1; 613 type = param->ismountpoint.in.type;
612 614
613 param->arg1 = 0; 615 param->ismountpoint.out.devid = devid = 0;
614 param->arg2 = 0; 616 param->ismountpoint.out.magic = magic = 0;
615 617
616 if (!fp || param->ioctlfd == -1) { 618 if (!fp || param->ioctlfd == -1) {
617 if (type == AUTOFS_TYPE_ANY) { 619 if (autofs_type_any(type)) {
618 struct super_block *sb; 620 struct super_block *sb;
619 621
620 err = path_lookup(path, LOOKUP_FOLLOW, &nd); 622 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
622 goto out; 624 goto out;
623 625
624 sb = nd.path.dentry->d_sb; 626 sb = nd.path.dentry->d_sb;
625 param->arg1 = new_encode_dev(sb->s_dev); 627 devid = new_encode_dev(sb->s_dev);
626 } else { 628 } else {
627 struct autofs_info *ino; 629 struct autofs_info *ino;
628 630
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
635 goto out_release; 637 goto out_release;
636 638
637 ino = autofs4_dentry_ino(nd.path.dentry); 639 ino = autofs4_dentry_ino(nd.path.dentry);
638 param->arg1 = autofs4_get_dev(ino->sbi); 640 devid = autofs4_get_dev(ino->sbi);
639 } 641 }
640 642
641 err = 0; 643 err = 0;
642 if (nd.path.dentry->d_inode && 644 if (nd.path.dentry->d_inode &&
643 nd.path.mnt->mnt_root == nd.path.dentry) { 645 nd.path.mnt->mnt_root == nd.path.dentry) {
644 err = 1; 646 err = 1;
645 param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; 647 magic = nd.path.dentry->d_inode->i_sb->s_magic;
646 } 648 }
647 } else { 649 } else {
648 dev_t devid = new_encode_dev(sbi->sb->s_dev); 650 dev_t dev = autofs4_get_dev(sbi);
649 651
650 err = path_lookup(path, LOOKUP_PARENT, &nd); 652 err = path_lookup(path, LOOKUP_PARENT, &nd);
651 if (err) 653 if (err)
652 goto out; 654 goto out;
653 655
654 err = autofs_dev_ioctl_find_super(&nd, devid); 656 err = autofs_dev_ioctl_find_super(&nd, dev);
655 if (err) 657 if (err)
656 goto out_release; 658 goto out_release;
657 659
658 param->arg1 = autofs4_get_dev(sbi); 660 devid = dev;
659 661
660 err = have_submounts(nd.path.dentry); 662 err = have_submounts(nd.path.dentry);
661 663
662 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { 664 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
663 if (follow_down(&nd.path.mnt, &nd.path.dentry)) { 665 if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
664 struct inode *inode = nd.path.dentry->d_inode; 666 struct inode *inode = nd.path.dentry->d_inode;
665 param->arg2 = inode->i_sb->s_magic; 667 magic = inode->i_sb->s_magic;
666 } 668 }
667 } 669 }
668 } 670 }
669 671
672 param->ismountpoint.out.devid = devid;
673 param->ismountpoint.out.magic = magic;
674
670out_release: 675out_release:
671 path_put(&nd.path); 676 path_put(&nd.path);
672out: 677out:
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
63 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 63 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
64 64
65 /* This is an autofs submount, we can't expire it */ 65 /* This is an autofs submount, we can't expire it */
66 if (sbi->type == AUTOFS_TYPE_INDIRECT) 66 if (autofs_type_indirect(sbi->type))
67 goto done; 67 goto done;
68 68
69 /* 69 /*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
490 if (arg && get_user(do_now, arg)) 490 if (arg && get_user(do_now, arg))
491 return -EFAULT; 491 return -EFAULT;
492 492
493 if (sbi->type & AUTOFS_TYPE_TRIGGER) 493 if (autofs_type_trigger(sbi->type))
494 dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); 494 dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
495 else 495 else
496 dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); 496 dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
197 seq_printf(m, ",minproto=%d", sbi->min_proto); 197 seq_printf(m, ",minproto=%d", sbi->min_proto);
198 seq_printf(m, ",maxproto=%d", sbi->max_proto); 198 seq_printf(m, ",maxproto=%d", sbi->max_proto);
199 199
200 if (sbi->type & AUTOFS_TYPE_OFFSET) 200 if (autofs_type_offset(sbi->type))
201 seq_printf(m, ",offset"); 201 seq_printf(m, ",offset");
202 else if (sbi->type & AUTOFS_TYPE_DIRECT) 202 else if (autofs_type_direct(sbi->type))
203 seq_printf(m, ",direct"); 203 seq_printf(m, ",direct");
204 else 204 else
205 seq_printf(m, ",indirect"); 205 seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
284 *maxproto = option; 284 *maxproto = option;
285 break; 285 break;
286 case Opt_indirect: 286 case Opt_indirect:
287 *type = AUTOFS_TYPE_INDIRECT; 287 set_autofs_type_indirect(type);
288 break; 288 break;
289 case Opt_direct: 289 case Opt_direct:
290 *type = AUTOFS_TYPE_DIRECT; 290 set_autofs_type_direct(type);
291 break; 291 break;
292 case Opt_offset: 292 case Opt_offset:
293 *type = AUTOFS_TYPE_OFFSET; 293 set_autofs_type_offset(type);
294 break; 294 break;
295 default: 295 default:
296 return 1; 296 return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
338 sbi->sb = s; 338 sbi->sb = s;
339 sbi->version = 0; 339 sbi->version = 0;
340 sbi->sub_version = 0; 340 sbi->sub_version = 0;
341 sbi->type = AUTOFS_TYPE_INDIRECT; 341 set_autofs_type_indirect(&sbi->type);
342 sbi->min_proto = 0; 342 sbi->min_proto = 0;
343 sbi->max_proto = 0; 343 sbi->max_proto = 0;
344 mutex_init(&sbi->wq_mutex); 344 mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
380 } 380 }
381 381
382 root_inode->i_fop = &autofs4_root_operations; 382 root_inode->i_fop = &autofs4_root_operations;
383 root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? 383 root_inode->i_op = autofs_type_trigger(sbi->type) ?
384 &autofs4_direct_root_inode_operations : 384 &autofs4_direct_root_inode_operations :
385 &autofs4_indirect_root_inode_operations; 385 &autofs4_indirect_root_inode_operations;
386 386
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
455 if (sb->s_root) { 455 if (sb->s_root) {
456 inode->i_uid = sb->s_root->d_inode->i_uid; 456 inode->i_uid = sb->s_root->d_inode->i_uid;
457 inode->i_gid = sb->s_root->d_inode->i_gid; 457 inode->i_gid = sb->s_root->d_inode->i_gid;
458 } else {
459 inode->i_uid = 0;
460 inode->i_gid = 0;
461 } 458 }
462 inode->i_blocks = 0;
463 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 459 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
464 460
465 if (S_ISDIR(inf->mode)) { 461 if (S_ISDIR(inf->mode)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
337 * is very similar for indirect mounts except only dentrys 337 * is very similar for indirect mounts except only dentrys
338 * in the root of the autofs file system may be negative. 338 * in the root of the autofs file system may be negative.
339 */ 339 */
340 if (sbi->type & AUTOFS_TYPE_TRIGGER) 340 if (autofs_type_trigger(sbi->type))
341 return -ENOENT; 341 return -ENOENT;
342 else if (!IS_ROOT(dentry->d_parent)) 342 else if (!IS_ROOT(dentry->d_parent))
343 return -ENOENT; 343 return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
348 return -ENOMEM; 348 return -ENOMEM;
349 349
350 /* If this is a direct mount request create a dummy name */ 350 /* If this is a direct mount request create a dummy name */
351 if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) 351 if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
352 qstr.len = sprintf(name, "%p", dentry); 352 qstr.len = sprintf(name, "%p", dentry);
353 else { 353 else {
354 qstr.len = autofs4_getpath(sbi, dentry, &name); 354 qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
406 type = autofs_ptype_expire_multi; 406 type = autofs_ptype_expire_multi;
407 } else { 407 } else {
408 if (notify == NFY_MOUNT) 408 if (notify == NFY_MOUNT)
409 type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? 409 type = autofs_type_trigger(sbi->type) ?
410 autofs_ptype_missing_direct : 410 autofs_ptype_missing_direct :
411 autofs_ptype_missing_indirect; 411 autofs_ptype_missing_indirect;
412 else 412 else
413 type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? 413 type = autofs_type_trigger(sbi->type) ?
414 autofs_ptype_expire_direct : 414 autofs_ptype_expire_direct :
415 autofs_ptype_expire_indirect; 415 autofs_ptype_expire_indirect;
416 } 416 }
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
213{ 213{
214 struct bfs_sb_info *info = BFS_SB(s); 214 struct bfs_sb_info *info = BFS_SB(s);
215 215
216 if (!info)
217 return;
218
216 brelse(info->si_sbh); 219 brelse(info->si_sbh);
217 mutex_destroy(&info->bfs_lock); 220 mutex_destroy(&info->bfs_lock);
218 kfree(info->si_imap); 221 kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
327 unsigned i, imap_len; 330 unsigned i, imap_len;
328 struct bfs_sb_info *info; 331 struct bfs_sb_info *info;
329 long ret = -EINVAL; 332 long ret = -EINVAL;
333 unsigned long i_sblock, i_eblock, i_eoff, s_size;
330 334
331 info = kzalloc(sizeof(*info), GFP_KERNEL); 335 info = kzalloc(sizeof(*info), GFP_KERNEL);
332 if (!info) 336 if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
350 354
351 s->s_magic = BFS_MAGIC; 355 s->s_magic = BFS_MAGIC;
352 info->si_sbh = bh; 356 info->si_sbh = bh;
357
358 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
359 printf("Superblock is corrupted\n");
360 goto out;
361 }
362
353 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / 363 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
354 sizeof(struct bfs_inode) 364 sizeof(struct bfs_inode)
355 + BFS_ROOT_INO - 1; 365 + BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
380 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; 390 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
381 info->si_freei = 0; 391 info->si_freei = 0;
382 info->si_lf_eblk = 0; 392 info->si_lf_eblk = 0;
393
394 /* can we read the last block? */
395 bh = sb_bread(s, info->si_blocks - 1);
396 if (!bh) {
397 printf("Last block not available: %lu\n", info->si_blocks - 1);
398 iput(inode);
399 ret = -EIO;
400 kfree(info->si_imap);
401 goto out;
402 }
403 brelse(bh);
404
383 bh = NULL; 405 bh = NULL;
384 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { 406 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
385 struct bfs_inode *di; 407 struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
397 419
398 di = (struct bfs_inode *)bh->b_data + off; 420 di = (struct bfs_inode *)bh->b_data + off;
399 421
422 /* test if filesystem is not corrupted */
423
424 i_eoff = le32_to_cpu(di->i_eoffset);
425 i_sblock = le32_to_cpu(di->i_sblock);
426 i_eblock = le32_to_cpu(di->i_eblock);
427 s_size = le32_to_cpu(bfs_sb->s_end);
428
429 if (i_sblock > info->si_blocks ||
430 i_eblock > info->si_blocks ||
431 i_sblock > i_eblock ||
432 i_eoff > s_size ||
433 i_sblock * BFS_BSIZE > i_eoff) {
434
435 printf("Inode 0x%08x corrupted\n", i);
436
437 brelse(bh);
438 s->s_root = NULL;
439 kfree(info->si_imap);
440 kfree(info);
441 s->s_fs_info = NULL;
442 return -EIO;
443 }
444
400 if (!di->i_ino) { 445 if (!di->i_ino) {
401 info->si_freei++; 446 info->si_freei++;
402 continue; 447 continue;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
152 elf_addr_t __user *sp; 152 elf_addr_t __user *sp;
153 elf_addr_t __user *u_platform; 153 elf_addr_t __user *u_platform;
154 elf_addr_t __user *u_base_platform; 154 elf_addr_t __user *u_base_platform;
155 elf_addr_t __user *u_rand_bytes;
155 const char *k_platform = ELF_PLATFORM; 156 const char *k_platform = ELF_PLATFORM;
156 const char *k_base_platform = ELF_BASE_PLATFORM; 157 const char *k_base_platform = ELF_BASE_PLATFORM;
158 unsigned char k_rand_bytes[16];
157 int items; 159 int items;
158 elf_addr_t *elf_info; 160 elf_addr_t *elf_info;
159 int ei_index = 0; 161 int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
196 return -EFAULT; 198 return -EFAULT;
197 } 199 }
198 200
201 /*
202 * Generate 16 random bytes for userspace PRNG seeding.
203 */
204 get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
205 u_rand_bytes = (elf_addr_t __user *)
206 STACK_ALLOC(p, sizeof(k_rand_bytes));
207 if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
208 return -EFAULT;
209
199 /* Create the ELF interpreter info */ 210 /* Create the ELF interpreter info */
200 elf_info = (elf_addr_t *)current->mm->saved_auxv; 211 elf_info = (elf_addr_t *)current->mm->saved_auxv;
201 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ 212 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
228 NEW_AUX_ENT(AT_GID, cred->gid); 239 NEW_AUX_ENT(AT_GID, cred->gid);
229 NEW_AUX_ENT(AT_EGID, cred->egid); 240 NEW_AUX_ENT(AT_EGID, cred->egid);
230 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 241 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
242 NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
231 NEW_AUX_ENT(AT_EXECFN, bprm->exec); 243 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
232 if (k_platform) { 244 if (k_platform) {
233 NEW_AUX_ENT(AT_PLATFORM, 245 NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
496 496
497 if (inode) { 497 if (inode) {
498 inode->i_mode = mode; 498 inode->i_mode = mode;
499 inode->i_uid = 0;
500 inode->i_gid = 0;
501 inode->i_blocks = 0;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = 499 inode->i_atime = inode->i_mtime = inode->i_ctime =
503 current_fs_time(inode->i_sb); 500 current_fs_time(inode->i_sb);
504 } 501 }
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
652static ssize_t 649static ssize_t
653bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) 650bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
654{ 651{
655 char *s = enabled ? "enabled" : "disabled"; 652 char *s = enabled ? "enabled\n" : "disabled\n";
656 653
657 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); 654 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
658} 655}
diff --git a/fs/bio.c b/fs/bio.c
index 711cee103602..062299acbccd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
788 int i, ret; 788 int i, ret;
789 int nr_pages = 0; 789 int nr_pages = 0;
790 unsigned int len = 0; 790 unsigned int len = 0;
791 unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
791 792
792 for (i = 0; i < iov_count; i++) { 793 for (i = 0; i < iov_count; i++) {
793 unsigned long uaddr; 794 unsigned long uaddr;
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
814 bio->bi_rw |= (!write_to_vm << BIO_RW); 815 bio->bi_rw |= (!write_to_vm << BIO_RW);
815 816
816 ret = 0; 817 ret = 0;
817 i = 0; 818
819 if (map_data) {
820 nr_pages = 1 << map_data->page_order;
821 i = map_data->offset / PAGE_SIZE;
822 }
818 while (len) { 823 while (len) {
819 unsigned int bytes; 824 unsigned int bytes = PAGE_SIZE;
820 825
821 if (map_data) 826 bytes -= offset;
822 bytes = 1U << (PAGE_SHIFT + map_data->page_order);
823 else
824 bytes = PAGE_SIZE;
825 827
826 if (bytes > len) 828 if (bytes > len)
827 bytes = len; 829 bytes = len;
828 830
829 if (map_data) { 831 if (map_data) {
830 if (i == map_data->nr_entries) { 832 if (i == map_data->nr_entries * nr_pages) {
831 ret = -ENOMEM; 833 ret = -ENOMEM;
832 break; 834 break;
833 } 835 }
834 page = map_data->pages[i++]; 836
835 } else 837 page = map_data->pages[i / nr_pages];
838 page += (i % nr_pages);
839
840 i++;
841 } else {
836 page = alloc_page(q->bounce_gfp | gfp_mask); 842 page = alloc_page(q->bounce_gfp | gfp_mask);
837 if (!page) { 843 if (!page) {
838 ret = -ENOMEM; 844 ret = -ENOMEM;
839 break; 845 break;
846 }
840 } 847 }
841 848
842 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) 849 if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
843 break; 850 break;
844 851
845 len -= bytes; 852 len -= bytes;
853 offset = 0;
846 } 854 }
847 855
848 if (ret) 856 if (ret)
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
851 /* 859 /*
852 * success 860 * success
853 */ 861 */
854 if (!write_to_vm) { 862 if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
855 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); 863 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
856 if (ret) 864 if (ret)
857 goto cleanup; 865 goto cleanup;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1dd07e66e98a..ac7031f12ea5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1005 } 1005 }
1006 1006
1007 lock_kernel(); 1007 lock_kernel();
1008 restart:
1008 1009
1009 ret = -ENXIO; 1010 ret = -ENXIO;
1010 disk = get_gendisk(bdev->bd_dev, &partno); 1011 disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1025 1026
1026 if (disk->fops->open) { 1027 if (disk->fops->open) {
1027 ret = disk->fops->open(bdev, mode); 1028 ret = disk->fops->open(bdev, mode);
1029 if (ret == -ERESTARTSYS) {
1030 /* Lost a race with 'disk' being
1031 * deleted, try again.
1032 * See md.c
1033 */
1034 disk_put_part(bdev->bd_part);
1035 bdev->bd_part = NULL;
1036 module_put(disk->fops->owner);
1037 put_disk(disk);
1038 bdev->bd_disk = NULL;
1039 mutex_unlock(&bdev->bd_mutex);
1040 goto restart;
1041 }
1028 if (ret) 1042 if (ret)
1029 goto out_clear; 1043 goto out_clear;
1030 } 1044 }
@@ -1277,7 +1291,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1277 1291
1278/** 1292/**
1279 * lookup_bdev - lookup a struct block_device by name 1293 * lookup_bdev - lookup a struct block_device by name
1280 * @path: special file representing the block device 1294 * @pathname: special file representing the block device
1281 * 1295 *
1282 * Get a reference to the blockdevice at @pathname in the current 1296 * Get a reference to the blockdevice at @pathname in the current
1283 * namespace if possible and return it. Return ERR_PTR(error) 1297 * namespace if possible and return it. Return ERR_PTR(error)
diff --git a/fs/buffer.c b/fs/buffer.c
index a13f09b696f7..c26da785938a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
2022 if (pos + len > inode->i_size) 2022 if (pos + len > inode->i_size)
2023 vmtruncate(inode, inode->i_size); 2023 vmtruncate(inode, inode->i_size);
2024 } 2024 }
2025 goto out;
2026 } 2025 }
2027 2026
2028out: 2027out:
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a72618..38f71222a552 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
120 cd->major = major; 120 cd->major = major;
121 cd->baseminor = baseminor; 121 cd->baseminor = baseminor;
122 cd->minorct = minorct; 122 cd->minorct = minorct;
123 strncpy(cd->name,name, 64); 123 strlcpy(cd->name, name, sizeof(cd->name));
124 124
125 i = major_to_index(major); 125 i = major_to_index(major);
126 126
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..5ab9896fdcb2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
1641 i_size_write(inode, offset); 1641 i_size_write(inode, offset);
1642 spin_unlock(&inode->i_lock); 1642 spin_unlock(&inode->i_lock);
1643out_truncate: 1643out_truncate:
1644 if (inode->i_op && inode->i_op->truncate) 1644 if (inode->i_op->truncate)
1645 inode->i_op->truncate(inode); 1645 inode->i_op->truncate(inode);
1646 return 0; 1646 return 0;
1647out_sig: 1647out_sig:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
202{ 202{
203 struct file *host_file; 203 struct file *host_file;
204 struct dentry *host_dentry; 204 struct inode *coda_inode = coda_dentry->d_inode;
205 struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
206 struct coda_file_info *cfi; 205 struct coda_file_info *cfi;
207 int err = 0; 206 int err = 0;
208 207
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
214 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 213 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
215 host_file = cfi->cfi_container; 214 host_file = cfi->cfi_container;
216 215
217 if (host_file->f_op && host_file->f_op->fsync) { 216 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
218 host_dentry = host_file->f_path.dentry;
219 host_inode = host_dentry->d_inode;
220 mutex_lock(&host_inode->i_mutex);
221 err = host_file->f_op->fsync(host_file, host_dentry, datasync);
222 mutex_unlock(&host_inode->i_mutex);
223 }
224
225 if ( !err && !datasync ) { 217 if ( !err && !datasync ) {
226 lock_kernel(); 218 lock_kernel();
227 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 219 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
11 11
12#include "coda_int.h" 12#include "coda_int.h"
13 13
14#ifdef CONFIG_SYSCTL
14static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
15 17
16static ctl_table coda_table[] = { 18static ctl_table coda_table[] = {
17 { 19 {
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
41 {} 43 {}
42}; 44};
43 45
46#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 47static ctl_table fs_table[] = {
45 { 48 {
46 .ctl_name = CTL_UNNUMBERED, 49 .ctl_name = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
50 }, 53 },
51 {} 54 {}
52}; 55};
53 56#endif
54 57
55void coda_sysctl_init(void) 58void coda_sysctl_init(void)
56{ 59{
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b6411..30f2faa22f5c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
1188 1188
1189out: 1189out:
1190 if (ret > 0)
1191 add_rchar(current, ret);
1192 inc_syscr(current);
1190 fput(file); 1193 fput(file);
1191 return ret; 1194 return ret;
1192} 1195}
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1210 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1213 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
1211 1214
1212out: 1215out:
1216 if (ret > 0)
1217 add_wchar(current, ret);
1218 inc_syscw(current);
1213 fput(file); 1219 fput(file);
1214 return ret; 1220 return ret;
1215} 1221}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
117static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 117static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
118{ 118{
119 inode->i_mode = mode; 119 inode->i_mode = mode;
120 inode->i_uid = 0;
121 inode->i_gid = 0;
122 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 120 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
123} 121}
124 122
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
136{ 134{
137 struct inode * inode = new_inode(configfs_sb); 135 struct inode * inode = new_inode(configfs_sb);
138 if (inode) { 136 if (inode) {
139 inode->i_blocks = 0;
140 inode->i_mapping->a_ops = &configfs_aops; 137 inode->i_mapping->a_ops = &configfs_aops;
141 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 138 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
142 inode->i_op = &configfs_inode_operations; 139 inode->i_op = &configfs_inode_operations;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
83 inode->i_op = &page_symlink_inode_operations; 83 inode->i_op = &page_symlink_inode_operations;
84 inode->i_data.a_ops = &cramfs_aops; 84 inode->i_data.a_ops = &cramfs_aops;
85 } else { 85 } else {
86 inode->i_size = 0;
87 inode->i_blocks = 0;
88 init_special_inode(inode, inode->i_mode, 86 init_special_inode(inode, inode->i_mode,
89 old_decode_dev(cramfs_inode->size)); 87 old_decode_dev(cramfs_inode->size));
90 } 88 }
diff --git a/fs/dcache.c b/fs/dcache.c
index e88c23b85a32..4547f66884a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry)
1567 spin_unlock(&dcache_lock); 1567 spin_unlock(&dcache_lock);
1568} 1568}
1569 1569
1570#define do_switch(x,y) do { \
1571 __typeof__ (x) __tmp = x; \
1572 x = y; y = __tmp; } while (0)
1573
1574/* 1570/*
1575 * When switching names, the actual string doesn't strictly have to 1571 * When switching names, the actual string doesn't strictly have to
1576 * be preserved in the target - because we're dropping the target 1572 * be preserved in the target - because we're dropping the target
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1589 /* 1585 /*
1590 * Both external: swap the pointers 1586 * Both external: swap the pointers
1591 */ 1587 */
1592 do_switch(target->d_name.name, dentry->d_name.name); 1588 swap(target->d_name.name, dentry->d_name.name);
1593 } else { 1589 } else {
1594 /* 1590 /*
1595 * dentry:internal, target:external. Steal target's 1591 * dentry:internal, target:external. Steal target's
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1620 return; 1616 return;
1621 } 1617 }
1622 } 1618 }
1623 do_switch(dentry->d_name.len, target->d_name.len); 1619 swap(dentry->d_name.len, target->d_name.len);
1624} 1620}
1625 1621
1626/* 1622/*
@@ -1680,7 +1676,7 @@ already_unhashed:
1680 1676
1681 /* Switch the names.. */ 1677 /* Switch the names.. */
1682 switch_names(dentry, target); 1678 switch_names(dentry, target);
1683 do_switch(dentry->d_name.hash, target->d_name.hash); 1679 swap(dentry->d_name.hash, target->d_name.hash);
1684 1680
1685 /* ... and switch the parents */ 1681 /* ... and switch the parents */
1686 if (IS_ROOT(dentry)) { 1682 if (IS_ROOT(dentry)) {
@@ -1688,7 +1684,7 @@ already_unhashed:
1688 target->d_parent = target; 1684 target->d_parent = target;
1689 INIT_LIST_HEAD(&target->d_u.d_child); 1685 INIT_LIST_HEAD(&target->d_u.d_child);
1690 } else { 1686 } else {
1691 do_switch(dentry->d_parent, target->d_parent); 1687 swap(dentry->d_parent, target->d_parent);
1692 1688
1693 /* And add them back to the (new) parent lists */ 1689 /* And add them back to the (new) parent lists */
1694 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 1690 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1789 struct dentry *dparent, *aparent; 1785 struct dentry *dparent, *aparent;
1790 1786
1791 switch_names(dentry, anon); 1787 switch_names(dentry, anon);
1792 do_switch(dentry->d_name.hash, anon->d_name.hash); 1788 swap(dentry->d_name.hash, anon->d_name.hash);
1793 1789
1794 dparent = dentry->d_parent; 1790 dparent = dentry->d_parent;
1795 aparent = anon->d_parent; 1791 aparent = anon->d_parent;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8a..33a90120f6ad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
294} 294}
295EXPORT_SYMBOL_GPL(debugfs_create_x32); 295EXPORT_SYMBOL_GPL(debugfs_create_x32);
296 296
297
298static int debugfs_size_t_set(void *data, u64 val)
299{
300 *(size_t *)data = val;
301 return 0;
302}
303static int debugfs_size_t_get(void *data, u64 *val)
304{
305 *val = *(size_t *)data;
306 return 0;
307}
308DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
309 "%llu\n"); /* %llu and %zu are more or less the same */
310
311/**
312 * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
313 * @name: a pointer to a string containing the name of the file to create.
314 * @mode: the permission that the file should have
315 * @parent: a pointer to the parent dentry for this file. This should be a
316 * directory dentry if set. If this parameter is %NULL, then the
317 * file will be created in the root of the debugfs filesystem.
318 * @value: a pointer to the variable that the file should read to and write
319 * from.
320 */
321struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
322 struct dentry *parent, size_t *value)
323{
324 return debugfs_create_file(name, mode, parent, value, &fops_size_t);
325}
326EXPORT_SYMBOL_GPL(debugfs_create_size_t);
327
328
297static ssize_t read_file_bool(struct file *file, char __user *user_buf, 329static ssize_t read_file_bool(struct file *file, char __user *user_buf,
298 size_t count, loff_t *ppos) 330 size_t count, loff_t *ppos)
299{ 331{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
37 37
38 if (inode) { 38 if (inode) {
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = 0;
41 inode->i_gid = 0;
42 inode->i_blocks = 0;
43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 40 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
44 switch (mode & S_IFMT) { 41 switch (mode & S_IFMT) {
45 default: 42 default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index fff96e152c0c..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -189,8 +189,6 @@ static int mknod_ptmx(struct super_block *sb)
189 } 189 }
190 190
191 inode->i_ino = 2; 191 inode->i_ino = 2;
192 inode->i_uid = inode->i_gid = 0;
193 inode->i_blocks = 0;
194 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 192 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
195 193
196 mode = S_IFCHR|opts->ptmxmode; 194 mode = S_IFCHR|opts->ptmxmode;
@@ -300,8 +298,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
300 goto free_fsi; 298 goto free_fsi;
301 inode->i_ino = 1; 299 inode->i_ino = 1;
302 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 300 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
303 inode->i_blocks = 0;
304 inode->i_uid = inode->i_gid = 0;
305 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 301 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
306 inode->i_op = &simple_dir_inode_operations; 302 inode->i_op = &simple_dir_inode_operations;
307 inode->i_fop = &simple_dir_operations; 303 inode->i_fop = &simple_dir_operations;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b7..b6d43908ff7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1209 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1210 nr_segs, blkbits, get_block, end_io, dio); 1210 nr_segs, blkbits, get_block, end_io, dio);
1211 1211
1212 /*
1213 * In case of error extending write may have instantiated a few
1214 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1215 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
1216 * it's own meaner.
1217 */
1218 if (unlikely(retval < 0 && (rw & WRITE))) {
1219 loff_t isize = i_size_read(inode);
1220
1221 if (end > isize && dio_lock_type == DIO_LOCKING)
1222 vmtruncate(inode, isize);
1223 }
1224
1212 if (rw == READ && dio_lock_type == DIO_LOCKING) 1225 if (rw == READ && dio_lock_type == DIO_LOCKING)
1213 release_i_mutex = 0; 1226 release_i_mutex = 0;
1214 1227
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf01..dc2ad6008b2d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type); 39 dlm_user_add_ast(lkb, type, bastmode);
40 return; 40 return;
41 } 41 }
42 42
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 } 47 }
48 lkb->lkb_ast_type |= type; 48 lkb->lkb_ast_type |= type;
49 if (bastmode)
50 lkb->lkb_bastmode = bastmode;
49 spin_unlock(&ast_queue_lock); 51 spin_unlock(&ast_queue_lock);
50 52
51 set_bit(WAKE_ASTS, &astd_wakeflags); 53 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,50 +61,40 @@ static void process_asts(void)
59 struct dlm_lkb *lkb; 61 struct dlm_lkb *lkb;
60 void (*cast) (void *astparam); 62 void (*cast) (void *astparam);
61 void (*bast) (void *astparam, int mode); 63 void (*bast) (void *astparam, int mode);
62 int type = 0, found, bmode; 64 int type = 0, bastmode;
63 65
64 for (;;) { 66repeat:
65 found = 0; 67 spin_lock(&ast_queue_lock);
66 spin_lock(&ast_queue_lock); 68 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
67 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { 69 r = lkb->lkb_resource;
68 r = lkb->lkb_resource; 70 ls = r->res_ls;
69 ls = r->res_ls; 71
70 72 if (dlm_locking_stopped(ls))
71 if (dlm_locking_stopped(ls)) 73 continue;
72 continue;
73
74 list_del(&lkb->lkb_astqueue);
75 type = lkb->lkb_ast_type;
76 lkb->lkb_ast_type = 0;
77 found = 1;
78 break;
79 }
80 spin_unlock(&ast_queue_lock);
81 74
82 if (!found) 75 list_del(&lkb->lkb_astqueue);
83 break; 76 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0;
78 bastmode = lkb->lkb_bastmode;
84 79
80 spin_unlock(&ast_queue_lock);
85 cast = lkb->lkb_astfn; 81 cast = lkb->lkb_astfn;
86 bast = lkb->lkb_bastfn; 82 bast = lkb->lkb_bastfn;
87 bmode = lkb->lkb_bastmode;
88 83
89 if ((type & AST_COMP) && cast) 84 if ((type & AST_COMP) && cast)
90 cast(lkb->lkb_astparam); 85 cast(lkb->lkb_astparam);
91 86
92 /* FIXME: Is it safe to look at lkb_grmode here
93 without doing a lock_rsb() ?
94 Look at other checks in v1 to avoid basts. */
95
96 if ((type & AST_BAST) && bast) 87 if ((type & AST_BAST) && bast)
97 if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) 88 bast(lkb->lkb_astparam, bastmode);
98 bast(lkb->lkb_astparam, bmode);
99 89
100 /* this removes the reference added by dlm_add_ast 90 /* this removes the reference added by dlm_add_ast
101 and may result in the lkb being freed */ 91 and may result in the lkb being freed */
102 dlm_put_lkb(lkb); 92 dlm_put_lkb(lkb);
103 93
104 schedule(); 94 cond_resched();
95 goto repeat;
105 } 96 }
97 spin_unlock(&ast_queue_lock);
106} 98}
107 99
108static inline int no_asts(void) 100static inline int no_asts(void)
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c52..1b5fc5f428fd 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a3..2f107d1a6a45 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,7 @@ static struct dentry *dlm_root;
27 27
28struct rsb_iter { 28struct rsb_iter {
29 int entry; 29 int entry;
30 int locks; 30 int format;
31 int header; 31 int header;
32 struct dlm_ls *ls; 32 struct dlm_ls *ls;
33 struct list_head *next; 33 struct list_head *next;
@@ -60,8 +60,8 @@ static char *print_lockmode(int mode)
60 } 60 }
61} 61}
62 62
63static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, 63static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
64 struct dlm_rsb *res) 64 struct dlm_rsb *res)
65{ 65{
66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); 66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
67 67
@@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
83 seq_printf(s, "\n"); 83 seq_printf(s, "\n");
84} 84}
85 85
86static int print_resource(struct dlm_rsb *res, struct seq_file *s) 86static int print_format1(struct dlm_rsb *res, struct seq_file *s)
87{ 87{
88 struct dlm_lkb *lkb; 88 struct dlm_lkb *lkb;
89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; 89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
@@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
134 /* Print the locks attached to this resource */ 134 /* Print the locks attached to this resource */
135 seq_printf(s, "Granted Queue\n"); 135 seq_printf(s, "Granted Queue\n");
136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) 136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
137 print_resource_lock(s, lkb, res); 137 print_format1_lock(s, lkb, res);
138 138
139 seq_printf(s, "Conversion Queue\n"); 139 seq_printf(s, "Conversion Queue\n");
140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) 140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
141 print_resource_lock(s, lkb, res); 141 print_format1_lock(s, lkb, res);
142 142
143 seq_printf(s, "Waiting Queue\n"); 143 seq_printf(s, "Waiting Queue\n");
144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) 144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
145 print_resource_lock(s, lkb, res); 145 print_format1_lock(s, lkb, res);
146 146
147 if (list_empty(&res->res_lookup)) 147 if (list_empty(&res->res_lookup))
148 goto out; 148 goto out;
@@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
160 return 0; 160 return 0;
161} 161}
162 162
163static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) 163static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
164 struct dlm_rsb *r)
164{ 165{
165 unsigned int waiting = 0; 166 u64 xid = 0;
166 uint64_t xid = 0; 167 u64 us;
167 168
168 if (lkb->lkb_flags & DLM_IFL_USER) { 169 if (lkb->lkb_flags & DLM_IFL_USER) {
169 if (lkb->lkb_ua) 170 if (lkb->lkb_ua)
170 xid = lkb->lkb_ua->xid; 171 xid = lkb->lkb_ua->xid;
171 } 172 }
172 173
173 if (lkb->lkb_timestamp) 174 /* microseconds since lkb was added to current queue */
174 waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); 175 us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
175 176
176 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms 177 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
177 r_nodeid r_len r_name */ 178 r_nodeid r_len r_name */
178 179
179 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", 180 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
180 lkb->lkb_id, 181 lkb->lkb_id,
181 lkb->lkb_nodeid, 182 lkb->lkb_nodeid,
182 lkb->lkb_remid, 183 lkb->lkb_remid,
@@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *
187 lkb->lkb_status, 188 lkb->lkb_status,
188 lkb->lkb_grmode, 189 lkb->lkb_grmode,
189 lkb->lkb_rqmode, 190 lkb->lkb_rqmode,
190 waiting, 191 (unsigned long long)us,
191 r->res_nodeid, 192 r->res_nodeid,
192 r->res_length, 193 r->res_length,
193 r->res_name); 194 r->res_name);
194} 195}
195 196
196static int print_locks(struct dlm_rsb *r, struct seq_file *s) 197static int print_format2(struct dlm_rsb *r, struct seq_file *s)
197{ 198{
198 struct dlm_lkb *lkb; 199 struct dlm_lkb *lkb;
199 200
200 lock_rsb(r); 201 lock_rsb(r);
201 202
202 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) 203 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
203 print_lock(s, lkb, r); 204 print_format2_lock(s, lkb, r);
204 205
205 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) 206 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
206 print_lock(s, lkb, r); 207 print_format2_lock(s, lkb, r);
207 208
208 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) 209 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
209 print_lock(s, lkb, r); 210 print_format2_lock(s, lkb, r);
211
212 unlock_rsb(r);
213 return 0;
214}
215
216static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
217 int rsb_lookup)
218{
219 u64 xid = 0;
220
221 if (lkb->lkb_flags & DLM_IFL_USER) {
222 if (lkb->lkb_ua)
223 xid = lkb->lkb_ua->xid;
224 }
225
226 seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
227 lkb->lkb_id,
228 lkb->lkb_nodeid,
229 lkb->lkb_remid,
230 lkb->lkb_ownpid,
231 (unsigned long long)xid,
232 lkb->lkb_exflags,
233 lkb->lkb_flags,
234 lkb->lkb_status,
235 lkb->lkb_grmode,
236 lkb->lkb_rqmode,
237 lkb->lkb_highbast,
238 rsb_lookup,
239 lkb->lkb_wait_type,
240 lkb->lkb_lvbseq,
241 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
242 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
243}
244
245static int print_format3(struct dlm_rsb *r, struct seq_file *s)
246{
247 struct dlm_lkb *lkb;
248 int i, lvblen = r->res_ls->ls_lvblen;
249 int print_name = 1;
250
251 lock_rsb(r);
252
253 seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
254 r,
255 r->res_nodeid,
256 r->res_first_lkid,
257 r->res_flags,
258 !list_empty(&r->res_root_list),
259 !list_empty(&r->res_recover_list),
260 r->res_recover_locks_count,
261 r->res_length);
262
263 for (i = 0; i < r->res_length; i++) {
264 if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
265 print_name = 0;
266 }
267
268 seq_printf(s, "%s", print_name ? "str " : "hex");
269
270 for (i = 0; i < r->res_length; i++) {
271 if (print_name)
272 seq_printf(s, "%c", r->res_name[i]);
273 else
274 seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
275 }
276 seq_printf(s, "\n");
277
278 if (!r->res_lvbptr)
279 goto do_locks;
280
281 seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
282
283 for (i = 0; i < lvblen; i++)
284 seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
285 seq_printf(s, "\n");
286
287 do_locks:
288 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
289 print_format3_lock(s, lkb, 0);
290
291 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
292 print_format3_lock(s, lkb, 0);
293
294 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
295 print_format3_lock(s, lkb, 0);
296
297 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
298 print_format3_lock(s, lkb, 1);
210 299
211 unlock_rsb(r); 300 unlock_rsb(r);
212 return 0; 301 return 0;
@@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
231 break; 320 break;
232 } 321 }
233 read_unlock(&ls->ls_rsbtbl[i].lock); 322 read_unlock(&ls->ls_rsbtbl[i].lock);
234 } 323 }
235 ri->entry = i; 324 ri->entry = i;
236 325
237 if (ri->entry >= ls->ls_rsbtbl_size) 326 if (ri->entry >= ls->ls_rsbtbl_size)
@@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
248 read_unlock(&ls->ls_rsbtbl[i].lock); 337 read_unlock(&ls->ls_rsbtbl[i].lock);
249 dlm_put_rsb(old); 338 dlm_put_rsb(old);
250 goto top; 339 goto top;
251 } 340 }
252 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); 341 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
253 dlm_hold_rsb(ri->rsb); 342 dlm_hold_rsb(ri->rsb);
254 read_unlock(&ls->ls_rsbtbl[i].lock); 343 read_unlock(&ls->ls_rsbtbl[i].lock);
@@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
274 ri->ls = ls; 363 ri->ls = ls;
275 ri->entry = 0; 364 ri->entry = 0;
276 ri->next = NULL; 365 ri->next = NULL;
366 ri->format = 1;
277 367
278 if (rsb_iter_next(ri)) { 368 if (rsb_iter_next(ri)) {
279 rsb_iter_free(ri); 369 rsb_iter_free(ri);
@@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
325{ 415{
326 struct rsb_iter *ri = iter_ptr; 416 struct rsb_iter *ri = iter_ptr;
327 417
328 if (ri->locks) { 418 switch (ri->format) {
419 case 1:
420 print_format1(ri->rsb, file);
421 break;
422 case 2:
329 if (ri->header) { 423 if (ri->header) {
330 seq_printf(file, "id nodeid remid pid xid exflags flags " 424 seq_printf(file, "id nodeid remid pid xid exflags "
331 "sts grmode rqmode time_ms r_nodeid " 425 "flags sts grmode rqmode time_ms "
332 "r_len r_name\n"); 426 "r_nodeid r_len r_name\n");
333 ri->header = 0; 427 ri->header = 0;
334 } 428 }
335 print_locks(ri->rsb, file); 429 print_format2(ri->rsb, file);
336 } else { 430 break;
337 print_resource(ri->rsb, file); 431 case 3:
432 if (ri->header) {
433 seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
434 ri->header = 0;
435 }
436 print_format3(ri->rsb, file);
437 break;
338 } 438 }
339 439
340 return 0; 440 return 0;
@@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
385 ri->ls = ls; 485 ri->ls = ls;
386 ri->entry = 0; 486 ri->entry = 0;
387 ri->next = NULL; 487 ri->next = NULL;
388 ri->locks = 1; 488 ri->format = 2;
389 489
390 if (*pos == 0) 490 if (*pos == 0)
391 ri->header = 1; 491 ri->header = 1;
@@ -448,6 +548,84 @@ static const struct file_operations locks_fops = {
448}; 548};
449 549
450/* 550/*
551 * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
552 * This can replace both formats 1 and 2 eventually.
553 */
554
555static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
556{
557 struct rsb_iter *ri;
558
559 ri = kzalloc(sizeof *ri, GFP_KERNEL);
560 if (!ri)
561 return NULL;
562
563 ri->ls = ls;
564 ri->entry = 0;
565 ri->next = NULL;
566 ri->format = 3;
567
568 if (*pos == 0)
569 ri->header = 1;
570
571 if (rsb_iter_next(ri)) {
572 rsb_iter_free(ri);
573 return NULL;
574 }
575
576 return ri;
577}
578
579static void *all_seq_start(struct seq_file *file, loff_t *pos)
580{
581 struct rsb_iter *ri;
582 loff_t n = *pos;
583
584 ri = all_iter_init(file->private, pos);
585 if (!ri)
586 return NULL;
587
588 while (n--) {
589 if (rsb_iter_next(ri)) {
590 rsb_iter_free(ri);
591 return NULL;
592 }
593 }
594
595 return ri;
596}
597
598static struct seq_operations all_seq_ops = {
599 .start = all_seq_start,
600 .next = rsb_seq_next,
601 .stop = rsb_seq_stop,
602 .show = rsb_seq_show,
603};
604
605static int all_open(struct inode *inode, struct file *file)
606{
607 struct seq_file *seq;
608 int ret;
609
610 ret = seq_open(file, &all_seq_ops);
611 if (ret)
612 return ret;
613
614 seq = file->private_data;
615 seq->private = inode->i_private;
616
617 return 0;
618}
619
620static const struct file_operations all_fops = {
621 .owner = THIS_MODULE,
622 .open = all_open,
623 .read = seq_read,
624 .llseek = seq_lseek,
625 .release = seq_release
626};
627
628/*
451 * dump lkb's on the ls_waiters list 629 * dump lkb's on the ls_waiters list
452 */ 630 */
453 631
@@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = {
489 .read = waiters_read 667 .read = waiters_read
490}; 668};
491 669
670void dlm_delete_debug_file(struct dlm_ls *ls)
671{
672 if (ls->ls_debug_rsb_dentry)
673 debugfs_remove(ls->ls_debug_rsb_dentry);
674 if (ls->ls_debug_waiters_dentry)
675 debugfs_remove(ls->ls_debug_waiters_dentry);
676 if (ls->ls_debug_locks_dentry)
677 debugfs_remove(ls->ls_debug_locks_dentry);
678 if (ls->ls_debug_all_dentry)
679 debugfs_remove(ls->ls_debug_all_dentry);
680}
681
492int dlm_create_debug_file(struct dlm_ls *ls) 682int dlm_create_debug_file(struct dlm_ls *ls)
493{ 683{
494 char name[DLM_LOCKSPACE_LEN+8]; 684 char name[DLM_LOCKSPACE_LEN+8];
495 685
686 /* format 1 */
687
496 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, 688 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
497 S_IFREG | S_IRUGO, 689 S_IFREG | S_IRUGO,
498 dlm_root, 690 dlm_root,
499 ls, 691 ls,
500 &rsb_fops); 692 &rsb_fops);
501 if (!ls->ls_debug_rsb_dentry) 693 if (!ls->ls_debug_rsb_dentry)
502 return -ENOMEM; 694 goto fail;
503 695
504 memset(name, 0, sizeof(name)); 696 /* format 2 */
505 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
506
507 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
508 S_IFREG | S_IRUGO,
509 dlm_root,
510 ls,
511 &waiters_fops);
512 if (!ls->ls_debug_waiters_dentry) {
513 debugfs_remove(ls->ls_debug_rsb_dentry);
514 return -ENOMEM;
515 }
516 697
517 memset(name, 0, sizeof(name)); 698 memset(name, 0, sizeof(name));
518 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); 699 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls)
522 dlm_root, 703 dlm_root,
523 ls, 704 ls,
524 &locks_fops); 705 &locks_fops);
525 if (!ls->ls_debug_locks_dentry) { 706 if (!ls->ls_debug_locks_dentry)
526 debugfs_remove(ls->ls_debug_waiters_dentry); 707 goto fail;
527 debugfs_remove(ls->ls_debug_rsb_dentry); 708
528 return -ENOMEM; 709 /* format 3 */
529 } 710
711 memset(name, 0, sizeof(name));
712 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
713
714 ls->ls_debug_all_dentry = debugfs_create_file(name,
715 S_IFREG | S_IRUGO,
716 dlm_root,
717 ls,
718 &all_fops);
719 if (!ls->ls_debug_all_dentry)
720 goto fail;
721
722 memset(name, 0, sizeof(name));
723 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
724
725 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
726 S_IFREG | S_IRUGO,
727 dlm_root,
728 ls,
729 &waiters_fops);
730 if (!ls->ls_debug_waiters_dentry)
731 goto fail;
530 732
531 return 0; 733 return 0;
532}
533 734
534void dlm_delete_debug_file(struct dlm_ls *ls) 735 fail:
535{ 736 dlm_delete_debug_file(ls);
536 if (ls->ls_debug_rsb_dentry) 737 return -ENOMEM;
537 debugfs_remove(ls->ls_debug_rsb_dentry);
538 if (ls->ls_debug_waiters_dentry)
539 debugfs_remove(ls->ls_debug_waiters_dentry);
540 if (ls->ls_debug_locks_dentry)
541 debugfs_remove(ls->ls_debug_locks_dentry);
542} 738}
543 739
544int __init dlm_register_debugfs(void) 740int __init dlm_register_debugfs(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df4..92969f879a17 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
374 struct list_head *list; 374 struct list_head *list;
375 struct dlm_rsb *r; 375 struct dlm_rsb *r;
376 int offset = 0, dir_nodeid; 376 int offset = 0, dir_nodeid;
377 uint16_t be_namelen; 377 __be16 be_namelen;
378 378
379 down_read(&ls->ls_root_sem); 379 down_read(&ls->ls_root_sem);
380 380
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
410 410
411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { 411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
412 /* Write end-of-block record */ 412 /* Write end-of-block record */
413 be_namelen = 0; 413 be_namelen = cpu_to_be16(0);
414 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 414 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
415 offset += sizeof(uint16_t); 415 offset += sizeof(__be16);
416 goto out; 416 goto out;
417 } 417 }
418 418
419 be_namelen = cpu_to_be16(r->res_length); 419 be_namelen = cpu_to_be16(r->res_length);
420 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 420 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
421 offset += sizeof(uint16_t); 421 offset += sizeof(__be16);
422 memcpy(outbuf + offset, r->res_name, r->res_length); 422 memcpy(outbuf + offset, r->res_name, r->res_length);
423 offset += r->res_length; 423 offset += r->res_length;
424 } 424 }
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
430 430
431 if ((list == &ls->ls_root_list) && 431 if ((list == &ls->ls_root_list) &&
432 (offset + sizeof(uint16_t) <= outlen)) { 432 (offset + sizeof(uint16_t) <= outlen)) {
433 be_namelen = 0xFFFF; 433 be_namelen = cpu_to_be16(0xFFFF);
434 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 434 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
435 offset += sizeof(uint16_t); 435 offset += sizeof(__be16);
436 } 436 }
437 437
438 out: 438 out:
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef127..ef2f1e353966 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -245,7 +245,8 @@ struct dlm_lkb {
245 struct list_head lkb_astqueue; /* need ast to be sent */ 245 struct list_head lkb_astqueue; /* need ast to be sent */
246 struct list_head lkb_ownqueue; /* list of locks for a process */ 246 struct list_head lkb_ownqueue; /* list of locks for a process */
247 struct list_head lkb_time_list; 247 struct list_head lkb_time_list;
248 unsigned long lkb_timestamp; 248 ktime_t lkb_time_bast; /* for debugging */
249 ktime_t lkb_timestamp;
249 unsigned long lkb_timeout_cs; 250 unsigned long lkb_timeout_cs;
250 251
251 char *lkb_lvbptr; 252 char *lkb_lvbptr;
@@ -481,6 +482,7 @@ struct dlm_ls {
481 struct dentry *ls_debug_rsb_dentry; /* debugfs */ 482 struct dentry *ls_debug_rsb_dentry; /* debugfs */
482 struct dentry *ls_debug_waiters_dentry; /* debugfs */ 483 struct dentry *ls_debug_waiters_dentry; /* debugfs */
483 struct dentry *ls_debug_locks_dentry; /* debugfs */ 484 struct dentry *ls_debug_locks_dentry; /* debugfs */
485 struct dentry *ls_debug_all_dentry; /* debugfs */
484 486
485 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ 487 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
486 int ls_uevent_result; 488 int ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac91538..6cfe65bbf4a2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 307 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 309
310 dlm_add_ast(lkb, AST_COMP); 310 dlm_add_ast(lkb, AST_COMP, 0);
311} 311}
312 312
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
318 318
319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 320{
321 lkb->lkb_time_bast = ktime_get();
322
321 if (is_master_copy(lkb)) 323 if (is_master_copy(lkb))
322 send_bast(r, lkb, rqmode); 324 send_bast(r, lkb, rqmode);
323 else { 325 else
324 lkb->lkb_bastmode = rqmode; 326 dlm_add_ast(lkb, AST_BAST, rqmode);
325 dlm_add_ast(lkb, AST_BAST);
326 }
327} 327}
328 328
329/* 329/*
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
744 744
745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); 745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
746 746
747 lkb->lkb_timestamp = ktime_get();
748
747 lkb->lkb_status = status; 749 lkb->lkb_status = status;
748 750
749 switch (status) { 751 switch (status) {
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb)
1013{ 1015{
1014 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 1016 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1015 1017
1016 if (is_master_copy(lkb)) { 1018 if (is_master_copy(lkb))
1017 lkb->lkb_timestamp = jiffies;
1018 return; 1019 return;
1019 }
1020 1020
1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && 1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { 1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb)
1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); 1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
1032 mutex_lock(&ls->ls_timeout_mutex); 1032 mutex_lock(&ls->ls_timeout_mutex);
1033 hold_lkb(lkb); 1033 hold_lkb(lkb);
1034 lkb->lkb_timestamp = jiffies;
1035 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); 1034 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
1036 mutex_unlock(&ls->ls_timeout_mutex); 1035 mutex_unlock(&ls->ls_timeout_mutex);
1037} 1036}
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1059 struct dlm_rsb *r; 1058 struct dlm_rsb *r;
1060 struct dlm_lkb *lkb; 1059 struct dlm_lkb *lkb;
1061 int do_cancel, do_warn; 1060 int do_cancel, do_warn;
1061 s64 wait_us;
1062 1062
1063 for (;;) { 1063 for (;;) {
1064 if (dlm_locking_stopped(ls)) 1064 if (dlm_locking_stopped(ls))
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1069 mutex_lock(&ls->ls_timeout_mutex); 1069 mutex_lock(&ls->ls_timeout_mutex);
1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { 1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
1071 1071
1072 wait_us = ktime_to_us(ktime_sub(ktime_get(),
1073 lkb->lkb_timestamp));
1074
1072 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && 1075 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
1073 time_after_eq(jiffies, lkb->lkb_timestamp + 1076 wait_us >= (lkb->lkb_timeout_cs * 10000))
1074 lkb->lkb_timeout_cs * HZ/100))
1075 do_cancel = 1; 1077 do_cancel = 1;
1076 1078
1077 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && 1079 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
1078 time_after_eq(jiffies, lkb->lkb_timestamp + 1080 wait_us >= dlm_config.ci_timewarn_cs * 10000)
1079 dlm_config.ci_timewarn_cs * HZ/100))
1080 do_warn = 1; 1081 do_warn = 1;
1081 1082
1082 if (!do_cancel && !do_warn) 1083 if (!do_cancel && !do_warn)
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1122void dlm_adjust_timeouts(struct dlm_ls *ls) 1123void dlm_adjust_timeouts(struct dlm_ls *ls)
1123{ 1124{
1124 struct dlm_lkb *lkb; 1125 struct dlm_lkb *lkb;
1125 long adj = jiffies - ls->ls_recover_begin; 1126 u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
1126 1127
1127 ls->ls_recover_begin = 0; 1128 ls->ls_recover_begin = 0;
1128 mutex_lock(&ls->ls_timeout_mutex); 1129 mutex_lock(&ls->ls_timeout_mutex);
1129 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) 1130 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1130 lkb->lkb_timestamp += adj; 1131 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1131 mutex_unlock(&ls->ls_timeout_mutex); 1132 mutex_unlock(&ls->ls_timeout_mutex);
1132} 1133}
1133 1134
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991a..103a5ebd1371 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
295 con->sock->sk->sk_write_space = lowcomms_write_space; 295 con->sock->sk->sk_write_space = lowcomms_write_space;
296 con->sock->sk->sk_state_change = lowcomms_state_change; 296 con->sock->sk->sk_state_change = lowcomms_state_change;
297 con->sock->sk->sk_user_data = con; 297 con->sock->sk->sk_user_data = con;
298 con->sock->sk->sk_allocation = GFP_NOFS;
298 return 0; 299 return 0;
299} 300}
300 301
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
823 len = e->len; 824 len = e->len;
824 offset = e->offset; 825 offset = e->offset;
825 spin_unlock(&con->writequeue_lock); 826 spin_unlock(&con->writequeue_lock);
826 kmap(e->page);
827 827
828 /* Send the first block off the write queue */ 828 /* Send the first block off the write queue */
829 iov[0].iov_base = page_address(e->page)+offset; 829 iov[0].iov_base = page_address(e->page)+offset;
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
854 854
855 if (e->len == 0 && e->users == 0) { 855 if (e->len == 0 && e->users == 0) {
856 list_del(&e->list); 856 list_del(&e->list);
857 kunmap(e->page);
858 free_entry(e); 857 free_entry(e);
859 } 858 }
860 spin_unlock(&con->writequeue_lock); 859 spin_unlock(&con->writequeue_lock);
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1203 1202
1204 if (e) { 1203 if (e) {
1205 got_one: 1204 got_one:
1206 if (users == 0)
1207 kmap(e->page);
1208 *ppc = page_address(e->page) + offset; 1205 *ppc = page_address(e->page) + offset;
1209 return e; 1206 return e;
1210 } 1207 }
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
1233 if (users) 1230 if (users)
1234 goto out; 1231 goto out;
1235 e->len = e->end - e->offset; 1232 e->len = e->end - e->offset;
1236 kunmap(e->page);
1237 spin_unlock(&con->writequeue_lock); 1233 spin_unlock(&con->writequeue_lock);
1238 1234
1239 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { 1235 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con)
1272 offset = e->offset; 1268 offset = e->offset;
1273 BUG_ON(len == 0 && e->users == 0); 1269 BUG_ON(len == 0 && e->users == 0);
1274 spin_unlock(&con->writequeue_lock); 1270 spin_unlock(&con->writequeue_lock);
1275 kmap(e->page);
1276 1271
1277 ret = 0; 1272 ret = 0;
1278 if (len) { 1273 if (len) {
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con)
1294 1289
1295 if (e->len == 0 && e->users == 0) { 1290 if (e->len == 0 && e->users == 0) {
1296 list_del(&e->list); 1291 list_del(&e->list);
1297 kunmap(e->page);
1298 free_entry(e); 1292 free_entry(e);
1299 continue; 1293 continue;
1300 } 1294 }
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06cb..c1775b84ebab 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
42 p = kzalloc(ls->ls_lvblen, GFP_KERNEL); 42 p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
43 return p; 43 return p;
44} 44}
45 45
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
57 57
58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
59 59
60 r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL); 60 r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
61 return r; 61 return r;
62} 62}
63 63
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
75 lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); 75 lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
76 return lkb; 76 return lkb;
77} 77}
78 78
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed7..f3396c622aec 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
112 ordinary messages). */ 112 ordinary messages). */
113 113
114 if (msglen > sizeof(__tmp) && p == &__tmp.p) { 114 if (msglen > sizeof(__tmp) && p == &__tmp.p) {
115 p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 115 p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
116 if (p == NULL) 116 if (p == NULL)
117 return ret; 117 return ret;
118 } 118 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index aa2a5775a027..ccc9d62c462d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
115 data->status = lkb->lkb_status; 115 data->status = lkb->lkb_status;
116 data->grmode = lkb->lkb_grmode; 116 data->grmode = lkb->lkb_grmode;
117 data->rqmode = lkb->lkb_rqmode; 117 data->rqmode = lkb->lkb_rqmode;
118 data->timestamp = lkb->lkb_timestamp;
119 if (lkb->lkb_ua) 118 if (lkb->lkb_ua)
120 data->xid = lkb->lkb_ua->xid; 119 data->xid = lkb->lkb_ua->xid;
121 if (r) { 120 if (r) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194a..065149e84f42 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
175/* we could possibly check if the cancel of an orphan has resulted in the lkb 175/* we could possibly check if the cancel of an orphan has resulted in the lkb
176 being removed and then remove that lkb from the orphans list and free it */ 176 being removed and then remove that lkb from the orphans list and free it */
177 177
178void dlm_user_add_ast(struct dlm_lkb *lkb, int type) 178void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
179{ 179{
180 struct dlm_ls *ls; 180 struct dlm_ls *ls;
181 struct dlm_user_args *ua; 181 struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
208 208
209 ast_type = lkb->lkb_ast_type; 209 ast_type = lkb->lkb_ast_type;
210 lkb->lkb_ast_type |= type; 210 lkb->lkb_ast_type |= type;
211 if (bastmode)
212 lkb->lkb_bastmode = bastmode;
211 213
212 if (!ast_type) { 214 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 215 kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d616..1c9686492286 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581c..48c0571f831d 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
211 211
212struct dqstats dqstats; 212struct dqstats dqstats;
213 213
214static void dqput(struct dquot *dquot);
215
216static inline unsigned int 214static inline unsigned int
217hashfn(const struct super_block *sb, unsigned int id, int type) 215hashfn(const struct super_block *sb, unsigned int id, int type)
218{ 216{
@@ -415,6 +413,17 @@ out_dqlock:
415 return ret; 413 return ret;
416} 414}
417 415
416void dquot_destroy(struct dquot *dquot)
417{
418 kmem_cache_free(dquot_cachep, dquot);
419}
420EXPORT_SYMBOL(dquot_destroy);
421
422static inline void do_destroy_dquot(struct dquot *dquot)
423{
424 dquot->dq_sb->dq_op->destroy_dquot(dquot);
425}
426
418/* Invalidate all dquots on the list. Note that this function is called after 427/* Invalidate all dquots on the list. Note that this function is called after
419 * quota is disabled and pointers from inodes removed so there cannot be new 428 * quota is disabled and pointers from inodes removed so there cannot be new
420 * quota users. There can still be some users of quotas due to inodes being 429 * quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
463 remove_dquot_hash(dquot); 472 remove_dquot_hash(dquot);
464 remove_free_dquot(dquot); 473 remove_free_dquot(dquot);
465 remove_inuse(dquot); 474 remove_inuse(dquot);
466 kmem_cache_free(dquot_cachep, dquot); 475 do_destroy_dquot(dquot);
476 }
477 spin_unlock(&dq_list_lock);
478}
479
480/* Call callback for every active dquot on given filesystem */
481int dquot_scan_active(struct super_block *sb,
482 int (*fn)(struct dquot *dquot, unsigned long priv),
483 unsigned long priv)
484{
485 struct dquot *dquot, *old_dquot = NULL;
486 int ret = 0;
487
488 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
489 spin_lock(&dq_list_lock);
490 list_for_each_entry(dquot, &inuse_list, dq_inuse) {
491 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
492 continue;
493 if (dquot->dq_sb != sb)
494 continue;
495 /* Now we have active dquot so we can just increase use count */
496 atomic_inc(&dquot->dq_count);
497 dqstats.lookups++;
498 spin_unlock(&dq_list_lock);
499 dqput(old_dquot);
500 old_dquot = dquot;
501 ret = fn(dquot, priv);
502 if (ret < 0)
503 goto out;
504 spin_lock(&dq_list_lock);
505 /* We are safe to continue now because our dquot could not
506 * be moved out of the inuse list while we hold the reference */
467 } 507 }
468 spin_unlock(&dq_list_lock); 508 spin_unlock(&dq_list_lock);
509out:
510 dqput(old_dquot);
511 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
512 return ret;
469} 513}
470 514
471int vfs_quota_sync(struct super_block *sb, int type) 515int vfs_quota_sync(struct super_block *sb, int type)
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
479 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 523 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
480 if (type != -1 && cnt != type) 524 if (type != -1 && cnt != type)
481 continue; 525 continue;
482 if (!sb_has_quota_enabled(sb, cnt)) 526 if (!sb_has_quota_active(sb, cnt))
483 continue; 527 continue;
484 spin_lock(&dq_list_lock); 528 spin_lock(&dq_list_lock);
485 dirty = &dqopt->info[cnt].dqi_dirty_list; 529 dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
504 } 548 }
505 549
506 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 550 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
507 if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) 551 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
508 && info_dirty(&dqopt->info[cnt])) 552 && info_dirty(&dqopt->info[cnt]))
509 sb->dq_op->write_info(sb, cnt); 553 sb->dq_op->write_info(sb, cnt);
510 spin_lock(&dq_list_lock); 554 spin_lock(&dq_list_lock);
511 dqstats.syncs++; 555 dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
527 remove_dquot_hash(dquot); 571 remove_dquot_hash(dquot);
528 remove_free_dquot(dquot); 572 remove_free_dquot(dquot);
529 remove_inuse(dquot); 573 remove_inuse(dquot);
530 kmem_cache_free(dquot_cachep, dquot); 574 do_destroy_dquot(dquot);
531 count--; 575 count--;
532 head = free_dquots.prev; 576 head = free_dquots.prev;
533 } 577 }
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
558 * NOTE: If you change this function please check whether dqput_blocks() works right... 602 * NOTE: If you change this function please check whether dqput_blocks() works right...
559 * MUST be called with either dqptr_sem or dqonoff_mutex held 603 * MUST be called with either dqptr_sem or dqonoff_mutex held
560 */ 604 */
561static void dqput(struct dquot *dquot) 605void dqput(struct dquot *dquot)
562{ 606{
563 int ret; 607 int ret;
564 608
@@ -584,7 +628,7 @@ we_slept:
584 /* We have more than one user... nothing to do */ 628 /* We have more than one user... nothing to do */
585 atomic_dec(&dquot->dq_count); 629 atomic_dec(&dquot->dq_count);
586 /* Releasing dquot during quotaoff phase? */ 630 /* Releasing dquot during quotaoff phase? */
587 if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && 631 if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
588 atomic_read(&dquot->dq_count) == 1) 632 atomic_read(&dquot->dq_count) == 1)
589 wake_up(&dquot->dq_wait_unused); 633 wake_up(&dquot->dq_wait_unused);
590 spin_unlock(&dq_list_lock); 634 spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
625 spin_unlock(&dq_list_lock); 669 spin_unlock(&dq_list_lock);
626} 670}
627 671
672struct dquot *dquot_alloc(struct super_block *sb, int type)
673{
674 return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
675}
676EXPORT_SYMBOL(dquot_alloc);
677
628static struct dquot *get_empty_dquot(struct super_block *sb, int type) 678static struct dquot *get_empty_dquot(struct super_block *sb, int type)
629{ 679{
630 struct dquot *dquot; 680 struct dquot *dquot;
631 681
632 dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); 682 dquot = sb->dq_op->alloc_dquot(sb, type);
633 if(!dquot) 683 if(!dquot)
634 return NODQUOT; 684 return NODQUOT;
635 685
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
647} 697}
648 698
649/* 699/*
700 * Check whether dquot is in memory.
701 * MUST be called with either dqptr_sem or dqonoff_mutex held
702 */
703int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
704{
705 unsigned int hashent = hashfn(sb, id, type);
706 int ret = 0;
707
708 if (!sb_has_quota_active(sb, type))
709 return 0;
710 spin_lock(&dq_list_lock);
711 if (find_dquot(hashent, sb, id, type) != NODQUOT)
712 ret = 1;
713 spin_unlock(&dq_list_lock);
714 return ret;
715}
716
717/*
650 * Get reference to dquot 718 * Get reference to dquot
651 * MUST be called with either dqptr_sem or dqonoff_mutex held 719 * MUST be called with either dqptr_sem or dqonoff_mutex held
652 */ 720 */
653static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) 721struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
654{ 722{
655 unsigned int hashent = hashfn(sb, id, type); 723 unsigned int hashent = hashfn(sb, id, type);
656 struct dquot *dquot, *empty = NODQUOT; 724 struct dquot *dquot, *empty = NODQUOT;
657 725
658 if (!sb_has_quota_enabled(sb, type)) 726 if (!sb_has_quota_active(sb, type))
659 return NODQUOT; 727 return NODQUOT;
660we_slept: 728we_slept:
661 spin_lock(&dq_list_lock); 729 spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
682 dqstats.lookups++; 750 dqstats.lookups++;
683 spin_unlock(&dq_list_lock); 751 spin_unlock(&dq_list_lock);
684 if (empty) 752 if (empty)
685 kmem_cache_free(dquot_cachep, empty); 753 do_destroy_dquot(empty);
686 } 754 }
687 /* Wait for dq_lock - after this we know that either dquot_release() is already 755 /* Wait for dq_lock - after this we know that either dquot_release() is already
688 * finished or it will be canceled due to dq_count > 1 test */ 756 * finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
820 } 888 }
821} 889}
822 890
823static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) 891static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
824{ 892{
825 dquot->dq_dqb.dqb_curinodes += number; 893 dquot->dq_dqb.dqb_curinodes += number;
826} 894}
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
830 dquot->dq_dqb.dqb_curspace += number; 898 dquot->dq_dqb.dqb_curspace += number;
831} 899}
832 900
833static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) 901static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
834{ 902{
835 if (dquot->dq_dqb.dqb_curinodes > number) 903 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
904 dquot->dq_dqb.dqb_curinodes >= number)
836 dquot->dq_dqb.dqb_curinodes -= number; 905 dquot->dq_dqb.dqb_curinodes -= number;
837 else 906 else
838 dquot->dq_dqb.dqb_curinodes = 0; 907 dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
843 912
844static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) 913static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
845{ 914{
846 if (dquot->dq_dqb.dqb_curspace > number) 915 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
916 dquot->dq_dqb.dqb_curspace >= number)
847 dquot->dq_dqb.dqb_curspace -= number; 917 dquot->dq_dqb.dqb_curspace -= number;
848 else 918 else
849 dquot->dq_dqb.dqb_curspace = 0; 919 dquot->dq_dqb.dqb_curspace = 0;
850 if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 920 if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
851 dquot->dq_dqb.dqb_btime = (time_t) 0; 921 dquot->dq_dqb.dqb_btime = (time_t) 0;
852 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 922 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
853} 923}
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
1023} 1093}
1024 1094
1025/* needs dq_data_lock */ 1095/* needs dq_data_lock */
1026static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) 1096static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1027{ 1097{
1028 *warntype = QUOTA_NL_NOWARN; 1098 *warntype = QUOTA_NL_NOWARN;
1029 if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1099 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1100 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1030 return QUOTA_OK; 1101 return QUOTA_OK;
1031 1102
1032 if (dquot->dq_dqb.dqb_ihardlimit && 1103 if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
1058static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) 1129static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
1059{ 1130{
1060 *warntype = QUOTA_NL_NOWARN; 1131 *warntype = QUOTA_NL_NOWARN;
1061 if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1132 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1133 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1062 return QUOTA_OK; 1134 return QUOTA_OK;
1063 1135
1064 if (dquot->dq_dqb.dqb_bhardlimit && 1136 if (dquot->dq_dqb.dqb_bhardlimit &&
1065 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit && 1137 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
1066 !ignore_hardlimit(dquot)) { 1138 !ignore_hardlimit(dquot)) {
1067 if (!prealloc) 1139 if (!prealloc)
1068 *warntype = QUOTA_NL_BHARDWARN; 1140 *warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1070 } 1142 }
1071 1143
1072 if (dquot->dq_dqb.dqb_bsoftlimit && 1144 if (dquot->dq_dqb.dqb_bsoftlimit &&
1073 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1145 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1074 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && 1146 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
1075 !ignore_hardlimit(dquot)) { 1147 !ignore_hardlimit(dquot)) {
1076 if (!prealloc) 1148 if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1079 } 1151 }
1080 1152
1081 if (dquot->dq_dqb.dqb_bsoftlimit && 1153 if (dquot->dq_dqb.dqb_bsoftlimit &&
1082 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1154 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1083 dquot->dq_dqb.dqb_btime == 0) { 1155 dquot->dq_dqb.dqb_btime == 0) {
1084 if (!prealloc) { 1156 if (!prealloc) {
1085 *warntype = QUOTA_NL_BSOFTWARN; 1157 *warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1096 return QUOTA_OK; 1168 return QUOTA_OK;
1097} 1169}
1098 1170
1099static int info_idq_free(struct dquot *dquot, ulong inodes) 1171static int info_idq_free(struct dquot *dquot, qsize_t inodes)
1100{ 1172{
1101 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1173 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1102 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) 1174 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
1175 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
1103 return QUOTA_NL_NOWARN; 1176 return QUOTA_NL_NOWARN;
1104 1177
1105 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) 1178 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
1113static int info_bdq_free(struct dquot *dquot, qsize_t space) 1186static int info_bdq_free(struct dquot *dquot, qsize_t space)
1114{ 1187{
1115 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1188 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1116 toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 1189 dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
1117 return QUOTA_NL_NOWARN; 1190 return QUOTA_NL_NOWARN;
1118 1191
1119 if (toqb(dquot->dq_dqb.dqb_curspace - space) <= 1192 if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
1120 dquot->dq_dqb.dqb_bsoftlimit)
1121 return QUOTA_NL_BSOFTBELOW; 1193 return QUOTA_NL_BSOFTBELOW;
1122 if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && 1194 if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
1123 toqb(dquot->dq_dqb.dqb_curspace - space) < 1195 dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
1124 dquot->dq_dqb.dqb_bhardlimit)
1125 return QUOTA_NL_BHARDBELOW; 1196 return QUOTA_NL_BHARDBELOW;
1126 return QUOTA_NL_NOWARN; 1197 return QUOTA_NL_NOWARN;
1127} 1198}
@@ -1166,17 +1237,23 @@ out_err:
1166 * Release all quotas referenced by inode 1237 * Release all quotas referenced by inode
1167 * Transaction must be started at an entry 1238 * Transaction must be started at an entry
1168 */ 1239 */
1169int dquot_drop(struct inode *inode) 1240int dquot_drop_locked(struct inode *inode)
1170{ 1241{
1171 int cnt; 1242 int cnt;
1172 1243
1173 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1174 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1244 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1175 if (inode->i_dquot[cnt] != NODQUOT) { 1245 if (inode->i_dquot[cnt] != NODQUOT) {
1176 dqput(inode->i_dquot[cnt]); 1246 dqput(inode->i_dquot[cnt]);
1177 inode->i_dquot[cnt] = NODQUOT; 1247 inode->i_dquot[cnt] = NODQUOT;
1178 } 1248 }
1179 } 1249 }
1250 return 0;
1251}
1252
1253int dquot_drop(struct inode *inode)
1254{
1255 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1256 dquot_drop_locked(inode);
1180 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1257 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1181 return 0; 1258 return 0;
1182} 1259}
@@ -1264,7 +1341,7 @@ warn_put_all:
1264/* 1341/*
1265 * This operation can block, but only after everything is updated 1342 * This operation can block, but only after everything is updated
1266 */ 1343 */
1267int dquot_alloc_inode(const struct inode *inode, unsigned long number) 1344int dquot_alloc_inode(const struct inode *inode, qsize_t number)
1268{ 1345{
1269 int cnt, ret = NO_QUOTA; 1346 int cnt, ret = NO_QUOTA;
1270 char warntype[MAXQUOTAS]; 1347 char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
1349/* 1426/*
1350 * This operation can block, but only after everything is updated 1427 * This operation can block, but only after everything is updated
1351 */ 1428 */
1352int dquot_free_inode(const struct inode *inode, unsigned long number) 1429int dquot_free_inode(const struct inode *inode, qsize_t number)
1353{ 1430{
1354 unsigned int cnt; 1431 unsigned int cnt;
1355 char warntype[MAXQUOTAS]; 1432 char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
1495/* Wrapper for transferring ownership of an inode */ 1572/* Wrapper for transferring ownership of an inode */
1496int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1573int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
1497{ 1574{
1498 if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { 1575 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1499 vfs_dq_init(inode); 1576 vfs_dq_init(inode);
1500 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1577 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
1501 return 1; 1578 return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
1533 .acquire_dquot = dquot_acquire, 1610 .acquire_dquot = dquot_acquire,
1534 .release_dquot = dquot_release, 1611 .release_dquot = dquot_release,
1535 .mark_dirty = dquot_mark_dquot_dirty, 1612 .mark_dirty = dquot_mark_dquot_dirty,
1536 .write_info = dquot_commit_info 1613 .write_info = dquot_commit_info,
1614 .alloc_dquot = dquot_alloc,
1615 .destroy_dquot = dquot_destroy,
1537}; 1616};
1538 1617
1539static inline void set_enable_flags(struct quota_info *dqopt, int type)
1540{
1541 switch (type) {
1542 case USRQUOTA:
1543 dqopt->flags |= DQUOT_USR_ENABLED;
1544 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1545 break;
1546 case GRPQUOTA:
1547 dqopt->flags |= DQUOT_GRP_ENABLED;
1548 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1549 break;
1550 }
1551}
1552
1553static inline void reset_enable_flags(struct quota_info *dqopt, int type,
1554 int remount)
1555{
1556 switch (type) {
1557 case USRQUOTA:
1558 dqopt->flags &= ~DQUOT_USR_ENABLED;
1559 if (remount)
1560 dqopt->flags |= DQUOT_USR_SUSPENDED;
1561 else
1562 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1563 break;
1564 case GRPQUOTA:
1565 dqopt->flags &= ~DQUOT_GRP_ENABLED;
1566 if (remount)
1567 dqopt->flags |= DQUOT_GRP_SUSPENDED;
1568 else
1569 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1570 break;
1571 }
1572}
1573
1574
1575/* 1618/*
1576 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1619 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1577 */ 1620 */
1578int vfs_quota_off(struct super_block *sb, int type, int remount) 1621int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1579{ 1622{
1580 int cnt, ret = 0; 1623 int cnt, ret = 0;
1581 struct quota_info *dqopt = sb_dqopt(sb); 1624 struct quota_info *dqopt = sb_dqopt(sb);
1582 struct inode *toputinode[MAXQUOTAS]; 1625 struct inode *toputinode[MAXQUOTAS];
1583 1626
1627 /* Cannot turn off usage accounting without turning off limits, or
1628 * suspend quotas and simultaneously turn quotas off. */
1629 if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
1630 || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
1631 DQUOT_USAGE_ENABLED)))
1632 return -EINVAL;
1633
1584 /* We need to serialize quota_off() for device */ 1634 /* We need to serialize quota_off() for device */
1585 mutex_lock(&dqopt->dqonoff_mutex); 1635 mutex_lock(&dqopt->dqonoff_mutex);
1586 1636
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1589 * sometimes we are called when fill_super() failed and calling 1639 * sometimes we are called when fill_super() failed and calling
1590 * sync_fs() in such cases does no good. 1640 * sync_fs() in such cases does no good.
1591 */ 1641 */
1592 if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { 1642 if (!sb_any_quota_loaded(sb)) {
1593 mutex_unlock(&dqopt->dqonoff_mutex); 1643 mutex_unlock(&dqopt->dqonoff_mutex);
1594 return 0; 1644 return 0;
1595 } 1645 }
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1597 toputinode[cnt] = NULL; 1647 toputinode[cnt] = NULL;
1598 if (type != -1 && cnt != type) 1648 if (type != -1 && cnt != type)
1599 continue; 1649 continue;
1600 /* If we keep inodes of quota files after remount and quotaoff 1650 if (!sb_has_quota_loaded(sb, cnt))
1601 * is called, drop kept inodes. */
1602 if (!remount && sb_has_quota_suspended(sb, cnt)) {
1603 iput(dqopt->files[cnt]);
1604 dqopt->files[cnt] = NULL;
1605 reset_enable_flags(dqopt, cnt, 0);
1606 continue; 1651 continue;
1652
1653 if (flags & DQUOT_SUSPENDED) {
1654 dqopt->flags |=
1655 dquot_state_flag(DQUOT_SUSPENDED, cnt);
1656 } else {
1657 dqopt->flags &= ~dquot_state_flag(flags, cnt);
1658 /* Turning off suspended quotas? */
1659 if (!sb_has_quota_loaded(sb, cnt) &&
1660 sb_has_quota_suspended(sb, cnt)) {
1661 dqopt->flags &= ~dquot_state_flag(
1662 DQUOT_SUSPENDED, cnt);
1663 iput(dqopt->files[cnt]);
1664 dqopt->files[cnt] = NULL;
1665 continue;
1666 }
1607 } 1667 }
1608 if (!sb_has_quota_enabled(sb, cnt)) 1668
1669 /* We still have to keep quota loaded? */
1670 if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
1609 continue; 1671 continue;
1610 reset_enable_flags(dqopt, cnt, remount);
1611 1672
1612 /* Note: these are blocking operations */ 1673 /* Note: these are blocking operations */
1613 drop_dquot_ref(sb, cnt); 1674 drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1623 put_quota_format(dqopt->info[cnt].dqi_format); 1684 put_quota_format(dqopt->info[cnt].dqi_format);
1624 1685
1625 toputinode[cnt] = dqopt->files[cnt]; 1686 toputinode[cnt] = dqopt->files[cnt];
1626 if (!remount) 1687 if (!sb_has_quota_loaded(sb, cnt))
1627 dqopt->files[cnt] = NULL; 1688 dqopt->files[cnt] = NULL;
1628 dqopt->info[cnt].dqi_flags = 0; 1689 dqopt->info[cnt].dqi_flags = 0;
1629 dqopt->info[cnt].dqi_igrace = 0; 1690 dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1631 dqopt->ops[cnt] = NULL; 1692 dqopt->ops[cnt] = NULL;
1632 } 1693 }
1633 mutex_unlock(&dqopt->dqonoff_mutex); 1694 mutex_unlock(&dqopt->dqonoff_mutex);
1695
1696 /* Skip syncing and setting flags if quota files are hidden */
1697 if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
1698 goto put_inodes;
1699
1634 /* Sync the superblock so that buffers with quota data are written to 1700 /* Sync the superblock so that buffers with quota data are written to
1635 * disk (and so userspace sees correct data afterwards). */ 1701 * disk (and so userspace sees correct data afterwards). */
1636 if (sb->s_op->sync_fs) 1702 if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1646 mutex_lock(&dqopt->dqonoff_mutex); 1712 mutex_lock(&dqopt->dqonoff_mutex);
1647 /* If quota was reenabled in the meantime, we have 1713 /* If quota was reenabled in the meantime, we have
1648 * nothing to do */ 1714 * nothing to do */
1649 if (!sb_has_quota_enabled(sb, cnt)) { 1715 if (!sb_has_quota_loaded(sb, cnt)) {
1650 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); 1716 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
1651 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | 1717 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
1652 S_NOATIME | S_NOQUOTA); 1718 S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1655 mark_inode_dirty(toputinode[cnt]); 1721 mark_inode_dirty(toputinode[cnt]);
1656 } 1722 }
1657 mutex_unlock(&dqopt->dqonoff_mutex); 1723 mutex_unlock(&dqopt->dqonoff_mutex);
1724 }
1725 if (sb->s_bdev)
1726 invalidate_bdev(sb->s_bdev);
1727put_inodes:
1728 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1729 if (toputinode[cnt]) {
1658 /* On remount RO, we keep the inode pointer so that we 1730 /* On remount RO, we keep the inode pointer so that we
1659 * can reenable quota on the subsequent remount RW. 1731 * can reenable quota on the subsequent remount RW. We
1660 * But we have better not keep inode pointer when there 1732 * have to check 'flags' variable and not use sb_has_
1661 * is pending delete on the quota file... */ 1733 * function because another quotaon / quotaoff could
1662 if (!remount) 1734 * change global state before we got here. We refuse
1735 * to suspend quotas when there is pending delete on
1736 * the quota file... */
1737 if (!(flags & DQUOT_SUSPENDED))
1663 iput(toputinode[cnt]); 1738 iput(toputinode[cnt]);
1664 else if (!toputinode[cnt]->i_nlink) 1739 else if (!toputinode[cnt]->i_nlink)
1665 ret = -EBUSY; 1740 ret = -EBUSY;
1666 } 1741 }
1667 if (sb->s_bdev)
1668 invalidate_bdev(sb->s_bdev);
1669 return ret; 1742 return ret;
1670} 1743}
1671 1744
1745int vfs_quota_off(struct super_block *sb, int type, int remount)
1746{
1747 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
1748 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
1749}
1750
1672/* 1751/*
1673 * Turn quotas on on a device 1752 * Turn quotas on on a device
1674 */ 1753 */
1675 1754
1676/* Helper function when we already have the inode */ 1755/*
1677static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) 1756 * Helper function to turn quotas on when we already have the inode of
1757 * quota file and no quota information is loaded.
1758 */
1759static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1760 unsigned int flags)
1678{ 1761{
1679 struct quota_format_type *fmt = find_quota_format(format_id); 1762 struct quota_format_type *fmt = find_quota_format(format_id);
1680 struct super_block *sb = inode->i_sb; 1763 struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1696 error = -EINVAL; 1779 error = -EINVAL;
1697 goto out_fmt; 1780 goto out_fmt;
1698 } 1781 }
1782 /* Usage always has to be set... */
1783 if (!(flags & DQUOT_USAGE_ENABLED)) {
1784 error = -EINVAL;
1785 goto out_fmt;
1786 }
1699 1787
1700 /* As we bypass the pagecache we must now flush the inode so that 1788 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1701 * we see all the changes from userspace... */ 1789 /* As we bypass the pagecache we must now flush the inode so
1702 write_inode_now(inode, 1); 1790 * that we see all the changes from userspace... */
1703 /* And now flush the block cache so that kernel sees the changes */ 1791 write_inode_now(inode, 1);
1704 invalidate_bdev(sb->s_bdev); 1792 /* And now flush the block cache so that kernel sees the
1793 * changes */
1794 invalidate_bdev(sb->s_bdev);
1795 }
1705 mutex_lock(&inode->i_mutex); 1796 mutex_lock(&inode->i_mutex);
1706 mutex_lock(&dqopt->dqonoff_mutex); 1797 mutex_lock(&dqopt->dqonoff_mutex);
1707 if (sb_has_quota_enabled(sb, type) || 1798 if (sb_has_quota_loaded(sb, type)) {
1708 sb_has_quota_suspended(sb, type)) {
1709 error = -EBUSY; 1799 error = -EBUSY;
1710 goto out_lock; 1800 goto out_lock;
1711 } 1801 }
1712 /* We don't want quota and atime on quota files (deadlocks possible) 1802
1713 * Also nobody should write to the file - we use special IO operations 1803 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1714 * which ignore the immutable bit. */ 1804 /* We don't want quota and atime on quota files (deadlocks
1715 down_write(&dqopt->dqptr_sem); 1805 * possible) Also nobody should write to the file - we use
1716 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); 1806 * special IO operations which ignore the immutable bit. */
1717 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 1807 down_write(&dqopt->dqptr_sem);
1718 up_write(&dqopt->dqptr_sem); 1808 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
1719 sb->dq_op->drop(inode); 1809 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
1810 up_write(&dqopt->dqptr_sem);
1811 sb->dq_op->drop(inode);
1812 }
1720 1813
1721 error = -EIO; 1814 error = -EIO;
1722 dqopt->files[type] = igrab(inode); 1815 dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1737 } 1830 }
1738 mutex_unlock(&dqopt->dqio_mutex); 1831 mutex_unlock(&dqopt->dqio_mutex);
1739 mutex_unlock(&inode->i_mutex); 1832 mutex_unlock(&inode->i_mutex);
1740 set_enable_flags(dqopt, type); 1833 dqopt->flags |= dquot_state_flag(flags, type);
1741 1834
1742 add_dquot_ref(sb, type); 1835 add_dquot_ref(sb, type);
1743 mutex_unlock(&dqopt->dqonoff_mutex); 1836 mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
1770 struct quota_info *dqopt = sb_dqopt(sb); 1863 struct quota_info *dqopt = sb_dqopt(sb);
1771 struct inode *inode; 1864 struct inode *inode;
1772 int ret; 1865 int ret;
1866 unsigned int flags;
1773 1867
1774 mutex_lock(&dqopt->dqonoff_mutex); 1868 mutex_lock(&dqopt->dqonoff_mutex);
1775 if (!sb_has_quota_suspended(sb, type)) { 1869 if (!sb_has_quota_suspended(sb, type)) {
1776 mutex_unlock(&dqopt->dqonoff_mutex); 1870 mutex_unlock(&dqopt->dqonoff_mutex);
1777 return 0; 1871 return 0;
1778 } 1872 }
1779 BUG_ON(sb_has_quota_enabled(sb, type));
1780
1781 inode = dqopt->files[type]; 1873 inode = dqopt->files[type];
1782 dqopt->files[type] = NULL; 1874 dqopt->files[type] = NULL;
1783 reset_enable_flags(dqopt, type, 0); 1875 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
1876 DQUOT_LIMITS_ENABLED, type);
1877 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
1784 mutex_unlock(&dqopt->dqonoff_mutex); 1878 mutex_unlock(&dqopt->dqonoff_mutex);
1785 1879
1786 ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); 1880 flags = dquot_generic_flag(flags, type);
1881 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
1882 flags);
1787 iput(inode); 1883 iput(inode);
1788 1884
1789 return ret; 1885 return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
1799 if (path->mnt->mnt_sb != sb) 1895 if (path->mnt->mnt_sb != sb)
1800 error = -EXDEV; 1896 error = -EXDEV;
1801 else 1897 else
1802 error = vfs_quota_on_inode(path->dentry->d_inode, type, 1898 error = vfs_load_quota_inode(path->dentry->d_inode, type,
1803 format_id); 1899 format_id, DQUOT_USAGE_ENABLED |
1900 DQUOT_LIMITS_ENABLED);
1804 return error; 1901 return error;
1805} 1902}
1806 1903
1807/* Actual function called from quotactl() */
1808int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 1904int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1809 int remount) 1905 int remount)
1810{ 1906{
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1823} 1919}
1824 1920
1825/* 1921/*
1922 * More powerful function for turning on quotas allowing setting
1923 * of individual quota flags
1924 */
1925int vfs_quota_enable(struct inode *inode, int type, int format_id,
1926 unsigned int flags)
1927{
1928 int ret = 0;
1929 struct super_block *sb = inode->i_sb;
1930 struct quota_info *dqopt = sb_dqopt(sb);
1931
1932 /* Just unsuspend quotas? */
1933 if (flags & DQUOT_SUSPENDED)
1934 return vfs_quota_on_remount(sb, type);
1935 if (!flags)
1936 return 0;
1937 /* Just updating flags needed? */
1938 if (sb_has_quota_loaded(sb, type)) {
1939 mutex_lock(&dqopt->dqonoff_mutex);
1940 /* Now do a reliable test... */
1941 if (!sb_has_quota_loaded(sb, type)) {
1942 mutex_unlock(&dqopt->dqonoff_mutex);
1943 goto load_quota;
1944 }
1945 if (flags & DQUOT_USAGE_ENABLED &&
1946 sb_has_quota_usage_enabled(sb, type)) {
1947 ret = -EBUSY;
1948 goto out_lock;
1949 }
1950 if (flags & DQUOT_LIMITS_ENABLED &&
1951 sb_has_quota_limits_enabled(sb, type)) {
1952 ret = -EBUSY;
1953 goto out_lock;
1954 }
1955 sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
1956out_lock:
1957 mutex_unlock(&dqopt->dqonoff_mutex);
1958 return ret;
1959 }
1960
1961load_quota:
1962 return vfs_load_quota_inode(inode, type, format_id, flags);
1963}
1964
1965/*
1826 * This function is used when filesystem needs to initialize quotas 1966 * This function is used when filesystem needs to initialize quotas
1827 * during mount time. 1967 * during mount time.
1828 */ 1968 */
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
1843 1983
1844 error = security_quota_on(dentry); 1984 error = security_quota_on(dentry);
1845 if (!error) 1985 if (!error)
1846 error = vfs_quota_on_inode(dentry->d_inode, type, format_id); 1986 error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
1987 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1847 1988
1848out: 1989out:
1849 dput(dentry); 1990 dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
1866 return ret; 2007 return ret;
1867} 2008}
1868 2009
2010static inline qsize_t qbtos(qsize_t blocks)
2011{
2012 return blocks << QIF_DQBLKSIZE_BITS;
2013}
2014
2015static inline qsize_t stoqb(qsize_t space)
2016{
2017 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
2018}
2019
1869/* Generic routine for getting common part of quota structure */ 2020/* Generic routine for getting common part of quota structure */
1870static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2021static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
1871{ 2022{
1872 struct mem_dqblk *dm = &dquot->dq_dqb; 2023 struct mem_dqblk *dm = &dquot->dq_dqb;
1873 2024
1874 spin_lock(&dq_data_lock); 2025 spin_lock(&dq_data_lock);
1875 di->dqb_bhardlimit = dm->dqb_bhardlimit; 2026 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
1876 di->dqb_bsoftlimit = dm->dqb_bsoftlimit; 2027 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
1877 di->dqb_curspace = dm->dqb_curspace; 2028 di->dqb_curspace = dm->dqb_curspace;
1878 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2029 di->dqb_ihardlimit = dm->dqb_ihardlimit;
1879 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2030 di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
1918 if (di->dqb_valid & QIF_SPACE) { 2069 if (di->dqb_valid & QIF_SPACE) {
1919 dm->dqb_curspace = di->dqb_curspace; 2070 dm->dqb_curspace = di->dqb_curspace;
1920 check_blim = 1; 2071 check_blim = 1;
2072 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
1921 } 2073 }
1922 if (di->dqb_valid & QIF_BLIMITS) { 2074 if (di->dqb_valid & QIF_BLIMITS) {
1923 dm->dqb_bsoftlimit = di->dqb_bsoftlimit; 2075 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
1924 dm->dqb_bhardlimit = di->dqb_bhardlimit; 2076 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
1925 check_blim = 1; 2077 check_blim = 1;
2078 __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
1926 } 2079 }
1927 if (di->dqb_valid & QIF_INODES) { 2080 if (di->dqb_valid & QIF_INODES) {
1928 dm->dqb_curinodes = di->dqb_curinodes; 2081 dm->dqb_curinodes = di->dqb_curinodes;
1929 check_ilim = 1; 2082 check_ilim = 1;
2083 __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
1930 } 2084 }
1931 if (di->dqb_valid & QIF_ILIMITS) { 2085 if (di->dqb_valid & QIF_ILIMITS) {
1932 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2086 dm->dqb_isoftlimit = di->dqb_isoftlimit;
1933 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2087 dm->dqb_ihardlimit = di->dqb_ihardlimit;
1934 check_ilim = 1; 2088 check_ilim = 1;
2089 __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
1935 } 2090 }
1936 if (di->dqb_valid & QIF_BTIME) 2091 if (di->dqb_valid & QIF_BTIME) {
1937 dm->dqb_btime = di->dqb_btime; 2092 dm->dqb_btime = di->dqb_btime;
1938 if (di->dqb_valid & QIF_ITIME) 2093 check_blim = 1;
2094 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2095 }
2096 if (di->dqb_valid & QIF_ITIME) {
1939 dm->dqb_itime = di->dqb_itime; 2097 dm->dqb_itime = di->dqb_itime;
2098 check_ilim = 1;
2099 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2100 }
1940 2101
1941 if (check_blim) { 2102 if (check_blim) {
1942 if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { 2103 if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
1943 dm->dqb_btime = 0; 2104 dm->dqb_btime = 0;
1944 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2105 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
1945 } 2106 }
@@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
1970 int rc; 2131 int rc;
1971 2132
1972 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2133 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
1973 if (!(dquot = dqget(sb, id, type))) { 2134 dquot = dqget(sb, id, type);
1974 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2135 if (!dquot) {
1975 return -ESRCH; 2136 rc = -ESRCH;
2137 goto out;
1976 } 2138 }
1977 rc = do_set_dqblk(dquot, di); 2139 rc = do_set_dqblk(dquot, di);
1978 dqput(dquot); 2140 dqput(dquot);
2141out:
1979 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2142 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1980 return rc; 2143 return rc;
1981} 2144}
@@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
1986 struct mem_dqinfo *mi; 2149 struct mem_dqinfo *mi;
1987 2150
1988 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2151 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
1989 if (!sb_has_quota_enabled(sb, type)) { 2152 if (!sb_has_quota_active(sb, type)) {
1990 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2153 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1991 return -ESRCH; 2154 return -ESRCH;
1992 } 2155 }
@@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2005int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2168int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2006{ 2169{
2007 struct mem_dqinfo *mi; 2170 struct mem_dqinfo *mi;
2171 int err = 0;
2008 2172
2009 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2173 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
2010 if (!sb_has_quota_enabled(sb, type)) { 2174 if (!sb_has_quota_active(sb, type)) {
2011 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2175 err = -ESRCH;
2012 return -ESRCH; 2176 goto out;
2013 } 2177 }
2014 mi = sb_dqopt(sb)->info + type; 2178 mi = sb_dqopt(sb)->info + type;
2015 spin_lock(&dq_data_lock); 2179 spin_lock(&dq_data_lock);
@@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2023 mark_info_dirty(sb, type); 2187 mark_info_dirty(sb, type);
2024 /* Force write to disk */ 2188 /* Force write to disk */
2025 sb->dq_op->write_info(sb, type); 2189 sb->dq_op->write_info(sb, type);
2190out:
2026 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2191 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2027 return 0; 2192 return err;
2028} 2193}
2029 2194
2030struct quotactl_ops vfs_quotactl_ops = { 2195struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format);
2186EXPORT_SYMBOL(unregister_quota_format); 2351EXPORT_SYMBOL(unregister_quota_format);
2187EXPORT_SYMBOL(dqstats); 2352EXPORT_SYMBOL(dqstats);
2188EXPORT_SYMBOL(dq_data_lock); 2353EXPORT_SYMBOL(dq_data_lock);
2354EXPORT_SYMBOL(vfs_quota_enable);
2189EXPORT_SYMBOL(vfs_quota_on); 2355EXPORT_SYMBOL(vfs_quota_on);
2190EXPORT_SYMBOL(vfs_quota_on_path); 2356EXPORT_SYMBOL(vfs_quota_on_path);
2191EXPORT_SYMBOL(vfs_quota_on_mount); 2357EXPORT_SYMBOL(vfs_quota_on_mount);
2358EXPORT_SYMBOL(vfs_quota_disable);
2192EXPORT_SYMBOL(vfs_quota_off); 2359EXPORT_SYMBOL(vfs_quota_off);
2360EXPORT_SYMBOL(dquot_scan_active);
2193EXPORT_SYMBOL(vfs_quota_sync); 2361EXPORT_SYMBOL(vfs_quota_sync);
2194EXPORT_SYMBOL(vfs_get_dqinfo); 2362EXPORT_SYMBOL(vfs_get_dqinfo);
2195EXPORT_SYMBOL(vfs_set_dqinfo); 2363EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release);
2202EXPORT_SYMBOL(dquot_mark_dquot_dirty); 2370EXPORT_SYMBOL(dquot_mark_dquot_dirty);
2203EXPORT_SYMBOL(dquot_initialize); 2371EXPORT_SYMBOL(dquot_initialize);
2204EXPORT_SYMBOL(dquot_drop); 2372EXPORT_SYMBOL(dquot_drop);
2373EXPORT_SYMBOL(dquot_drop_locked);
2205EXPORT_SYMBOL(vfs_dq_drop); 2374EXPORT_SYMBOL(vfs_dq_drop);
2375EXPORT_SYMBOL(dqget);
2376EXPORT_SYMBOL(dqput);
2377EXPORT_SYMBOL(dquot_is_cached);
2206EXPORT_SYMBOL(dquot_alloc_space); 2378EXPORT_SYMBOL(dquot_alloc_space);
2207EXPORT_SYMBOL(dquot_alloc_inode); 2379EXPORT_SYMBOL(dquot_alloc_inode);
2208EXPORT_SYMBOL(dquot_free_space); 2380EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a1..c01e043670e2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
175 * 175 *
176 * Returns zero on success; non-zero on error. 176 * Returns zero on success; non-zero on error.
177 */ 177 */
178static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, 178int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
179 loff_t offset) 179 loff_t offset)
180{ 180{
181 int rc = 0; 181 int rc = 0;
182 char dst[MD5_DIGEST_SIZE]; 182 char dst[MD5_DIGEST_SIZE];
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) 925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; 926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
927 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
928 crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
929 if (mount_crypt_stat->flags
930 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
931 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
932 else if (mount_crypt_stat->flags
933 & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
934 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
935 }
927} 936}
928 937
929static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( 938static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
1060static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { 1069static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
1061 {0x00000001, ECRYPTFS_ENABLE_HMAC}, 1070 {0x00000001, ECRYPTFS_ENABLE_HMAC},
1062 {0x00000002, ECRYPTFS_ENCRYPTED}, 1071 {0x00000002, ECRYPTFS_ENCRYPTED},
1063 {0x00000004, ECRYPTFS_METADATA_IN_XATTR} 1072 {0x00000004, ECRYPTFS_METADATA_IN_XATTR},
1073 {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
1064}; 1074};
1065 1075
1066/** 1076/**
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = {
1149 1159
1150/** 1160/**
1151 * ecryptfs_code_for_cipher_string 1161 * ecryptfs_code_for_cipher_string
1152 * @crypt_stat: The cryptographic context 1162 * @cipher_name: The string alias for the cipher
1163 * @key_bytes: Length of key in bytes; used for AES code selection
1153 * 1164 *
1154 * Returns zero on no match, or the cipher code on match 1165 * Returns zero on no match, or the cipher code on match
1155 */ 1166 */
1156u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) 1167u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
1157{ 1168{
1158 int i; 1169 int i;
1159 u8 code = 0; 1170 u8 code = 0;
1160 struct ecryptfs_cipher_code_str_map_elem *map = 1171 struct ecryptfs_cipher_code_str_map_elem *map =
1161 ecryptfs_cipher_code_str_map; 1172 ecryptfs_cipher_code_str_map;
1162 1173
1163 if (strcmp(crypt_stat->cipher, "aes") == 0) { 1174 if (strcmp(cipher_name, "aes") == 0) {
1164 switch (crypt_stat->key_size) { 1175 switch (key_bytes) {
1165 case 16: 1176 case 16:
1166 code = RFC2440_CIPHER_AES_128; 1177 code = RFC2440_CIPHER_AES_128;
1167 break; 1178 break;
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
1173 } 1184 }
1174 } else { 1185 } else {
1175 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) 1186 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1176 if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ 1187 if (strcmp(cipher_name, map[i].cipher_str) == 0) {
1177 code = map[i].cipher_code; 1188 code = map[i].cipher_code;
1178 break; 1189 break;
1179 } 1190 }
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
1212 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 1223 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
1213 int rc; 1224 int rc;
1214 1225
1226 if (crypt_stat->extent_size == 0)
1227 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
1215 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1228 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
1216 ecryptfs_inode); 1229 ecryptfs_inode);
1217 if (rc) { 1230 if (rc) {
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
1221 } 1234 }
1222 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { 1235 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1223 rc = -EINVAL; 1236 rc = -EINVAL;
1224 ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
1225 } 1237 }
1226out: 1238out:
1227 return rc; 1239 return rc;
@@ -1628,95 +1640,95 @@ out:
1628} 1640}
1629 1641
1630/** 1642/**
1631 * ecryptfs_encode_filename - converts a plaintext file name to cipher text 1643 * ecryptfs_encrypt_filename - encrypt filename
1632 * @crypt_stat: The crypt_stat struct associated with the file anem to encode
1633 * @name: The plaintext name
1634 * @length: The length of the plaintext
1635 * @encoded_name: The encypted name
1636 * 1644 *
1637 * Encrypts and encodes a filename into something that constitutes a 1645 * CBC-encrypts the filename. We do not want to encrypt the same
1638 * valid filename for a filesystem, with printable characters. 1646 * filename with the same key and IV, which may happen with hard
1647 * links, so we prepend random bits to each filename.
1639 * 1648 *
1640 * We assume that we have a properly initialized crypto context, 1649 * Returns zero on success; non-zero otherwise
1641 * pointed to by crypt_stat->tfm.
1642 *
1643 * TODO: Implement filename decoding and decryption here, in place of
1644 * memcpy. We are keeping the framework around for now to (1)
1645 * facilitate testing of the components needed to implement filename
1646 * encryption and (2) to provide a code base from which other
1647 * developers in the community can easily implement this feature.
1648 *
1649 * Returns the length of encoded filename; negative if error
1650 */ 1650 */
1651int 1651static int
1652ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 1652ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1653 const char *name, int length, char **encoded_name) 1653 struct ecryptfs_crypt_stat *crypt_stat,
1654 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
1654{ 1655{
1655 int error = 0; 1656 int rc = 0;
1656 1657
1657 (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); 1658 filename->encrypted_filename = NULL;
1658 if (!(*encoded_name)) { 1659 filename->encrypted_filename_size = 0;
1659 error = -ENOMEM; 1660 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
1661 || (mount_crypt_stat && (mount_crypt_stat->flags
1662 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
1663 size_t packet_size;
1664 size_t remaining_bytes;
1665
1666 rc = ecryptfs_write_tag_70_packet(
1667 NULL, NULL,
1668 &filename->encrypted_filename_size,
1669 mount_crypt_stat, NULL,
1670 filename->filename_size);
1671 if (rc) {
1672 printk(KERN_ERR "%s: Error attempting to get packet "
1673 "size for tag 72; rc = [%d]\n", __func__,
1674 rc);
1675 filename->encrypted_filename_size = 0;
1676 goto out;
1677 }
1678 filename->encrypted_filename =
1679 kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
1680 if (!filename->encrypted_filename) {
1681 printk(KERN_ERR "%s: Out of memory whilst attempting "
1682 "to kmalloc [%zd] bytes\n", __func__,
1683 filename->encrypted_filename_size);
1684 rc = -ENOMEM;
1685 goto out;
1686 }
1687 remaining_bytes = filename->encrypted_filename_size;
1688 rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
1689 &remaining_bytes,
1690 &packet_size,
1691 mount_crypt_stat,
1692 filename->filename,
1693 filename->filename_size);
1694 if (rc) {
1695 printk(KERN_ERR "%s: Error attempting to generate "
1696 "tag 70 packet; rc = [%d]\n", __func__,
1697 rc);
1698 kfree(filename->encrypted_filename);
1699 filename->encrypted_filename = NULL;
1700 filename->encrypted_filename_size = 0;
1701 goto out;
1702 }
1703 filename->encrypted_filename_size = packet_size;
1704 } else {
1705 printk(KERN_ERR "%s: No support for requested filename "
1706 "encryption method in this release\n", __func__);
1707 rc = -ENOTSUPP;
1660 goto out; 1708 goto out;
1661 } 1709 }
1662 /* TODO: Filename encryption is a scheduled feature for a
1663 * future version of eCryptfs. This function is here only for
1664 * the purpose of providing a framework for other developers
1665 * to easily implement filename encryption. Hint: Replace this
1666 * memcpy() with a call to encrypt and encode the
1667 * filename, the set the length accordingly. */
1668 memcpy((void *)(*encoded_name), (void *)name, length);
1669 (*encoded_name)[length] = '\0';
1670 error = length + 1;
1671out: 1710out:
1672 return error; 1711 return rc;
1673} 1712}
1674 1713
1675/** 1714static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1676 * ecryptfs_decode_filename - converts the cipher text name to plaintext 1715 const char *name, size_t name_size)
1677 * @crypt_stat: The crypt_stat struct associated with the file
1678 * @name: The filename in cipher text
1679 * @length: The length of the cipher text name
1680 * @decrypted_name: The plaintext name
1681 *
1682 * Decodes and decrypts the filename.
1683 *
1684 * We assume that we have a properly initialized crypto context,
1685 * pointed to by crypt_stat->tfm.
1686 *
1687 * TODO: Implement filename decoding and decryption here, in place of
1688 * memcpy. We are keeping the framework around for now to (1)
1689 * facilitate testing of the components needed to implement filename
1690 * encryption and (2) to provide a code base from which other
1691 * developers in the community can easily implement this feature.
1692 *
1693 * Returns the length of decoded filename; negative if error
1694 */
1695int
1696ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1697 const char *name, int length, char **decrypted_name)
1698{ 1716{
1699 int error = 0; 1717 int rc = 0;
1700 1718
1701 (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); 1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
1702 if (!(*decrypted_name)) { 1720 if (!(*copied_name)) {
1703 error = -ENOMEM; 1721 rc = -ENOMEM;
1704 goto out; 1722 goto out;
1705 } 1723 }
1706 /* TODO: Filename encryption is a scheduled feature for a 1724 memcpy((void *)(*copied_name), (void *)name, name_size);
1707 * future version of eCryptfs. This function is here only for 1725 (*copied_name)[(name_size)] = '\0'; /* Only for convenience
1708 * the purpose of providing a framework for other developers
1709 * to easily implement filename encryption. Hint: Replace this
1710 * memcpy() with a call to decode and decrypt the
1711 * filename, the set the length accordingly. */
1712 memcpy((void *)(*decrypted_name), (void *)name, length);
1713 (*decrypted_name)[length + 1] = '\0'; /* Only for convenience
1714 * in printing out the 1726 * in printing out the
1715 * string in debug 1727 * string in debug
1716 * messages */ 1728 * messages */
1717 error = length; 1729 (*copied_name_size) = (name_size + 1);
1718out: 1730out:
1719 return error; 1731 return rc;
1720} 1732}
1721 1733
1722/** 1734/**
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1740 *key_tfm = NULL; 1752 *key_tfm = NULL;
1741 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { 1753 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
1742 rc = -EINVAL; 1754 rc = -EINVAL;
1743 printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " 1755 printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
1744 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); 1756 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
1745 goto out; 1757 goto out;
1746 } 1758 }
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1765 get_random_bytes(dummy_key, *key_size); 1777 get_random_bytes(dummy_key, *key_size);
1766 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1778 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
1767 if (rc) { 1779 if (rc) {
1768 printk(KERN_ERR "Error attempting to set key of size [%Zd] for " 1780 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1769 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); 1781 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
1770 rc = -EINVAL; 1782 rc = -EINVAL;
1771 goto out; 1783 goto out;
@@ -1910,3 +1922,341 @@ out:
1910 mutex_unlock(&key_tfm_list_mutex); 1922 mutex_unlock(&key_tfm_list_mutex);
1911 return rc; 1923 return rc;
1912} 1924}
1925
1926/* 64 characters forming a 6-bit target field */
1927static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
1928 "EFGHIJKLMNOPQRST"
1929 "UVWXYZabcdefghij"
1930 "klmnopqrstuvwxyz");
1931
1932/* We could either offset on every reverse map or just pad some 0x00's
1933 * at the front here */
1934static const unsigned char filename_rev_map[] = {
1935 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
1936 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
1937 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
1938 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
1939 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
1940 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
1941 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
1942 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
1943 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
1944 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
1945 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
1946 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
1947 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
1948 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
1949 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
1950 0x3D, 0x3E, 0x3F
1951};
1952
1953/**
1954 * ecryptfs_encode_for_filename
1955 * @dst: Destination location for encoded filename
1956 * @dst_size: Size of the encoded filename in bytes
1957 * @src: Source location for the filename to encode
1958 * @src_size: Size of the source in bytes
1959 */
1960void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
1961 unsigned char *src, size_t src_size)
1962{
1963 size_t num_blocks;
1964 size_t block_num = 0;
1965 size_t dst_offset = 0;
1966 unsigned char last_block[3];
1967
1968 if (src_size == 0) {
1969 (*dst_size) = 0;
1970 goto out;
1971 }
1972 num_blocks = (src_size / 3);
1973 if ((src_size % 3) == 0) {
1974 memcpy(last_block, (&src[src_size - 3]), 3);
1975 } else {
1976 num_blocks++;
1977 last_block[2] = 0x00;
1978 switch (src_size % 3) {
1979 case 1:
1980 last_block[0] = src[src_size - 1];
1981 last_block[1] = 0x00;
1982 break;
1983 case 2:
1984 last_block[0] = src[src_size - 2];
1985 last_block[1] = src[src_size - 1];
1986 }
1987 }
1988 (*dst_size) = (num_blocks * 4);
1989 if (!dst)
1990 goto out;
1991 while (block_num < num_blocks) {
1992 unsigned char *src_block;
1993 unsigned char dst_block[4];
1994
1995 if (block_num == (num_blocks - 1))
1996 src_block = last_block;
1997 else
1998 src_block = &src[block_num * 3];
1999 dst_block[0] = ((src_block[0] >> 2) & 0x3F);
2000 dst_block[1] = (((src_block[0] << 4) & 0x30)
2001 | ((src_block[1] >> 4) & 0x0F));
2002 dst_block[2] = (((src_block[1] << 2) & 0x3C)
2003 | ((src_block[2] >> 6) & 0x03));
2004 dst_block[3] = (src_block[2] & 0x3F);
2005 dst[dst_offset++] = portable_filename_chars[dst_block[0]];
2006 dst[dst_offset++] = portable_filename_chars[dst_block[1]];
2007 dst[dst_offset++] = portable_filename_chars[dst_block[2]];
2008 dst[dst_offset++] = portable_filename_chars[dst_block[3]];
2009 block_num++;
2010 }
2011out:
2012 return;
2013}
2014
2015/**
2016 * ecryptfs_decode_from_filename
2017 * @dst: If NULL, this function only sets @dst_size and returns. If
2018 * non-NULL, this function decodes the encoded octets in @src
2019 * into the memory that @dst points to.
2020 * @dst_size: Set to the size of the decoded string.
2021 * @src: The encoded set of octets to decode.
2022 * @src_size: The size of the encoded set of octets to decode.
2023 */
2024static void
2025ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
2026 const unsigned char *src, size_t src_size)
2027{
2028 u8 current_bit_offset = 0;
2029 size_t src_byte_offset = 0;
2030 size_t dst_byte_offset = 0;
2031
2032 if (dst == NULL) {
2033 /* Not exact; conservatively long. Every block of 4
2034 * encoded characters decodes into a block of 3
2035 * decoded characters. This segment of code provides
2036 * the caller with the maximum amount of allocated
2037 * space that @dst will need to point to in a
2038 * subsequent call. */
2039 (*dst_size) = (((src_size + 1) * 3) / 4);
2040 goto out;
2041 }
2042 while (src_byte_offset < src_size) {
2043 unsigned char src_byte =
2044 filename_rev_map[(int)src[src_byte_offset]];
2045
2046 switch (current_bit_offset) {
2047 case 0:
2048 dst[dst_byte_offset] = (src_byte << 2);
2049 current_bit_offset = 6;
2050 break;
2051 case 6:
2052 dst[dst_byte_offset++] |= (src_byte >> 4);
2053 dst[dst_byte_offset] = ((src_byte & 0xF)
2054 << 4);
2055 current_bit_offset = 4;
2056 break;
2057 case 4:
2058 dst[dst_byte_offset++] |= (src_byte >> 2);
2059 dst[dst_byte_offset] = (src_byte << 6);
2060 current_bit_offset = 2;
2061 break;
2062 case 2:
2063 dst[dst_byte_offset++] |= (src_byte);
2064 dst[dst_byte_offset] = 0;
2065 current_bit_offset = 0;
2066 break;
2067 }
2068 src_byte_offset++;
2069 }
2070 (*dst_size) = dst_byte_offset;
2071out:
2072 return;
2073}
2074
2075/**
2076 * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
2077 * @crypt_stat: The crypt_stat struct associated with the file anem to encode
2078 * @name: The plaintext name
2079 * @length: The length of the plaintext
2080 * @encoded_name: The encypted name
2081 *
2082 * Encrypts and encodes a filename into something that constitutes a
2083 * valid filename for a filesystem, with printable characters.
2084 *
2085 * We assume that we have a properly initialized crypto context,
2086 * pointed to by crypt_stat->tfm.
2087 *
2088 * Returns zero on success; non-zero on otherwise
2089 */
2090int ecryptfs_encrypt_and_encode_filename(
2091 char **encoded_name,
2092 size_t *encoded_name_size,
2093 struct ecryptfs_crypt_stat *crypt_stat,
2094 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2095 const char *name, size_t name_size)
2096{
2097 size_t encoded_name_no_prefix_size;
2098 int rc = 0;
2099
2100 (*encoded_name) = NULL;
2101 (*encoded_name_size) = 0;
2102 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
2103 || (mount_crypt_stat && (mount_crypt_stat->flags
2104 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
2105 struct ecryptfs_filename *filename;
2106
2107 filename = kzalloc(sizeof(*filename), GFP_KERNEL);
2108 if (!filename) {
2109 printk(KERN_ERR "%s: Out of memory whilst attempting "
2110 "to kzalloc [%zd] bytes\n", __func__,
2111 sizeof(*filename));
2112 rc = -ENOMEM;
2113 goto out;
2114 }
2115 filename->filename = (char *)name;
2116 filename->filename_size = name_size;
2117 rc = ecryptfs_encrypt_filename(filename, crypt_stat,
2118 mount_crypt_stat);
2119 if (rc) {
2120 printk(KERN_ERR "%s: Error attempting to encrypt "
2121 "filename; rc = [%d]\n", __func__, rc);
2122 kfree(filename);
2123 goto out;
2124 }
2125 ecryptfs_encode_for_filename(
2126 NULL, &encoded_name_no_prefix_size,
2127 filename->encrypted_filename,
2128 filename->encrypted_filename_size);
2129 if ((crypt_stat && (crypt_stat->flags
2130 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2131 || (mount_crypt_stat
2132 && (mount_crypt_stat->flags
2133 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
2134 (*encoded_name_size) =
2135 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2136 + encoded_name_no_prefix_size);
2137 else
2138 (*encoded_name_size) =
2139 (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2140 + encoded_name_no_prefix_size);
2141 (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
2142 if (!(*encoded_name)) {
2143 printk(KERN_ERR "%s: Out of memory whilst attempting "
2144 "to kzalloc [%zd] bytes\n", __func__,
2145 (*encoded_name_size));
2146 rc = -ENOMEM;
2147 kfree(filename->encrypted_filename);
2148 kfree(filename);
2149 goto out;
2150 }
2151 if ((crypt_stat && (crypt_stat->flags
2152 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2153 || (mount_crypt_stat
2154 && (mount_crypt_stat->flags
2155 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
2156 memcpy((*encoded_name),
2157 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2158 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
2159 ecryptfs_encode_for_filename(
2160 ((*encoded_name)
2161 + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
2162 &encoded_name_no_prefix_size,
2163 filename->encrypted_filename,
2164 filename->encrypted_filename_size);
2165 (*encoded_name_size) =
2166 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2167 + encoded_name_no_prefix_size);
2168 (*encoded_name)[(*encoded_name_size)] = '\0';
2169 (*encoded_name_size)++;
2170 } else {
2171 rc = -ENOTSUPP;
2172 }
2173 if (rc) {
2174 printk(KERN_ERR "%s: Error attempting to encode "
2175 "encrypted filename; rc = [%d]\n", __func__,
2176 rc);
2177 kfree((*encoded_name));
2178 (*encoded_name) = NULL;
2179 (*encoded_name_size) = 0;
2180 }
2181 kfree(filename->encrypted_filename);
2182 kfree(filename);
2183 } else {
2184 rc = ecryptfs_copy_filename(encoded_name,
2185 encoded_name_size,
2186 name, name_size);
2187 }
2188out:
2189 return rc;
2190}
2191
2192/**
2193 * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
2194 * @plaintext_name: The plaintext name
2195 * @plaintext_name_size: The plaintext name size
2196 * @ecryptfs_dir_dentry: eCryptfs directory dentry
2197 * @name: The filename in cipher text
2198 * @name_size: The cipher text name size
2199 *
2200 * Decrypts and decodes the filename.
2201 *
2202 * Returns zero on error; non-zero otherwise
2203 */
2204int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2205 size_t *plaintext_name_size,
2206 struct dentry *ecryptfs_dir_dentry,
2207 const char *name, size_t name_size)
2208{
2209 char *decoded_name;
2210 size_t decoded_name_size;
2211 size_t packet_size;
2212 int rc = 0;
2213
2214 if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
2215 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2216 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
2217 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2218 &ecryptfs_superblock_to_private(
2219 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2220 const char *orig_name = name;
2221 size_t orig_name_size = name_size;
2222
2223 name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2224 name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2225 ecryptfs_decode_from_filename(NULL, &decoded_name_size,
2226 name, name_size);
2227 decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
2228 if (!decoded_name) {
2229 printk(KERN_ERR "%s: Out of memory whilst attempting "
2230 "to kmalloc [%zd] bytes\n", __func__,
2231 decoded_name_size);
2232 rc = -ENOMEM;
2233 goto out;
2234 }
2235 ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
2236 name, name_size);
2237 rc = ecryptfs_parse_tag_70_packet(plaintext_name,
2238 plaintext_name_size,
2239 &packet_size,
2240 mount_crypt_stat,
2241 decoded_name,
2242 decoded_name_size);
2243 if (rc) {
2244 printk(KERN_INFO "%s: Could not parse tag 70 packet "
2245 "from filename; copying through filename "
2246 "as-is\n", __func__);
2247 rc = ecryptfs_copy_filename(plaintext_name,
2248 plaintext_name_size,
2249 orig_name, orig_name_size);
2250 goto out_free;
2251 }
2252 } else {
2253 rc = ecryptfs_copy_filename(plaintext_name,
2254 plaintext_name_size,
2255 name, name_size);
2256 goto out;
2257 }
2258out_free:
2259 kfree(decoded_name);
2260out:
2261 return rc;
2262}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d16..c11fc95714ab 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
51#define ECRYPTFS_VERSIONING_XATTR 0x00000010 51#define ECRYPTFS_VERSIONING_XATTR 0x00000010
52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020
53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040
54#define ECRYPTFS_VERSIONING_HMAC 0x00000080
55#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100
56#define ECRYPTFS_VERSIONING_GCM 0x00000200
54#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ 57#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
55 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ 58 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
56 | ECRYPTFS_VERSIONING_PUBKEY \ 59 | ECRYPTFS_VERSIONING_PUBKEY \
57 | ECRYPTFS_VERSIONING_XATTR \ 60 | ECRYPTFS_VERSIONING_XATTR \
58 | ECRYPTFS_VERSIONING_MULTKEY \ 61 | ECRYPTFS_VERSIONING_MULTKEY \
59 | ECRYPTFS_VERSIONING_DEVMISC) 62 | ECRYPTFS_VERSIONING_DEVMISC \
63 | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
60#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 64#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
61#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH 65#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
62#define ECRYPTFS_SALT_SIZE 8 66#define ECRYPTFS_SALT_SIZE 8
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key)
199#define ECRYPTFS_DEFAULT_CIPHER "aes" 203#define ECRYPTFS_DEFAULT_CIPHER "aes"
200#define ECRYPTFS_DEFAULT_KEY_BYTES 16 204#define ECRYPTFS_DEFAULT_KEY_BYTES 16
201#define ECRYPTFS_DEFAULT_HASH "md5" 205#define ECRYPTFS_DEFAULT_HASH "md5"
206#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
202#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 207#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
203#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C 208#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
204#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED 209#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key)
206#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 211#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
207#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 212#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
208#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 213#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
214#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
215 * as dentry name */
216#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
217 * metadata */
218#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
219 * dentry name */
220#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
221 * metadata */
222/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
223 * ECRYPTFS_MAX_IV_BYTES */
224#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
225#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
209#define MD5_DIGEST_SIZE 16 226#define MD5_DIGEST_SIZE 16
227#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
228#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
229#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
230#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
231#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
232#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
210 233
211struct ecryptfs_key_sig { 234struct ecryptfs_key_sig {
212 struct list_head crypt_stat_list; 235 struct list_head crypt_stat_list;
213 char keysig[ECRYPTFS_SIG_SIZE_HEX]; 236 char keysig[ECRYPTFS_SIG_SIZE_HEX];
214}; 237};
215 238
239struct ecryptfs_filename {
240 struct list_head crypt_stat_list;
241#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
242 u32 flags;
243 u32 seq_no;
244 char *filename;
245 char *encrypted_filename;
246 size_t filename_size;
247 size_t encrypted_filename_size;
248 char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
249 char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
250};
251
216/** 252/**
217 * This is the primary struct associated with each encrypted file. 253 * This is the primary struct associated with each encrypted file.
218 * 254 *
219 * TODO: cache align/pack? 255 * TODO: cache align/pack?
220 */ 256 */
221struct ecryptfs_crypt_stat { 257struct ecryptfs_crypt_stat {
222#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
223#define ECRYPTFS_POLICY_APPLIED 0x00000002 259#define ECRYPTFS_POLICY_APPLIED 0x00000002
224#define ECRYPTFS_NEW_FILE 0x00000004 260#define ECRYPTFS_NEW_FILE 0x00000004
225#define ECRYPTFS_ENCRYPTED 0x00000008 261#define ECRYPTFS_ENCRYPTED 0x00000008
226#define ECRYPTFS_SECURITY_WARNING 0x00000010 262#define ECRYPTFS_SECURITY_WARNING 0x00000010
227#define ECRYPTFS_ENABLE_HMAC 0x00000020 263#define ECRYPTFS_ENABLE_HMAC 0x00000020
228#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 264#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
229#define ECRYPTFS_KEY_VALID 0x00000080 265#define ECRYPTFS_KEY_VALID 0x00000080
230#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 266#define ECRYPTFS_METADATA_IN_XATTR 0x00000100
231#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 267#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200
232#define ECRYPTFS_KEY_SET 0x00000400 268#define ECRYPTFS_KEY_SET 0x00000400
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000
233 u32 flags; 272 u32 flags;
234 unsigned int file_version; 273 unsigned int file_version;
235 size_t iv_bytes; 274 size_t iv_bytes;
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat {
332#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 371#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002
333#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 372#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004
334#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 373#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008
374#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
375#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
376#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
335 u32 flags; 377 u32 flags;
336 struct list_head global_auth_tok_list; 378 struct list_head global_auth_tok_list;
337 struct mutex global_auth_tok_list_mutex; 379 struct mutex global_auth_tok_list_mutex;
338 size_t num_global_auth_toks; 380 size_t num_global_auth_toks;
339 size_t global_default_cipher_key_size; 381 size_t global_default_cipher_key_size;
382 size_t global_default_fn_cipher_key_bytes;
340 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE 383 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
341 + 1]; 384 + 1];
385 unsigned char global_default_fn_cipher_name[
386 ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
387 char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
342}; 388};
343 389
344/* superblock private data. */ 390/* superblock private data. */
@@ -571,13 +617,22 @@ struct ecryptfs_open_req {
571int ecryptfs_interpose(struct dentry *hidden_dentry, 617int ecryptfs_interpose(struct dentry *hidden_dentry,
572 struct dentry *this_dentry, struct super_block *sb, 618 struct dentry *this_dentry, struct super_block *sb,
573 u32 flags); 619 u32 flags);
620int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
621 struct dentry *lower_dentry,
622 struct ecryptfs_crypt_stat *crypt_stat,
623 struct inode *ecryptfs_dir_inode,
624 struct nameidata *ecryptfs_nd);
625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
626 size_t *decrypted_name_size,
627 struct dentry *ecryptfs_dentry,
628 const char *name, size_t name_size);
574int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 629int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
575int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, 630int ecryptfs_encrypt_and_encode_filename(
576 const char *name, int length, 631 char **encoded_name,
577 char **decrypted_name); 632 size_t *encoded_name_size,
578int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 633 struct ecryptfs_crypt_stat *crypt_stat,
579 const char *name, int length, 634 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
580 char **encoded_name); 635 const char *name, size_t name_size);
581struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); 636struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
582void ecryptfs_dump_hex(char *data, int bytes); 637void ecryptfs_dump_hex(char *data, int bytes);
583int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, 638int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
599 struct inode *ecryptfs_inode); 654 struct inode *ecryptfs_inode);
600int ecryptfs_read_and_validate_xattr_region(char *page_virt, 655int ecryptfs_read_and_validate_xattr_region(char *page_virt,
601 struct dentry *ecryptfs_dentry); 656 struct dentry *ecryptfs_dentry);
602u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); 657u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
603int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); 658int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
604void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); 659void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
605int ecryptfs_generate_key_packet_set(char *dest_base, 660int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file,
694 struct vfsmount *lower_mnt, 749 struct vfsmount *lower_mnt,
695 const struct cred *cred); 750 const struct cred *cred);
696int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); 751int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
752int
753ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
754 size_t *packet_size,
755 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
756 char *filename, size_t filename_size);
757int
758ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
759 size_t *packet_size,
760 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
761 char *data, size_t max_packet_size);
762int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
763 loff_t offset);
697 764
698#endif /* #ifndef ECRYPTFS_KERNEL_H */ 765#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac06..9e944057001b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
77 77
78/* Inspired by generic filldir in fs/readdir.c */ 78/* Inspired by generic filldir in fs/readdir.c */
79static int 79static int
80ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, 80ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
81 u64 ino, unsigned int d_type) 81 loff_t offset, u64 ino, unsigned int d_type)
82{ 82{
83 struct ecryptfs_crypt_stat *crypt_stat;
84 struct ecryptfs_getdents_callback *buf = 83 struct ecryptfs_getdents_callback *buf =
85 (struct ecryptfs_getdents_callback *)dirent; 84 (struct ecryptfs_getdents_callback *)dirent;
85 size_t name_size;
86 char *name;
86 int rc; 87 int rc;
87 int decoded_length;
88 char *decoded_name;
89 88
90 crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
91 buf->filldir_called++; 89 buf->filldir_called++;
92 decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, 90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
93 &decoded_name); 91 buf->dentry, lower_name,
94 if (decoded_length < 0) { 92 lower_namelen);
95 rc = decoded_length; 93 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
95 "filename [%s]; rc = [%d]\n", __func__, lower_name,
96 rc);
96 goto out; 97 goto out;
97 } 98 }
98 rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, 99 rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
99 ino, d_type); 100 kfree(name);
100 kfree(decoded_name);
101 if (rc >= 0) 101 if (rc >= 0)
102 buf->entries_written++; 102 buf->entries_written++;
103out: 103out:
@@ -106,8 +106,8 @@ out:
106 106
107/** 107/**
108 * ecryptfs_readdir 108 * ecryptfs_readdir
109 * @file: The ecryptfs file struct 109 * @file: The eCryptfs directory file
110 * @dirent: Directory entry 110 * @dirent: Directory entry handle
111 * @filldir: The filldir callback function 111 * @filldir: The filldir callback function
112 */ 112 */
113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) 113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
275static int 275static int
276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
277{ 277{
278 struct file *lower_file = ecryptfs_file_to_lower(file); 278 return vfs_fsync(ecryptfs_file_to_lower(file),
279 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 279 ecryptfs_dentry_to_lower(dentry),
280 struct inode *lower_inode = lower_dentry->d_inode; 280 datasync);
281 int rc = -EINVAL;
282
283 if (lower_inode->i_fop->fsync) {
284 mutex_lock(&lower_inode->i_mutex);
285 rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
286 datasync);
287 mutex_unlock(&lower_inode->i_mutex);
288 }
289 return rc;
290} 281}
291 282
292static int ecryptfs_fasync(int fd, struct file *file, int flag) 283static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5e78fc179886..5697899a168d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
52/** 52/**
53 * ecryptfs_create_underlying_file 53 * ecryptfs_create_underlying_file
54 * @lower_dir_inode: inode of the parent in the lower fs of the new file 54 * @lower_dir_inode: inode of the parent in the lower fs of the new file
55 * @lower_dentry: New file's dentry in the lower fs 55 * @dentry: New file's dentry
56 * @ecryptfs_dentry: New file's dentry in ecryptfs
57 * @mode: The mode of the new file 56 * @mode: The mode of the new file
58 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount 57 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
59 * 58 *
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
228{ 227{
229 int rc; 228 int rc;
230 229
231 /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens 230 /* ecryptfs_do_create() calls ecryptfs_interpose() */
232 * the crypt_stat->lower_file (persistent file) */
233 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); 231 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
234 if (unlikely(rc)) { 232 if (unlikely(rc)) {
235 ecryptfs_printk(KERN_WARNING, "Failed to create file in" 233 ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +242,91 @@ out:
244} 242}
245 243
246/** 244/**
247 * ecryptfs_lookup 245 * ecryptfs_lookup_and_interpose_lower - Perform a lookup
248 * @dir: inode
249 * @dentry: The dentry
250 * @nd: nameidata, may be NULL
251 *
252 * Find a file on disk. If the file does not exist, then we'll add it to the
253 * dentry cache and continue on to read it from the disk.
254 */ 246 */
255static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, 247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
256 struct nameidata *nd) 248 struct dentry *lower_dentry,
249 struct ecryptfs_crypt_stat *crypt_stat,
250 struct inode *ecryptfs_dir_inode,
251 struct nameidata *ecryptfs_nd)
257{ 252{
258 int rc = 0;
259 struct dentry *lower_dir_dentry; 253 struct dentry *lower_dir_dentry;
260 struct dentry *lower_dentry;
261 struct vfsmount *lower_mnt; 254 struct vfsmount *lower_mnt;
262 char *encoded_name; 255 struct inode *lower_inode;
263 int encoded_namelen;
264 struct ecryptfs_crypt_stat *crypt_stat = NULL;
265 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 256 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
266 char *page_virt = NULL; 257 char *page_virt = NULL;
267 struct inode *lower_inode;
268 u64 file_size; 258 u64 file_size;
259 int rc = 0;
269 260
270 lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); 261 lower_dir_dentry = lower_dentry->d_parent;
271 dentry->d_op = &ecryptfs_dops; 262 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
272 if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) 263 ecryptfs_dentry->d_parent));
273 || (dentry->d_name.len == 2
274 && !strcmp(dentry->d_name.name, ".."))) {
275 d_drop(dentry);
276 goto out;
277 }
278 encoded_namelen = ecryptfs_encode_filename(crypt_stat,
279 dentry->d_name.name,
280 dentry->d_name.len,
281 &encoded_name);
282 if (encoded_namelen < 0) {
283 rc = encoded_namelen;
284 d_drop(dentry);
285 goto out;
286 }
287 ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
288 "= [%d]\n", encoded_name, encoded_namelen);
289 lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
290 encoded_namelen - 1);
291 kfree(encoded_name);
292 if (IS_ERR(lower_dentry)) {
293 ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
294 rc = PTR_ERR(lower_dentry);
295 d_drop(dentry);
296 goto out;
297 }
298 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
299 ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
300 "d_name.name = [%s]\n", lower_dentry,
301 lower_dentry->d_name.name);
302 lower_inode = lower_dentry->d_inode; 264 lower_inode = lower_dentry->d_inode;
303 fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); 265 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
304 BUG_ON(!atomic_read(&lower_dentry->d_count)); 266 BUG_ON(!atomic_read(&lower_dentry->d_count));
305 ecryptfs_set_dentry_private(dentry, 267 ecryptfs_set_dentry_private(ecryptfs_dentry,
306 kmem_cache_alloc(ecryptfs_dentry_info_cache, 268 kmem_cache_alloc(ecryptfs_dentry_info_cache,
307 GFP_KERNEL)); 269 GFP_KERNEL));
308 if (!ecryptfs_dentry_to_private(dentry)) { 270 if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
309 rc = -ENOMEM; 271 rc = -ENOMEM;
310 ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " 272 printk(KERN_ERR "%s: Out of memory whilst attempting "
311 "to allocate ecryptfs_dentry_info struct\n"); 273 "to allocate ecryptfs_dentry_info struct\n",
274 __func__);
312 goto out_dput; 275 goto out_dput;
313 } 276 }
314 ecryptfs_set_dentry_lower(dentry, lower_dentry); 277 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
315 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); 278 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
316 if (!lower_dentry->d_inode) { 279 if (!lower_dentry->d_inode) {
317 /* We want to add because we couldn't find in lower */ 280 /* We want to add because we couldn't find in lower */
318 d_add(dentry, NULL); 281 d_add(ecryptfs_dentry, NULL);
319 goto out; 282 goto out;
320 } 283 }
321 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 284 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
322 ECRYPTFS_INTERPOSE_FLAG_D_ADD); 285 ecryptfs_dir_inode->i_sb, 1);
323 if (rc) { 286 if (rc) {
324 ecryptfs_printk(KERN_ERR, "Error interposing\n"); 287 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
288 __func__, rc);
325 goto out; 289 goto out;
326 } 290 }
327 if (S_ISDIR(lower_inode->i_mode)) { 291 if (S_ISDIR(lower_inode->i_mode))
328 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
329 goto out; 292 goto out;
330 } 293 if (S_ISLNK(lower_inode->i_mode))
331 if (S_ISLNK(lower_inode->i_mode)) {
332 ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
333 goto out; 294 goto out;
334 } 295 if (special_file(lower_inode->i_mode))
335 if (special_file(lower_inode->i_mode)) {
336 ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
337 goto out; 296 goto out;
338 } 297 if (!ecryptfs_nd)
339 if (!nd) {
340 ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
341 "as we *think* we are about to unlink\n");
342 goto out; 298 goto out;
343 }
344 /* Released in this function */ 299 /* Released in this function */
345 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, 300 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
346 GFP_USER);
347 if (!page_virt) { 301 if (!page_virt) {
302 printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
303 __func__);
348 rc = -ENOMEM; 304 rc = -ENOMEM;
349 ecryptfs_printk(KERN_ERR,
350 "Cannot ecryptfs_kmalloc a page\n");
351 goto out; 305 goto out;
352 } 306 }
353 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 307 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
354 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) 308 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
355 ecryptfs_set_default_sizes(crypt_stat);
356 if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
357 rc = ecryptfs_init_persistent_file(dentry);
358 if (rc) { 309 if (rc) {
359 printk(KERN_ERR "%s: Error attempting to initialize " 310 printk(KERN_ERR "%s: Error attempting to initialize "
360 "the persistent file for the dentry with name " 311 "the persistent file for the dentry with name "
361 "[%s]; rc = [%d]\n", __func__, 312 "[%s]; rc = [%d]\n", __func__,
362 dentry->d_name.name, rc); 313 ecryptfs_dentry->d_name.name, rc);
363 goto out; 314 goto out_free_kmem;
364 } 315 }
365 } 316 }
366 rc = ecryptfs_read_and_validate_header_region(page_virt, 317 rc = ecryptfs_read_and_validate_header_region(page_virt,
367 dentry->d_inode); 318 ecryptfs_dentry->d_inode);
368 if (rc) { 319 if (rc) {
369 rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); 320 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
321 ecryptfs_dentry);
370 if (rc) { 322 if (rc) {
371 printk(KERN_DEBUG "Valid metadata not found in header "
372 "region or xattr region; treating file as "
373 "unencrypted\n");
374 rc = 0; 323 rc = 0;
375 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 324 goto out_free_kmem;
376 goto out;
377 } 325 }
378 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 326 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
379 } 327 }
380 mount_crypt_stat = &ecryptfs_superblock_to_private( 328 mount_crypt_stat = &ecryptfs_superblock_to_private(
381 dentry->d_sb)->mount_crypt_stat; 329 ecryptfs_dentry->d_sb)->mount_crypt_stat;
382 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 330 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
383 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 331 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
384 file_size = (crypt_stat->num_header_bytes_at_front 332 file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
388 } else { 336 } else {
389 file_size = get_unaligned_be64(page_virt); 337 file_size = get_unaligned_be64(page_virt);
390 } 338 }
391 i_size_write(dentry->d_inode, (loff_t)file_size); 339 i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
340out_free_kmem:
392 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 341 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
393 goto out; 342 goto out;
394
395out_dput: 343out_dput:
396 dput(lower_dentry); 344 dput(lower_dentry);
397 d_drop(dentry); 345 d_drop(ecryptfs_dentry);
346out:
347 return rc;
348}
349
350/**
351 * ecryptfs_lookup
352 * @ecryptfs_dir_inode: The eCryptfs directory inode
353 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
354 * @ecryptfs_nd: nameidata; may be NULL
355 *
356 * Find a file on disk. If the file does not exist, then we'll add it to the
357 * dentry cache and continue on to read it from the disk.
358 */
359static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
360 struct dentry *ecryptfs_dentry,
361 struct nameidata *ecryptfs_nd)
362{
363 char *encrypted_and_encoded_name = NULL;
364 size_t encrypted_and_encoded_name_size;
365 struct ecryptfs_crypt_stat *crypt_stat = NULL;
366 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
367 struct ecryptfs_inode_info *inode_info;
368 struct dentry *lower_dir_dentry, *lower_dentry;
369 int rc = 0;
370
371 ecryptfs_dentry->d_op = &ecryptfs_dops;
372 if ((ecryptfs_dentry->d_name.len == 1
373 && !strcmp(ecryptfs_dentry->d_name.name, "."))
374 || (ecryptfs_dentry->d_name.len == 2
375 && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
376 goto out_d_drop;
377 }
378 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
379 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
380 lower_dir_dentry,
381 ecryptfs_dentry->d_name.len);
382 if (IS_ERR(lower_dentry)) {
383 rc = PTR_ERR(lower_dentry);
384 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
385 "lower_dentry = [%s]\n", __func__, rc,
386 ecryptfs_dentry->d_name.name);
387 goto out_d_drop;
388 }
389 if (lower_dentry->d_inode)
390 goto lookup_and_interpose;
391 inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
392 if (inode_info) {
393 crypt_stat = &inode_info->crypt_stat;
394 /* TODO: lock for crypt_stat comparison */
395 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
396 ecryptfs_set_default_sizes(crypt_stat);
397 }
398 if (crypt_stat)
399 mount_crypt_stat = crypt_stat->mount_crypt_stat;
400 else
401 mount_crypt_stat = &ecryptfs_superblock_to_private(
402 ecryptfs_dentry->d_sb)->mount_crypt_stat;
403 if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
404 && !(mount_crypt_stat && (mount_crypt_stat->flags
405 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
406 goto lookup_and_interpose;
407 dput(lower_dentry);
408 rc = ecryptfs_encrypt_and_encode_filename(
409 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
410 crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
411 ecryptfs_dentry->d_name.len);
412 if (rc) {
413 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
414 "filename; rc = [%d]\n", __func__, rc);
415 goto out_d_drop;
416 }
417 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
418 lower_dir_dentry,
419 encrypted_and_encoded_name_size - 1);
420 if (IS_ERR(lower_dentry)) {
421 rc = PTR_ERR(lower_dentry);
422 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
423 "lower_dentry = [%s]\n", __func__, rc,
424 encrypted_and_encoded_name);
425 goto out_d_drop;
426 }
427lookup_and_interpose:
428 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
429 crypt_stat, ecryptfs_dir_inode,
430 ecryptfs_nd);
431 goto out;
432out_d_drop:
433 d_drop(ecryptfs_dentry);
398out: 434out:
435 kfree(encrypted_and_encoded_name);
399 return ERR_PTR(rc); 436 return ERR_PTR(rc);
400} 437}
401 438
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
466 struct dentry *lower_dentry; 503 struct dentry *lower_dentry;
467 struct dentry *lower_dir_dentry; 504 struct dentry *lower_dir_dentry;
468 char *encoded_symname; 505 char *encoded_symname;
469 int encoded_symlen; 506 size_t encoded_symlen;
470 struct ecryptfs_crypt_stat *crypt_stat = NULL; 507 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
471 508
472 lower_dentry = ecryptfs_dentry_to_lower(dentry); 509 lower_dentry = ecryptfs_dentry_to_lower(dentry);
473 dget(lower_dentry); 510 dget(lower_dentry);
474 lower_dir_dentry = lock_parent(lower_dentry); 511 lower_dir_dentry = lock_parent(lower_dentry);
475 encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, 512 mount_crypt_stat = &ecryptfs_superblock_to_private(
476 strlen(symname), 513 dir->i_sb)->mount_crypt_stat;
477 &encoded_symname); 514 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
478 if (encoded_symlen < 0) { 515 &encoded_symlen,
479 rc = encoded_symlen; 516 NULL,
517 mount_crypt_stat, symname,
518 strlen(symname));
519 if (rc)
480 goto out_lock; 520 goto out_lock;
481 }
482 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, 521 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
483 encoded_symname); 522 encoded_symname);
484 kfree(encoded_symname); 523 kfree(encoded_symname);
@@ -602,53 +641,54 @@ out_lock:
602} 641}
603 642
604static int 643static int
605ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) 644ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
606{ 645{
607 int rc;
608 struct dentry *lower_dentry;
609 char *decoded_name;
610 char *lower_buf; 646 char *lower_buf;
611 mm_segment_t old_fs; 647 struct dentry *lower_dentry;
612 struct ecryptfs_crypt_stat *crypt_stat; 648 struct ecryptfs_crypt_stat *crypt_stat;
649 char *plaintext_name;
650 size_t plaintext_name_size;
651 mm_segment_t old_fs;
652 int rc;
613 653
614 lower_dentry = ecryptfs_dentry_to_lower(dentry); 654 lower_dentry = ecryptfs_dentry_to_lower(dentry);
615 if (!lower_dentry->d_inode->i_op || 655 if (!lower_dentry->d_inode->i_op->readlink) {
616 !lower_dentry->d_inode->i_op->readlink) {
617 rc = -EINVAL; 656 rc = -EINVAL;
618 goto out; 657 goto out;
619 } 658 }
659 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
620 /* Released in this function */ 660 /* Released in this function */
621 lower_buf = kmalloc(bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(bufsiz, GFP_KERNEL);
622 if (lower_buf == NULL) { 662 if (lower_buf == NULL) {
623 ecryptfs_printk(KERN_ERR, "Out of memory\n"); 663 printk(KERN_ERR "%s: Out of memory whilst attempting to "
664 "kmalloc [%d] bytes\n", __func__, bufsiz);
624 rc = -ENOMEM; 665 rc = -ENOMEM;
625 goto out; 666 goto out;
626 } 667 }
627 old_fs = get_fs(); 668 old_fs = get_fs();
628 set_fs(get_ds()); 669 set_fs(get_ds());
629 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
630 "lower_dentry->d_name.name = [%s]\n",
631 lower_dentry->d_name.name);
632 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 670 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
633 (char __user *)lower_buf, 671 (char __user *)lower_buf,
634 bufsiz); 672 bufsiz);
635 set_fs(old_fs); 673 set_fs(old_fs);
636 if (rc >= 0) { 674 if (rc >= 0) {
637 crypt_stat = NULL; 675 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
638 rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, 676 &plaintext_name_size,
639 &decoded_name); 677 dentry, lower_buf,
640 if (rc == -ENOMEM) 678 rc);
679 if (rc) {
680 printk(KERN_ERR "%s: Error attempting to decode and "
681 "decrypt filename; rc = [%d]\n", __func__,
682 rc);
641 goto out_free_lower_buf; 683 goto out_free_lower_buf;
642 if (rc > 0) {
643 ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
644 "to userspace: [%*s]\n", rc,
645 decoded_name);
646 if (copy_to_user(buf, decoded_name, rc))
647 rc = -EFAULT;
648 } 684 }
649 kfree(decoded_name); 685 rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
650 fsstack_copy_attr_atime(dentry->d_inode, 686 if (rc)
651 lower_dentry->d_inode); 687 rc = -EFAULT;
688 else
689 rc = plaintext_name_size;
690 kfree(plaintext_name);
691 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
652 } 692 }
653out_free_lower_buf: 693out_free_lower_buf:
654 kfree(lower_buf); 694 kfree(lower_buf);
@@ -670,8 +710,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
670 } 710 }
671 old_fs = get_fs(); 711 old_fs = get_fs();
672 set_fs(get_ds()); 712 set_fs(get_ds());
673 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
674 "dentry->d_name.name = [%s]\n", dentry->d_name.name);
675 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 713 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
676 set_fs(old_fs); 714 set_fs(old_fs);
677 if (rc < 0) 715 if (rc < 0)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b691941..ff539420cc6f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
358 /* verify that everything through the encrypted FEK size is present */ 358 /* verify that everything through the encrypted FEK size is present */
359 if (message_len < 4) { 359 if (message_len < 4) {
360 rc = -EIO; 360 rc = -EIO;
361 printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " 361 printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
362 "message length is [%d]\n", __func__, message_len, 4); 362 "message length is [%d]\n", __func__, message_len, 4);
363 goto out; 363 goto out;
364 } 364 }
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
385 i += data_len; 385 i += data_len;
386 if (message_len < (i + key_rec->enc_key_size)) { 386 if (message_len < (i + key_rec->enc_key_size)) {
387 rc = -EIO; 387 rc = -EIO;
388 printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", 388 printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
389 __func__, message_len, (i + key_rec->enc_key_size)); 389 __func__, message_len, (i + key_rec->enc_key_size));
390 goto out; 390 goto out;
391 } 391 }
392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { 392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
393 rc = -EIO; 393 rc = -EIO;
394 printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " 394 printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
395 "the maximum key size [%d]\n", __func__, 395 "the maximum key size [%d]\n", __func__,
396 key_rec->enc_key_size, 396 key_rec->enc_key_size,
397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); 397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -403,6 +403,580 @@ out:
403} 403}
404 404
405static int 405static int
406ecryptfs_find_global_auth_tok_for_sig(
407 struct ecryptfs_global_auth_tok **global_auth_tok,
408 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
409{
410 struct ecryptfs_global_auth_tok *walker;
411 int rc = 0;
412
413 (*global_auth_tok) = NULL;
414 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
415 list_for_each_entry(walker,
416 &mount_crypt_stat->global_auth_tok_list,
417 mount_crypt_stat_list) {
418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
419 (*global_auth_tok) = walker;
420 goto out;
421 }
422 }
423 rc = -EINVAL;
424out:
425 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
426 return rc;
427}
428
429/**
430 * ecryptfs_find_auth_tok_for_sig
431 * @auth_tok: Set to the matching auth_tok; NULL if not found
432 * @crypt_stat: inode crypt_stat crypto context
433 * @sig: Sig of auth_tok to find
434 *
435 * For now, this function simply looks at the registered auth_tok's
436 * linked off the mount_crypt_stat, so all the auth_toks that can be
437 * used must be registered at mount time. This function could
438 * potentially try a lot harder to find auth_tok's (e.g., by calling
439 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
440 * that static registration of auth_tok's will no longer be necessary.
441 *
442 * Returns zero on no error; non-zero on error
443 */
444static int
445ecryptfs_find_auth_tok_for_sig(
446 struct ecryptfs_auth_tok **auth_tok,
447 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
448 char *sig)
449{
450 struct ecryptfs_global_auth_tok *global_auth_tok;
451 int rc = 0;
452
453 (*auth_tok) = NULL;
454 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
455 mount_crypt_stat, sig)) {
456 struct key *auth_tok_key;
457
458 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
459 sig);
460 } else
461 (*auth_tok) = global_auth_tok->global_auth_tok;
462 return rc;
463}
464
465/**
466 * write_tag_70_packet can gobble a lot of stack space. We stuff most
467 * of the function's parameters in a kmalloc'd struct to help reduce
468 * eCryptfs' overall stack usage.
469 */
470struct ecryptfs_write_tag_70_packet_silly_stack {
471 u8 cipher_code;
472 size_t max_packet_size;
473 size_t packet_size_len;
474 size_t block_aligned_filename_size;
475 size_t block_size;
476 size_t i;
477 size_t j;
478 size_t num_rand_bytes;
479 struct mutex *tfm_mutex;
480 char *block_aligned_filename;
481 struct ecryptfs_auth_tok *auth_tok;
482 struct scatterlist src_sg;
483 struct scatterlist dst_sg;
484 struct blkcipher_desc desc;
485 char iv[ECRYPTFS_MAX_IV_BYTES];
486 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
487 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
488 struct hash_desc hash_desc;
489 struct scatterlist hash_sg;
490};
491
492/**
493 * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
494 * @filename: NULL-terminated filename string
495 *
496 * This is the simplest mechanism for achieving filename encryption in
497 * eCryptfs. It encrypts the given filename with the mount-wide
498 * filename encryption key (FNEK) and stores it in a packet to @dest,
499 * which the callee will encode and write directly into the dentry
500 * name.
501 */
502int
503ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
504 size_t *packet_size,
505 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
506 char *filename, size_t filename_size)
507{
508 struct ecryptfs_write_tag_70_packet_silly_stack *s;
509 int rc = 0;
510
511 s = kmalloc(sizeof(*s), GFP_KERNEL);
512 if (!s) {
513 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
514 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
515 goto out;
516 }
517 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
518 (*packet_size) = 0;
519 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
520 &s->desc.tfm,
521 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
522 if (unlikely(rc)) {
523 printk(KERN_ERR "Internal error whilst attempting to get "
524 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
525 mount_crypt_stat->global_default_fn_cipher_name, rc);
526 goto out;
527 }
528 mutex_lock(s->tfm_mutex);
529 s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
530 /* Plus one for the \0 separator between the random prefix
531 * and the plaintext filename */
532 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
533 s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
534 if ((s->block_aligned_filename_size % s->block_size) != 0) {
535 s->num_rand_bytes += (s->block_size
536 - (s->block_aligned_filename_size
537 % s->block_size));
538 s->block_aligned_filename_size = (s->num_rand_bytes
539 + filename_size);
540 }
541 /* Octet 0: Tag 70 identifier
542 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
543 * and block-aligned encrypted filename size)
544 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
545 * Octet N2-N3: Cipher identifier (1 octet)
546 * Octets N3-N4: Block-aligned encrypted filename
547 * - Consists of a minimum number of random characters, a \0
548 * separator, and then the filename */
549 s->max_packet_size = (1 /* Tag 70 identifier */
550 + 3 /* Max Tag 70 packet size */
551 + ECRYPTFS_SIG_SIZE /* FNEK sig */
552 + 1 /* Cipher identifier */
553 + s->block_aligned_filename_size);
554 if (dest == NULL) {
555 (*packet_size) = s->max_packet_size;
556 goto out_unlock;
557 }
558 if (s->max_packet_size > (*remaining_bytes)) {
559 printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
560 "[%zd] available\n", __func__, s->max_packet_size,
561 (*remaining_bytes));
562 rc = -EINVAL;
563 goto out_unlock;
564 }
565 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
566 GFP_KERNEL);
567 if (!s->block_aligned_filename) {
568 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
569 "kzalloc [%zd] bytes\n", __func__,
570 s->block_aligned_filename_size);
571 rc = -ENOMEM;
572 goto out_unlock;
573 }
574 s->i = 0;
575 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
576 rc = ecryptfs_write_packet_length(&dest[s->i],
577 (ECRYPTFS_SIG_SIZE
578 + 1 /* Cipher code */
579 + s->block_aligned_filename_size),
580 &s->packet_size_len);
581 if (rc) {
582 printk(KERN_ERR "%s: Error generating tag 70 packet "
583 "header; cannot generate packet length; rc = [%d]\n",
584 __func__, rc);
585 goto out_free_unlock;
586 }
587 s->i += s->packet_size_len;
588 ecryptfs_from_hex(&dest[s->i],
589 mount_crypt_stat->global_default_fnek_sig,
590 ECRYPTFS_SIG_SIZE);
591 s->i += ECRYPTFS_SIG_SIZE;
592 s->cipher_code = ecryptfs_code_for_cipher_string(
593 mount_crypt_stat->global_default_fn_cipher_name,
594 mount_crypt_stat->global_default_fn_cipher_key_bytes);
595 if (s->cipher_code == 0) {
596 printk(KERN_WARNING "%s: Unable to generate code for "
597 "cipher [%s] with key bytes [%zd]\n", __func__,
598 mount_crypt_stat->global_default_fn_cipher_name,
599 mount_crypt_stat->global_default_fn_cipher_key_bytes);
600 rc = -EINVAL;
601 goto out_free_unlock;
602 }
603 dest[s->i++] = s->cipher_code;
604 rc = ecryptfs_find_auth_tok_for_sig(
605 &s->auth_tok, mount_crypt_stat,
606 mount_crypt_stat->global_default_fnek_sig);
607 if (rc) {
608 printk(KERN_ERR "%s: Error attempting to find auth tok for "
609 "fnek sig [%s]; rc = [%d]\n", __func__,
610 mount_crypt_stat->global_default_fnek_sig, rc);
611 goto out_free_unlock;
612 }
613 /* TODO: Support other key modules than passphrase for
614 * filename encryption */
615 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
616 sg_init_one(
617 &s->hash_sg,
618 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
619 s->auth_tok->token.password.session_key_encryption_key_bytes);
620 s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
621 s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
622 CRYPTO_ALG_ASYNC);
623 if (IS_ERR(s->hash_desc.tfm)) {
624 rc = PTR_ERR(s->hash_desc.tfm);
625 printk(KERN_ERR "%s: Error attempting to "
626 "allocate hash crypto context; rc = [%d]\n",
627 __func__, rc);
628 goto out_free_unlock;
629 }
630 rc = crypto_hash_init(&s->hash_desc);
631 if (rc) {
632 printk(KERN_ERR
633 "%s: Error initializing crypto hash; rc = [%d]\n",
634 __func__, rc);
635 goto out_release_free_unlock;
636 }
637 rc = crypto_hash_update(
638 &s->hash_desc, &s->hash_sg,
639 s->auth_tok->token.password.session_key_encryption_key_bytes);
640 if (rc) {
641 printk(KERN_ERR
642 "%s: Error updating crypto hash; rc = [%d]\n",
643 __func__, rc);
644 goto out_release_free_unlock;
645 }
646 rc = crypto_hash_final(&s->hash_desc, s->hash);
647 if (rc) {
648 printk(KERN_ERR
649 "%s: Error finalizing crypto hash; rc = [%d]\n",
650 __func__, rc);
651 goto out_release_free_unlock;
652 }
653 for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
654 s->block_aligned_filename[s->j] =
655 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
656 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
657 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
658 sg_init_one(&s->hash_sg, (u8 *)s->hash,
659 ECRYPTFS_TAG_70_DIGEST_SIZE);
660 rc = crypto_hash_init(&s->hash_desc);
661 if (rc) {
662 printk(KERN_ERR
663 "%s: Error initializing crypto hash; "
664 "rc = [%d]\n", __func__, rc);
665 goto out_release_free_unlock;
666 }
667 rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
668 ECRYPTFS_TAG_70_DIGEST_SIZE);
669 if (rc) {
670 printk(KERN_ERR
671 "%s: Error updating crypto hash; "
672 "rc = [%d]\n", __func__, rc);
673 goto out_release_free_unlock;
674 }
675 rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
676 if (rc) {
677 printk(KERN_ERR
678 "%s: Error finalizing crypto hash; "
679 "rc = [%d]\n", __func__, rc);
680 goto out_release_free_unlock;
681 }
682 memcpy(s->hash, s->tmp_hash,
683 ECRYPTFS_TAG_70_DIGEST_SIZE);
684 }
685 if (s->block_aligned_filename[s->j] == '\0')
686 s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
687 }
688 memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
689 filename_size);
690 rc = virt_to_scatterlist(s->block_aligned_filename,
691 s->block_aligned_filename_size, &s->src_sg, 1);
692 if (rc != 1) {
693 printk(KERN_ERR "%s: Internal error whilst attempting to "
694 "convert filename memory to scatterlist; "
695 "expected rc = 1; got rc = [%d]. "
696 "block_aligned_filename_size = [%zd]\n", __func__, rc,
697 s->block_aligned_filename_size);
698 goto out_release_free_unlock;
699 }
700 rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
701 &s->dst_sg, 1);
702 if (rc != 1) {
703 printk(KERN_ERR "%s: Internal error whilst attempting to "
704 "convert encrypted filename memory to scatterlist; "
705 "expected rc = 1; got rc = [%d]. "
706 "block_aligned_filename_size = [%zd]\n", __func__, rc,
707 s->block_aligned_filename_size);
708 goto out_release_free_unlock;
709 }
710 /* The characters in the first block effectively do the job
711 * of the IV here, so we just use 0's for the IV. Note the
712 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
713 * >= ECRYPTFS_MAX_IV_BYTES. */
714 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
715 s->desc.info = s->iv;
716 rc = crypto_blkcipher_setkey(
717 s->desc.tfm,
718 s->auth_tok->token.password.session_key_encryption_key,
719 mount_crypt_stat->global_default_fn_cipher_key_bytes);
720 if (rc < 0) {
721 printk(KERN_ERR "%s: Error setting key for crypto context; "
722 "rc = [%d]. s->auth_tok->token.password.session_key_"
723 "encryption_key = [0x%p]; mount_crypt_stat->"
724 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
725 rc,
726 s->auth_tok->token.password.session_key_encryption_key,
727 mount_crypt_stat->global_default_fn_cipher_key_bytes);
728 goto out_release_free_unlock;
729 }
730 rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
731 s->block_aligned_filename_size);
732 if (rc) {
733 printk(KERN_ERR "%s: Error attempting to encrypt filename; "
734 "rc = [%d]\n", __func__, rc);
735 goto out_release_free_unlock;
736 }
737 s->i += s->block_aligned_filename_size;
738 (*packet_size) = s->i;
739 (*remaining_bytes) -= (*packet_size);
740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
744 kfree(s->block_aligned_filename);
745out_unlock:
746 mutex_unlock(s->tfm_mutex);
747out:
748 kfree(s);
749 return rc;
750}
751
752struct ecryptfs_parse_tag_70_packet_silly_stack {
753 u8 cipher_code;
754 size_t max_packet_size;
755 size_t packet_size_len;
756 size_t parsed_tag_70_packet_size;
757 size_t block_aligned_filename_size;
758 size_t block_size;
759 size_t i;
760 struct mutex *tfm_mutex;
761 char *decrypted_filename;
762 struct ecryptfs_auth_tok *auth_tok;
763 struct scatterlist src_sg;
764 struct scatterlist dst_sg;
765 struct blkcipher_desc desc;
766 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
767 char iv[ECRYPTFS_MAX_IV_BYTES];
768 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
769};
770
771/**
772 * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
773 * @filename: This function kmalloc's the memory for the filename
774 * @filename_size: This function sets this to the amount of memory
775 * kmalloc'd for the filename
776 * @packet_size: This function sets this to the the number of octets
777 * in the packet parsed
778 * @mount_crypt_stat: The mount-wide cryptographic context
779 * @data: The memory location containing the start of the tag 70
780 * packet
781 * @max_packet_size: The maximum legal size of the packet to be parsed
782 * from @data
783 *
784 * Returns zero on success; non-zero otherwise
785 */
786int
787ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
788 size_t *packet_size,
789 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
790 char *data, size_t max_packet_size)
791{
792 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
793 int rc = 0;
794
795 (*packet_size) = 0;
796 (*filename_size) = 0;
797 (*filename) = NULL;
798 s = kmalloc(sizeof(*s), GFP_KERNEL);
799 if (!s) {
800 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
801 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
802 goto out;
803 }
804 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
805 if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
806 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
807 "at least [%d]\n", __func__, max_packet_size,
808 (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
809 rc = -EINVAL;
810 goto out;
811 }
812 /* Octet 0: Tag 70 identifier
813 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
814 * and block-aligned encrypted filename size)
815 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
816 * Octet N2-N3: Cipher identifier (1 octet)
817 * Octets N3-N4: Block-aligned encrypted filename
818 * - Consists of a minimum number of random numbers, a \0
819 * separator, and then the filename */
820 if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
821 printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
822 "tag [0x%.2x]\n", __func__,
823 data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
824 rc = -EINVAL;
825 goto out;
826 }
827 rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
828 &s->parsed_tag_70_packet_size,
829 &s->packet_size_len);
830 if (rc) {
831 printk(KERN_WARNING "%s: Error parsing packet length; "
832 "rc = [%d]\n", __func__, rc);
833 goto out;
834 }
835 s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
836 - ECRYPTFS_SIG_SIZE - 1);
837 if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
838 > max_packet_size) {
839 printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
840 "size is [%zd]\n", __func__, max_packet_size,
841 (1 + s->packet_size_len + 1
842 + s->block_aligned_filename_size));
843 rc = -EINVAL;
844 goto out;
845 }
846 (*packet_size) += s->packet_size_len;
847 ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
848 ECRYPTFS_SIG_SIZE);
849 s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
850 (*packet_size) += ECRYPTFS_SIG_SIZE;
851 s->cipher_code = data[(*packet_size)++];
852 rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
853 if (rc) {
854 printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
855 __func__, s->cipher_code);
856 goto out;
857 }
858 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
859 &s->tfm_mutex,
860 s->cipher_string);
861 if (unlikely(rc)) {
862 printk(KERN_ERR "Internal error whilst attempting to get "
863 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
864 s->cipher_string, rc);
865 goto out;
866 }
867 mutex_lock(s->tfm_mutex);
868 rc = virt_to_scatterlist(&data[(*packet_size)],
869 s->block_aligned_filename_size, &s->src_sg, 1);
870 if (rc != 1) {
871 printk(KERN_ERR "%s: Internal error whilst attempting to "
872 "convert encrypted filename memory to scatterlist; "
873 "expected rc = 1; got rc = [%d]. "
874 "block_aligned_filename_size = [%zd]\n", __func__, rc,
875 s->block_aligned_filename_size);
876 goto out_unlock;
877 }
878 (*packet_size) += s->block_aligned_filename_size;
879 s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
880 GFP_KERNEL);
881 if (!s->decrypted_filename) {
882 printk(KERN_ERR "%s: Out of memory whilst attempting to "
883 "kmalloc [%zd] bytes\n", __func__,
884 s->block_aligned_filename_size);
885 rc = -ENOMEM;
886 goto out_unlock;
887 }
888 rc = virt_to_scatterlist(s->decrypted_filename,
889 s->block_aligned_filename_size, &s->dst_sg, 1);
890 if (rc != 1) {
891 printk(KERN_ERR "%s: Internal error whilst attempting to "
892 "convert decrypted filename memory to scatterlist; "
893 "expected rc = 1; got rc = [%d]. "
894 "block_aligned_filename_size = [%zd]\n", __func__, rc,
895 s->block_aligned_filename_size);
896 goto out_free_unlock;
897 }
898 /* The characters in the first block effectively do the job of
899 * the IV here, so we just use 0's for the IV. Note the
900 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
901 * >= ECRYPTFS_MAX_IV_BYTES. */
902 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
903 s->desc.info = s->iv;
904 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
905 s->fnek_sig_hex);
906 if (rc) {
907 printk(KERN_ERR "%s: Error attempting to find auth tok for "
908 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
909 rc);
910 goto out_free_unlock;
911 }
912 /* TODO: Support other key modules than passphrase for
913 * filename encryption */
914 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
915 rc = crypto_blkcipher_setkey(
916 s->desc.tfm,
917 s->auth_tok->token.password.session_key_encryption_key,
918 mount_crypt_stat->global_default_fn_cipher_key_bytes);
919 if (rc < 0) {
920 printk(KERN_ERR "%s: Error setting key for crypto context; "
921 "rc = [%d]. s->auth_tok->token.password.session_key_"
922 "encryption_key = [0x%p]; mount_crypt_stat->"
923 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
924 rc,
925 s->auth_tok->token.password.session_key_encryption_key,
926 mount_crypt_stat->global_default_fn_cipher_key_bytes);
927 goto out_free_unlock;
928 }
929 rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
930 s->block_aligned_filename_size);
931 if (rc) {
932 printk(KERN_ERR "%s: Error attempting to decrypt filename; "
933 "rc = [%d]\n", __func__, rc);
934 goto out_free_unlock;
935 }
936 s->i = 0;
937 while (s->decrypted_filename[s->i] != '\0'
938 && s->i < s->block_aligned_filename_size)
939 s->i++;
940 if (s->i == s->block_aligned_filename_size) {
941 printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
942 "find valid separator between random characters and "
943 "the filename\n", __func__);
944 rc = -EINVAL;
945 goto out_free_unlock;
946 }
947 s->i++;
948 (*filename_size) = (s->block_aligned_filename_size - s->i);
949 if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
950 printk(KERN_WARNING "%s: Filename size is [%zd], which is "
951 "invalid\n", __func__, (*filename_size));
952 rc = -EINVAL;
953 goto out_free_unlock;
954 }
955 (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
956 if (!(*filename)) {
957 printk(KERN_ERR "%s: Out of memory whilst attempting to "
958 "kmalloc [%zd] bytes\n", __func__,
959 ((*filename_size) + 1));
960 rc = -ENOMEM;
961 goto out_free_unlock;
962 }
963 memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
964 (*filename)[(*filename_size)] = '\0';
965out_free_unlock:
966 kfree(s->decrypted_filename);
967out_unlock:
968 mutex_unlock(s->tfm_mutex);
969out:
970 if (rc) {
971 (*packet_size) = 0;
972 (*filename_size) = 0;
973 (*filename) = NULL;
974 }
975 kfree(s);
976 return rc;
977}
978
979static int
406ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) 980ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
407{ 981{
408 int rc = 0; 982 int rc = 0;
@@ -897,30 +1471,6 @@ out:
897 return rc; 1471 return rc;
898} 1472}
899 1473
900static int
901ecryptfs_find_global_auth_tok_for_sig(
902 struct ecryptfs_global_auth_tok **global_auth_tok,
903 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
904{
905 struct ecryptfs_global_auth_tok *walker;
906 int rc = 0;
907
908 (*global_auth_tok) = NULL;
909 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
910 list_for_each_entry(walker,
911 &mount_crypt_stat->global_auth_tok_list,
912 mount_crypt_stat_list) {
913 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
914 (*global_auth_tok) = walker;
915 goto out;
916 }
917 }
918 rc = -EINVAL;
919out:
920 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
921 return rc;
922}
923
924/** 1474/**
925 * ecryptfs_verify_version 1475 * ecryptfs_verify_version
926 * @version: The version number to confirm 1476 * @version: The version number to confirm
@@ -990,43 +1540,6 @@ out:
990} 1540}
991 1541
992/** 1542/**
993 * ecryptfs_find_auth_tok_for_sig
994 * @auth_tok: Set to the matching auth_tok; NULL if not found
995 * @crypt_stat: inode crypt_stat crypto context
996 * @sig: Sig of auth_tok to find
997 *
998 * For now, this function simply looks at the registered auth_tok's
999 * linked off the mount_crypt_stat, so all the auth_toks that can be
1000 * used must be registered at mount time. This function could
1001 * potentially try a lot harder to find auth_tok's (e.g., by calling
1002 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
1003 * that static registration of auth_tok's will no longer be necessary.
1004 *
1005 * Returns zero on no error; non-zero on error
1006 */
1007static int
1008ecryptfs_find_auth_tok_for_sig(
1009 struct ecryptfs_auth_tok **auth_tok,
1010 struct ecryptfs_crypt_stat *crypt_stat, char *sig)
1011{
1012 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
1013 crypt_stat->mount_crypt_stat;
1014 struct ecryptfs_global_auth_tok *global_auth_tok;
1015 int rc = 0;
1016
1017 (*auth_tok) = NULL;
1018 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
1019 mount_crypt_stat, sig)) {
1020 struct key *auth_tok_key;
1021
1022 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
1023 sig);
1024 } else
1025 (*auth_tok) = global_auth_tok->global_auth_tok;
1026 return rc;
1027}
1028
1029/**
1030 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. 1543 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
1031 * @auth_tok: The passphrase authentication token to use to encrypt the FEK 1544 * @auth_tok: The passphrase authentication token to use to encrypt the FEK
1032 * @crypt_stat: The cryptographic context 1545 * @crypt_stat: The cryptographic context
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok:
1256 rc = -EINVAL; 1769 rc = -EINVAL;
1257 goto out_wipe_list; 1770 goto out_wipe_list;
1258 } 1771 }
1259 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, 1772 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
1773 crypt_stat->mount_crypt_stat,
1260 candidate_auth_tok_sig); 1774 candidate_auth_tok_sig);
1261 if (matching_auth_tok) { 1775 if (matching_auth_tok) {
1262 found_auth_tok = 1; 1776 found_auth_tok = 1;
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1336 int rc; 1850 int rc;
1337 1851
1338 rc = write_tag_66_packet(auth_tok->token.private_key.signature, 1852 rc = write_tag_66_packet(auth_tok->token.private_key.signature,
1339 ecryptfs_code_for_cipher_string(crypt_stat), 1853 ecryptfs_code_for_cipher_string(
1854 crypt_stat->cipher,
1855 crypt_stat->key_size),
1340 crypt_stat, &payload, &payload_len); 1856 crypt_stat, &payload, &payload_len);
1341 if (rc) { 1857 if (rc) {
1342 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); 1858 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2212,8 @@ encrypted_session_key_set:
1696 dest[(*packet_size)++] = 0x04; /* version 4 */ 2212 dest[(*packet_size)++] = 0x04; /* version 4 */
1697 /* TODO: Break from RFC2440 so that arbitrary ciphers can be 2213 /* TODO: Break from RFC2440 so that arbitrary ciphers can be
1698 * specified with strings */ 2214 * specified with strings */
1699 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); 2215 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
2216 crypt_stat->key_size);
1700 if (cipher_code == 0) { 2217 if (cipher_code == 0) {
1701 ecryptfs_printk(KERN_WARNING, "Unable to generate code for " 2218 ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
1702 "cipher [%s]\n", crypt_stat->cipher); 2219 "cipher [%s]\n", crypt_stat->cipher);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c7..789cf2e1be1e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, 206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
207 ecryptfs_opt_ecryptfs_key_bytes, 207 ecryptfs_opt_ecryptfs_key_bytes,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_err };
210 212
211static const match_table_t tokens = { 213static const match_table_t tokens = {
212 {ecryptfs_opt_sig, "sig=%s"}, 214 {ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
217 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, 219 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
218 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, 220 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
219 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, 221 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
220 {ecryptfs_opt_err, NULL} 225 {ecryptfs_opt_err, NULL}
221}; 226};
222 227
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
281 int rc = 0; 286 int rc = 0;
282 int sig_set = 0; 287 int sig_set = 0;
283 int cipher_name_set = 0; 288 int cipher_name_set = 0;
289 int fn_cipher_name_set = 0;
284 int cipher_key_bytes; 290 int cipher_key_bytes;
285 int cipher_key_bytes_set = 0; 291 int cipher_key_bytes_set = 0;
292 int fn_cipher_key_bytes;
293 int fn_cipher_key_bytes_set = 0;
286 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 294 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
287 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 295 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
288 substring_t args[MAX_OPT_ARGS]; 296 substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
290 char *sig_src; 298 char *sig_src;
291 char *cipher_name_dst; 299 char *cipher_name_dst;
292 char *cipher_name_src; 300 char *cipher_name_src;
301 char *fn_cipher_name_dst;
302 char *fn_cipher_name_src;
303 char *fnek_dst;
304 char *fnek_src;
293 char *cipher_key_bytes_src; 305 char *cipher_key_bytes_src;
306 char *fn_cipher_key_bytes_src;
294 307
295 if (!options) { 308 if (!options) {
296 rc = -EINVAL; 309 rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
322 global_default_cipher_name; 335 global_default_cipher_name;
323 strncpy(cipher_name_dst, cipher_name_src, 336 strncpy(cipher_name_dst, cipher_name_src,
324 ECRYPTFS_MAX_CIPHER_NAME_SIZE); 337 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
325 ecryptfs_printk(KERN_DEBUG, 338 cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
326 "The mount_crypt_stat "
327 "global_default_cipher_name set to: "
328 "[%s]\n", cipher_name_dst);
329 cipher_name_set = 1; 339 cipher_name_set = 1;
330 break; 340 break;
331 case ecryptfs_opt_ecryptfs_key_bytes: 341 case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
335 &cipher_key_bytes_src, 0); 345 &cipher_key_bytes_src, 0);
336 mount_crypt_stat->global_default_cipher_key_size = 346 mount_crypt_stat->global_default_cipher_key_size =
337 cipher_key_bytes; 347 cipher_key_bytes;
338 ecryptfs_printk(KERN_DEBUG,
339 "The mount_crypt_stat "
340 "global_default_cipher_key_size "
341 "set to: [%d]\n", mount_crypt_stat->
342 global_default_cipher_key_size);
343 cipher_key_bytes_set = 1; 348 cipher_key_bytes_set = 1;
344 break; 349 break;
345 case ecryptfs_opt_passthrough: 350 case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
356 mount_crypt_stat->flags |= 361 mount_crypt_stat->flags |=
357 ECRYPTFS_ENCRYPTED_VIEW_ENABLED; 362 ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
358 break; 363 break;
364 case ecryptfs_opt_fnek_sig:
365 fnek_src = args[0].from;
366 fnek_dst =
367 mount_crypt_stat->global_default_fnek_sig;
368 strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
369 mount_crypt_stat->global_default_fnek_sig[
370 ECRYPTFS_SIG_SIZE_HEX] = '\0';
371 rc = ecryptfs_add_global_auth_tok(
372 mount_crypt_stat,
373 mount_crypt_stat->global_default_fnek_sig);
374 if (rc) {
375 printk(KERN_ERR "Error attempting to register "
376 "global fnek sig [%s]; rc = [%d]\n",
377 mount_crypt_stat->global_default_fnek_sig,
378 rc);
379 goto out;
380 }
381 mount_crypt_stat->flags |=
382 (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
383 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
384 break;
385 case ecryptfs_opt_fn_cipher:
386 fn_cipher_name_src = args[0].from;
387 fn_cipher_name_dst =
388 mount_crypt_stat->global_default_fn_cipher_name;
389 strncpy(fn_cipher_name_dst, fn_cipher_name_src,
390 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
391 mount_crypt_stat->global_default_fn_cipher_name[
392 ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
393 fn_cipher_name_set = 1;
394 break;
395 case ecryptfs_opt_fn_cipher_key_bytes:
396 fn_cipher_key_bytes_src = args[0].from;
397 fn_cipher_key_bytes =
398 (int)simple_strtol(fn_cipher_key_bytes_src,
399 &fn_cipher_key_bytes_src, 0);
400 mount_crypt_stat->global_default_fn_cipher_key_bytes =
401 fn_cipher_key_bytes;
402 fn_cipher_key_bytes_set = 1;
403 break;
359 case ecryptfs_opt_err: 404 case ecryptfs_opt_err:
360 default: 405 default:
361 ecryptfs_printk(KERN_WARNING, 406 printk(KERN_WARNING
362 "eCryptfs: unrecognized option '%s'\n", 407 "%s: eCryptfs: unrecognized option [%s]\n",
363 p); 408 __func__, p);
364 } 409 }
365 } 410 }
366 if (!sig_set) { 411 if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
374 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); 419 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
375 420
376 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); 421 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
377
378 strcpy(mount_crypt_stat->global_default_cipher_name, 422 strcpy(mount_crypt_stat->global_default_cipher_name,
379 ECRYPTFS_DEFAULT_CIPHER); 423 ECRYPTFS_DEFAULT_CIPHER);
380 } 424 }
381 if (!cipher_key_bytes_set) { 425 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
426 && !fn_cipher_name_set)
427 strcpy(mount_crypt_stat->global_default_fn_cipher_name,
428 mount_crypt_stat->global_default_cipher_name);
429 if (!cipher_key_bytes_set)
382 mount_crypt_stat->global_default_cipher_key_size = 0; 430 mount_crypt_stat->global_default_cipher_key_size = 0;
383 } 431 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
432 && !fn_cipher_key_bytes_set)
433 mount_crypt_stat->global_default_fn_cipher_key_bytes =
434 mount_crypt_stat->global_default_cipher_key_size;
384 mutex_lock(&key_tfm_list_mutex); 435 mutex_lock(&key_tfm_list_mutex);
385 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, 436 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
386 NULL)) 437 NULL)) {
387 rc = ecryptfs_add_new_key_tfm( 438 rc = ecryptfs_add_new_key_tfm(
388 NULL, mount_crypt_stat->global_default_cipher_name, 439 NULL, mount_crypt_stat->global_default_cipher_name,
389 mount_crypt_stat->global_default_cipher_key_size); 440 mount_crypt_stat->global_default_cipher_key_size);
390 mutex_unlock(&key_tfm_list_mutex); 441 if (rc) {
391 if (rc) { 442 printk(KERN_ERR "Error attempting to initialize "
392 printk(KERN_ERR "Error attempting to initialize cipher with " 443 "cipher with name = [%s] and key size = [%td]; "
393 "name = [%s] and key size = [%td]; rc = [%d]\n", 444 "rc = [%d]\n",
394 mount_crypt_stat->global_default_cipher_name, 445 mount_crypt_stat->global_default_cipher_name,
395 mount_crypt_stat->global_default_cipher_key_size, rc); 446 mount_crypt_stat->global_default_cipher_key_size,
396 rc = -EINVAL; 447 rc);
397 goto out; 448 rc = -EINVAL;
449 mutex_unlock(&key_tfm_list_mutex);
450 goto out;
451 }
398 } 452 }
453 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
454 && !ecryptfs_tfm_exists(
455 mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
456 rc = ecryptfs_add_new_key_tfm(
457 NULL, mount_crypt_stat->global_default_fn_cipher_name,
458 mount_crypt_stat->global_default_fn_cipher_key_bytes);
459 if (rc) {
460 printk(KERN_ERR "Error attempting to initialize "
461 "cipher with name = [%s] and key size = [%td]; "
462 "rc = [%d]\n",
463 mount_crypt_stat->global_default_fn_cipher_name,
464 mount_crypt_stat->global_default_fn_cipher_key_bytes,
465 rc);
466 rc = -EINVAL;
467 mutex_unlock(&key_tfm_list_mutex);
468 goto out;
469 }
470 }
471 mutex_unlock(&key_tfm_list_mutex);
399 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); 472 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
400 if (rc) { 473 if (rc)
401 printk(KERN_WARNING "One or more global auth toks could not " 474 printk(KERN_WARNING "One or more global auth toks could not "
402 "properly register; rc = [%d]\n", rc); 475 "properly register; rc = [%d]\n", rc);
403 }
404out: 476out:
405 return rc; 477 return rc;
406} 478}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624d..96ef51489e01 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); 193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
194 if (!(*daemon)) { 194 if (!(*daemon)) {
195 rc = -ENOMEM; 195 rc = -ENOMEM;
196 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 196 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); 197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
198 goto out; 198 goto out;
199 } 199 }
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
436 if (!msg_ctx->msg) { 436 if (!msg_ctx->msg) {
437 rc = -ENOMEM; 437 rc = -ENOMEM;
438 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 438 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
439 "GFP_KERNEL memory\n", __func__, msg_size); 439 "GFP_KERNEL memory\n", __func__, msg_size);
440 goto unlock; 440 goto unlock;
441 } 441 }
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1ea..a67fea655f49 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
199 if (!msg_ctx->msg) { 199 if (!msg_ctx->msg) {
200 rc = -ENOMEM; 200 rc = -ENOMEM;
201 printk(KERN_ERR "%s: Out of memory whilst attempting " 201 printk(KERN_ERR "%s: Out of memory whilst attempting "
202 "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, 202 "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
203 (sizeof(*msg_ctx->msg) + data_size)); 203 (sizeof(*msg_ctx->msg) + data_size));
204 goto out_unlock; 204 goto out_unlock;
205 } 205 }
@@ -322,7 +322,7 @@ check_list:
322 if (count < total_length) { 322 if (count < total_length) {
323 rc = 0; 323 rc = 0;
324 printk(KERN_WARNING "%s: Only given user buffer of " 324 printk(KERN_WARNING "%s: Only given user buffer of "
325 "size [%Zd], but we need [%Zd] to read the " 325 "size [%zd], but we need [%zd] to read the "
326 "pending message\n", __func__, count, total_length); 326 "pending message\n", __func__, count, total_length);
327 goto out_unlock_msg_ctx; 327 goto out_unlock_msg_ctx;
328 } 328 }
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
376 376
377 if ((sizeof(*msg) + msg->data_len) != data_size) { 377 if ((sizeof(*msg) + msg->data_len) != data_size) {
378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " 378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
379 "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, 379 "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
380 (sizeof(*msg) + msg->data_len), data_size); 380 (sizeof(*msg) + msg->data_len), data_size);
381 rc = -EINVAL; 381 rc = -EINVAL;
382 goto out; 382 goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
421 data = kmalloc(count, GFP_KERNEL); 421 data = kmalloc(count, GFP_KERNEL);
422 if (!data) { 422 if (!data) {
423 printk(KERN_ERR "%s: Out of memory whilst attempting to " 423 printk(KERN_ERR "%s: Out of memory whilst attempting to "
424 "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); 424 "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
425 goto out; 425 goto out;
426 } 426 }
427 rc = copy_from_user(data, buf, count); 427 rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
436 case ECRYPTFS_MSG_RESPONSE: 436 case ECRYPTFS_MSG_RESPONSE:
437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { 437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
438 printk(KERN_WARNING "%s: Minimum acceptable packet " 438 printk(KERN_WARNING "%s: Minimum acceptable packet "
439 "size is [%Zd], but amount of data written is " 439 "size is [%zd], but amount of data written is "
440 "only [%Zd]. Discarding response packet.\n", 440 "only [%zd]. Discarding response packet.\n",
441 __func__, 441 __func__,
442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)), 442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
443 count); 443 count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
455 } 455 }
456 i += packet_size_length; 456 i += packet_size_length;
457 if ((1 + 4 + packet_size_length + packet_size) != count) { 457 if ((1 + 4 + packet_size_length + packet_size) != count) {
458 printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" 458 printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
459 " + packet_size([%Zd]))([%Zd]) != " 459 " + packet_size([%zd]))([%zd]) != "
460 "count([%Zd]). Invalid packet format.\n", 460 "count([%zd]). Invalid packet format.\n",
461 __func__, packet_size_length, packet_size, 461 __func__, packet_size_length, packet_size,
462 (1 + packet_size_length + packet_size), count); 462 (1 + packet_size_length + packet_size), count);
463 goto out_free; 463 goto out_free;
diff --git a/fs/exec.c b/fs/exec.c
index 3ef9cf9b1871..71a6efe5d8bd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
51#include <linux/audit.h> 51#include <linux/audit.h>
52#include <linux/tracehook.h> 52#include <linux/tracehook.h>
53#include <linux/kmod.h> 53#include <linux/kmod.h>
54#include <linux/fsnotify.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
@@ -132,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library)
132 if (IS_ERR(file)) 133 if (IS_ERR(file))
133 goto out; 134 goto out;
134 135
136 fsnotify_open(file->f_path.dentry);
137
135 error = -ENOEXEC; 138 error = -ENOEXEC;
136 if(file->f_op) { 139 if(file->f_op) {
137 struct linux_binfmt * fmt; 140 struct linux_binfmt * fmt;
@@ -229,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
229 232
230static int __bprm_mm_init(struct linux_binprm *bprm) 233static int __bprm_mm_init(struct linux_binprm *bprm)
231{ 234{
232 int err = -ENOMEM; 235 int err;
233 struct vm_area_struct *vma = NULL; 236 struct vm_area_struct *vma = NULL;
234 struct mm_struct *mm = bprm->mm; 237 struct mm_struct *mm = bprm->mm;
235 238
236 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 239 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
237 if (!vma) 240 if (!vma)
238 goto err; 241 return -ENOMEM;
239 242
240 down_write(&mm->mmap_sem); 243 down_write(&mm->mmap_sem);
241 vma->vm_mm = mm; 244 vma->vm_mm = mm;
@@ -248,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
248 */ 251 */
249 vma->vm_end = STACK_TOP_MAX; 252 vma->vm_end = STACK_TOP_MAX;
250 vma->vm_start = vma->vm_end - PAGE_SIZE; 253 vma->vm_start = vma->vm_end - PAGE_SIZE;
251
252 vma->vm_flags = VM_STACK_FLAGS; 254 vma->vm_flags = VM_STACK_FLAGS;
253 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 255 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
254 err = insert_vm_struct(mm, vma); 256 err = insert_vm_struct(mm, vma);
255 if (err) { 257 if (err)
256 up_write(&mm->mmap_sem);
257 goto err; 258 goto err;
258 }
259 259
260 mm->stack_vm = mm->total_vm = 1; 260 mm->stack_vm = mm->total_vm = 1;
261 up_write(&mm->mmap_sem); 261 up_write(&mm->mmap_sem);
262
263 bprm->p = vma->vm_end - sizeof(void *); 262 bprm->p = vma->vm_end - sizeof(void *);
264
265 return 0; 263 return 0;
266
267err: 264err:
268 if (vma) { 265 up_write(&mm->mmap_sem);
269 bprm->vma = NULL; 266 bprm->vma = NULL;
270 kmem_cache_free(vm_area_cachep, vma); 267 kmem_cache_free(vm_area_cachep, vma);
271 }
272
273 return err; 268 return err;
274} 269}
275 270
@@ -684,6 +679,8 @@ struct file *open_exec(const char *name)
684 if (IS_ERR(file)) 679 if (IS_ERR(file))
685 return file; 680 return file;
686 681
682 fsnotify_open(file->f_path.dentry);
683
687 err = deny_write_access(file); 684 err = deny_write_access(file);
688 if (err) { 685 if (err) {
689 fput(file); 686 fput(file);
@@ -1689,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm)
1689 return (ret >= 2) ? 2 : ret; 1686 return (ret >= 2) ? 2 : ret;
1690} 1687}
1691 1688
1692int do_coredump(long signr, int exit_code, struct pt_regs * regs) 1689void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1693{ 1690{
1694 struct core_state core_state; 1691 struct core_state core_state;
1695 char corename[CORENAME_MAX_SIZE + 1]; 1692 char corename[CORENAME_MAX_SIZE + 1];
@@ -1773,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1773 1770
1774 if (ispipe) { 1771 if (ispipe) {
1775 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1772 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1773 if (!helper_argv) {
1774 printk(KERN_WARNING "%s failed to allocate memory\n",
1775 __func__);
1776 goto fail_unlock;
1777 }
1776 /* Terminate the string before the first option */ 1778 /* Terminate the string before the first option */
1777 delimit = strchr(corename, ' '); 1779 delimit = strchr(corename, ' ');
1778 if (delimit) 1780 if (delimit)
@@ -1840,5 +1842,5 @@ fail_unlock:
1840 put_cred(cred); 1842 put_cred(cred);
1841 coredump_finish(mm); 1843 coredump_finish(mm);
1842fail: 1844fail:
1843 return retval; 1845 return;
1844} 1846}
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c454d5db28a5..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
565 inode->i_blocks = 0; 565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567 memset(ei->i_data, 0, sizeof(ei->i_data)); 567 memset(ei->i_data, 0, sizeof(ei->i_data));
568 ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; 568 ei->i_flags =
569 if (S_ISLNK(mode)) 569 ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
570 ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
571 /* dirsync is only applied to directories */
572 if (!S_ISDIR(mode))
573 ei->i_flags &= ~EXT2_DIRSYNC_FL;
574 ei->i_faddr = 0; 570 ei->i_faddr = 0;
575 ei->i_frag_no = 0; 571 ei->i_frag_no = 0;
576 ei->i_frag_size = 0; 572 ei->i_frag_size = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 02b39a5deb74..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
498 * ext2_splice_branch - splice the allocated branch onto inode. 498 * ext2_splice_branch - splice the allocated branch onto inode.
499 * @inode: owner 499 * @inode: owner
500 * @block: (logical) number of block we are adding 500 * @block: (logical) number of block we are adding
501 * @chain: chain of indirect blocks (with a missing link - see
502 * ext2_alloc_branch)
503 * @where: location of missing link 501 * @where: location of missing link
504 * @num: number of indirect blocks we are adding 502 * @num: number of indirect blocks we are adding
505 * @blks: number of direct blocks we are adding 503 * @blks: number of direct blocks we are adding
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
50 goto setflags_out; 50 goto setflags_out;
51 } 51 }
52 52
53 if (!S_ISDIR(inode->i_mode)) 53 flags = ext2_mask_flags(inode->i_mode, flags);
54 flags &= ~EXT2_DIRSYNC_FL;
55 54
56 mutex_lock(&inode->i_mutex); 55 mutex_lock(&inode->i_mutex);
57 /* Is it quota file? Do not allow user to mess with it */ 56 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
132 percpu_counter_destroy(&sbi->s_dirs_counter); 132 percpu_counter_destroy(&sbi->s_dirs_counter);
133 brelse (sbi->s_sbh); 133 brelse (sbi->s_sbh);
134 sb->s_fs_info = NULL; 134 sb->s_fs_info = NULL;
135 kfree(sbi->s_blockgroup_lock);
135 kfree(sbi); 136 kfree(sbi);
136 137
137 return; 138 return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
756 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 757 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
757 if (!sbi) 758 if (!sbi)
758 return -ENOMEM; 759 return -ENOMEM;
760
761 sbi->s_blockgroup_lock =
762 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
763 if (!sbi->s_blockgroup_lock) {
764 kfree(sbi);
765 return -ENOMEM;
766 }
759 sb->s_fs_info = sbi; 767 sb->s_fs_info = sbi;
760 sbi->s_sb_block = sb_block; 768 sbi->s_sb_block = sb_block;
761 769
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
983 printk ("EXT2-fs: not enough memory\n"); 991 printk ("EXT2-fs: not enough memory\n");
984 goto failed_mount; 992 goto failed_mount;
985 } 993 }
986 bgl_lock_init(&sbi->s_blockgroup_lock); 994 bgl_lock_init(sbi->s_blockgroup_lock);
987 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 995 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
988 if (!sbi->s_debts) { 996 if (!sbi->s_debts) {
989 printk ("EXT2-fs: not enough memory\n"); 997 printk ("EXT2-fs: not enough memory\n");
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 5655fbcbd11f..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
559 ei->i_dir_start_lookup = 0; 559 ei->i_dir_start_lookup = 0;
560 ei->i_disksize = 0; 560 ei->i_disksize = 0;
561 561
562 ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; 562 ei->i_flags =
563 if (S_ISLNK(mode)) 563 ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
564 ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
565 /* dirsync only applies to directories */
566 if (!S_ISDIR(mode))
567 ei->i_flags &= ~EXT3_DIRSYNC_FL;
568#ifdef EXT3_FRAGMENTS 564#ifdef EXT3_FRAGMENTS
569 ei->i_faddr = 0; 565 ei->i_faddr = 0;
570 ei->i_frag_no = 0; 566 ei->i_frag_no = 0;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
53 goto flags_out; 53 goto flags_out;
54 } 54 }
55 55
56 if (!S_ISDIR(inode->i_mode)) 56 flags = ext3_mask_flags(inode->i_mode, flags);
57 flags &= ~EXT3_DIRSYNC_FL;
58 57
59 mutex_lock(&inode->i_mutex); 58 mutex_lock(&inode->i_mutex);
60 /* Is it quota file? Do not allow user to mess with it */ 59 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2c2d700c1ccf..69a3d19ca9fd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6900ff05e3ab..5d047a030a73 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
439 ext3_blkdev_remove(sbi); 439 ext3_blkdev_remove(sbi);
440 } 440 }
441 sb->s_fs_info = NULL; 441 sb->s_fs_info = NULL;
442 kfree(sbi->s_blockgroup_lock);
442 kfree(sbi); 443 kfree(sbi);
443 return; 444 return;
444} 445}
@@ -733,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = {
733 .acquire_dquot = ext3_acquire_dquot, 734 .acquire_dquot = ext3_acquire_dquot,
734 .release_dquot = ext3_release_dquot, 735 .release_dquot = ext3_release_dquot,
735 .mark_dirty = ext3_mark_dquot_dirty, 736 .mark_dirty = ext3_mark_dquot_dirty,
736 .write_info = ext3_write_info 737 .write_info = ext3_write_info,
738 .alloc_dquot = dquot_alloc,
739 .destroy_dquot = dquot_destroy,
737}; 740};
738 741
739static struct quotactl_ops ext3_qctl_operations = { 742static struct quotactl_ops ext3_qctl_operations = {
@@ -1056,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb,
1056 case Opt_grpjquota: 1059 case Opt_grpjquota:
1057 qtype = GRPQUOTA; 1060 qtype = GRPQUOTA;
1058set_qf_name: 1061set_qf_name:
1059 if ((sb_any_quota_enabled(sb) || 1062 if (sb_any_quota_loaded(sb) &&
1060 sb_any_quota_suspended(sb)) &&
1061 !sbi->s_qf_names[qtype]) { 1063 !sbi->s_qf_names[qtype]) {
1062 printk(KERN_ERR 1064 printk(KERN_ERR
1063 "EXT3-fs: Cannot change journaled " 1065 "EXT3-fs: Cannot change journaled "
@@ -1096,8 +1098,7 @@ set_qf_name:
1096 case Opt_offgrpjquota: 1098 case Opt_offgrpjquota:
1097 qtype = GRPQUOTA; 1099 qtype = GRPQUOTA;
1098clear_qf_name: 1100clear_qf_name:
1099 if ((sb_any_quota_enabled(sb) || 1101 if (sb_any_quota_loaded(sb) &&
1100 sb_any_quota_suspended(sb)) &&
1101 sbi->s_qf_names[qtype]) { 1102 sbi->s_qf_names[qtype]) {
1102 printk(KERN_ERR "EXT3-fs: Cannot change " 1103 printk(KERN_ERR "EXT3-fs: Cannot change "
1103 "journaled quota options when " 1104 "journaled quota options when "
@@ -1116,8 +1117,7 @@ clear_qf_name:
1116 case Opt_jqfmt_vfsv0: 1117 case Opt_jqfmt_vfsv0:
1117 qfmt = QFMT_VFS_V0; 1118 qfmt = QFMT_VFS_V0;
1118set_qf_format: 1119set_qf_format:
1119 if ((sb_any_quota_enabled(sb) || 1120 if (sb_any_quota_loaded(sb) &&
1120 sb_any_quota_suspended(sb)) &&
1121 sbi->s_jquota_fmt != qfmt) { 1121 sbi->s_jquota_fmt != qfmt) {
1122 printk(KERN_ERR "EXT3-fs: Cannot change " 1122 printk(KERN_ERR "EXT3-fs: Cannot change "
1123 "journaled quota options when " 1123 "journaled quota options when "
@@ -1136,8 +1136,7 @@ set_qf_format:
1136 set_opt(sbi->s_mount_opt, GRPQUOTA); 1136 set_opt(sbi->s_mount_opt, GRPQUOTA);
1137 break; 1137 break;
1138 case Opt_noquota: 1138 case Opt_noquota:
1139 if (sb_any_quota_enabled(sb) || 1139 if (sb_any_quota_loaded(sb)) {
1140 sb_any_quota_suspended(sb)) {
1141 printk(KERN_ERR "EXT3-fs: Cannot change quota " 1140 printk(KERN_ERR "EXT3-fs: Cannot change quota "
1142 "options when quota turned on.\n"); 1141 "options when quota turned on.\n");
1143 return 0; 1142 return 0;
@@ -1569,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1569 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 1568 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1570 if (!sbi) 1569 if (!sbi)
1571 return -ENOMEM; 1570 return -ENOMEM;
1571
1572 sbi->s_blockgroup_lock =
1573 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
1574 if (!sbi->s_blockgroup_lock) {
1575 kfree(sbi);
1576 return -ENOMEM;
1577 }
1572 sb->s_fs_info = sbi; 1578 sb->s_fs_info = sbi;
1573 sbi->s_mount_opt = 0; 1579 sbi->s_mount_opt = 0;
1574 sbi->s_resuid = EXT3_DEF_RESUID; 1580 sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1821,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1821 goto failed_mount; 1827 goto failed_mount;
1822 } 1828 }
1823 1829
1824 bgl_lock_init(&sbi->s_blockgroup_lock); 1830 bgl_lock_init(sbi->s_blockgroup_lock);
1825 1831
1826 for (i = 0; i < db_count; i++) { 1832 for (i = 0; i < db_count; i++) {
1827 block = descriptor_loc(sb, logic_sb_block, i); 1833 block = descriptor_loc(sb, logic_sb_block, i);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index db1718833f58..c668e4377d76 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1247,11 +1247,11 @@ do { \
1247} while (0) 1247} while (0)
1248 1248
1249#ifdef CONFIG_SMP 1249#ifdef CONFIG_SMP
1250/* Each CPU can accumulate FBC_BATCH blocks in their local 1250/* Each CPU can accumulate percpu_counter_batch blocks in their local
1251 * counters. So we need to make sure we have free blocks more 1251 * counters. So we need to make sure we have free blocks more
1252 * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. 1252 * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
1253 */ 1253 */
1254#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) 1254#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
1255#else 1255#else
1256#define EXT4_FREEBLOCKS_WATERMARK 0 1256#define EXT4_FREEBLOCKS_WATERMARK 0
1257#endif 1257#endif
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 240cf0daad4b..54bf0623a9ae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2533,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2533 */ 2533 */
2534 newdepth = ext_depth(inode); 2534 newdepth = ext_depth(inode);
2535 /* 2535 /*
2536 * update the extent length after successfull insert of the 2536 * update the extent length after successful insert of the
2537 * split extent 2537 * split extent
2538 */ 2538 */
2539 orig_ex.ee_len = cpu_to_le16(ee_len - 2539 orig_ex.ee_len = cpu_to_le16(ee_len -
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4cac8da4e0c1..a6444cee0c7e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2592,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2592 /* 2592 /*
2593 * switch to non delalloc mode if we are running low 2593 * switch to non delalloc mode if we are running low
2594 * on free block. The free block accounting via percpu 2594 * on free block. The free block accounting via percpu
2595 * counters can get slightly wrong with FBC_BATCH getting 2595 * counters can get slightly wrong with percpu_counter_batch getting
2596 * accumulated on each CPU without updating global counters 2596 * accumulated on each CPU without updating global counters
2597 * Delalloc need an accurate free block accounting. So switch 2597 * Delalloc need an accurate free block accounting. So switch
2598 * to non delalloc when we are near to error range. 2598 * to non delalloc when we are near to error range.
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 183a09a8b14e..fec0b4c2f5f1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index acb69c00fd42..8f7e0be8ab1b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -953,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = {
953 .acquire_dquot = ext4_acquire_dquot, 953 .acquire_dquot = ext4_acquire_dquot,
954 .release_dquot = ext4_release_dquot, 954 .release_dquot = ext4_release_dquot,
955 .mark_dirty = ext4_mark_dquot_dirty, 955 .mark_dirty = ext4_mark_dquot_dirty,
956 .write_info = ext4_write_info 956 .write_info = ext4_write_info,
957 .alloc_dquot = dquot_alloc,
958 .destroy_dquot = dquot_destroy,
957}; 959};
958 960
959static struct quotactl_ops ext4_qctl_operations = { 961static struct quotactl_ops ext4_qctl_operations = {
@@ -1302,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb,
1302 case Opt_grpjquota: 1304 case Opt_grpjquota:
1303 qtype = GRPQUOTA; 1305 qtype = GRPQUOTA;
1304set_qf_name: 1306set_qf_name:
1305 if ((sb_any_quota_enabled(sb) || 1307 if (sb_any_quota_loaded(sb) &&
1306 sb_any_quota_suspended(sb)) &&
1307 !sbi->s_qf_names[qtype]) { 1308 !sbi->s_qf_names[qtype]) {
1308 printk(KERN_ERR 1309 printk(KERN_ERR
1309 "EXT4-fs: Cannot change journaled " 1310 "EXT4-fs: Cannot change journaled "
@@ -1342,8 +1343,7 @@ set_qf_name:
1342 case Opt_offgrpjquota: 1343 case Opt_offgrpjquota:
1343 qtype = GRPQUOTA; 1344 qtype = GRPQUOTA;
1344clear_qf_name: 1345clear_qf_name:
1345 if ((sb_any_quota_enabled(sb) || 1346 if (sb_any_quota_loaded(sb) &&
1346 sb_any_quota_suspended(sb)) &&
1347 sbi->s_qf_names[qtype]) { 1347 sbi->s_qf_names[qtype]) {
1348 printk(KERN_ERR "EXT4-fs: Cannot change " 1348 printk(KERN_ERR "EXT4-fs: Cannot change "
1349 "journaled quota options when " 1349 "journaled quota options when "
@@ -1362,8 +1362,7 @@ clear_qf_name:
1362 case Opt_jqfmt_vfsv0: 1362 case Opt_jqfmt_vfsv0:
1363 qfmt = QFMT_VFS_V0; 1363 qfmt = QFMT_VFS_V0;
1364set_qf_format: 1364set_qf_format:
1365 if ((sb_any_quota_enabled(sb) || 1365 if (sb_any_quota_loaded(sb) &&
1366 sb_any_quota_suspended(sb)) &&
1367 sbi->s_jquota_fmt != qfmt) { 1366 sbi->s_jquota_fmt != qfmt) {
1368 printk(KERN_ERR "EXT4-fs: Cannot change " 1367 printk(KERN_ERR "EXT4-fs: Cannot change "
1369 "journaled quota options when " 1368 "journaled quota options when "
@@ -1382,7 +1381,7 @@ set_qf_format:
1382 set_opt(sbi->s_mount_opt, GRPQUOTA); 1381 set_opt(sbi->s_mount_opt, GRPQUOTA);
1383 break; 1382 break;
1384 case Opt_noquota: 1383 case Opt_noquota:
1385 if (sb_any_quota_enabled(sb)) { 1384 if (sb_any_quota_loaded(sb)) {
1386 printk(KERN_ERR "EXT4-fs: Cannot change quota " 1385 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1387 "options when quota turned on.\n"); 1386 "options when quota turned on.\n");
1388 return 0; 1387 return 0;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62dd..d488dcd7f2bb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
253module_init(proc_filesystems_init); 253module_init(proc_filesystems_init);
254#endif 254#endif
255 255
256struct file_system_type *get_fs_type(const char *name) 256static struct file_system_type *__get_fs_type(const char *name, int len)
257{ 257{
258 struct file_system_type *fs; 258 struct file_system_type *fs;
259 const char *dot = strchr(name, '.');
260 unsigned len = dot ? dot - name : strlen(name);
261 259
262 read_lock(&file_systems_lock); 260 read_lock(&file_systems_lock);
263 fs = *(find_filesystem(name, len)); 261 fs = *(find_filesystem(name, len));
264 if (fs && !try_module_get(fs->owner)) 262 if (fs && !try_module_get(fs->owner))
265 fs = NULL; 263 fs = NULL;
266 read_unlock(&file_systems_lock); 264 read_unlock(&file_systems_lock);
267 if (!fs && (request_module("%.*s", len, name) == 0)) { 265 return fs;
268 read_lock(&file_systems_lock); 266}
269 fs = *(find_filesystem(name, len)); 267
270 if (fs && !try_module_get(fs->owner)) 268struct file_system_type *get_fs_type(const char *name)
271 fs = NULL; 269{
272 read_unlock(&file_systems_lock); 270 struct file_system_type *fs;
273 } 271 const char *dot = strchr(name, '.');
272 int len = dot ? dot - name : strlen(name);
273
274 fs = __get_fs_type(name, len);
275 if (!fs && (request_module("%.*s", len, name) == 0))
276 fs = __get_fs_type(name, len);
274 277
275 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { 278 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
276 put_filesystem(fs); 279 put_filesystem(fs);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf309..e5eaa62fd17f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
421 * If we're a pdlfush thread, then implement pdflush collision avoidance 421 * If we're a pdlfush thread, then implement pdflush collision avoidance
422 * against the entire list. 422 * against the entire list.
423 * 423 *
424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
425 * that it can be located for waiting on in __writeback_single_inode().
426 *
427 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 424 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
428 * This function assumes that the blockdev superblock's inodes are backed by 425 * This function assumes that the blockdev superblock's inodes are backed by
429 * a variety of queues, so all inodes are searched. For other superblocks, 426 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
443 struct writeback_control *wbc) 440 struct writeback_control *wbc)
444{ 441{
445 const unsigned long start = jiffies; /* livelock avoidance */ 442 const unsigned long start = jiffies; /* livelock avoidance */
443 int sync = wbc->sync_mode == WB_SYNC_ALL;
446 444
447 spin_lock(&inode_lock); 445 spin_lock(&inode_lock);
448 if (!wbc->for_kupdate || list_empty(&sb->s_io)) 446 if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
499 __iget(inode); 497 __iget(inode);
500 pages_skipped = wbc->pages_skipped; 498 pages_skipped = wbc->pages_skipped;
501 __writeback_single_inode(inode, wbc); 499 __writeback_single_inode(inode, wbc);
502 if (wbc->sync_mode == WB_SYNC_HOLD) {
503 inode->dirtied_when = jiffies;
504 list_move(&inode->i_list, &sb->s_dirty);
505 }
506 if (current_is_pdflush()) 500 if (current_is_pdflush())
507 writeback_release(bdi); 501 writeback_release(bdi);
508 if (wbc->pages_skipped != pages_skipped) { 502 if (wbc->pages_skipped != pages_skipped) {
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
523 if (!list_empty(&sb->s_more_io)) 517 if (!list_empty(&sb->s_more_io))
524 wbc->more_io = 1; 518 wbc->more_io = 1;
525 } 519 }
526 spin_unlock(&inode_lock); 520
521 if (sync) {
522 struct inode *inode, *old_inode = NULL;
523
524 /*
525 * Data integrity sync. Must wait for all pages under writeback,
526 * because there may have been pages dirtied before our sync
527 * call, but which had writeout started before we write it out.
528 * In which case, the inode may not be on the dirty list, but
529 * we still have to wait for that writeout.
530 */
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping;
533
534 if (inode->i_state & (I_FREEING|I_WILL_FREE))
535 continue;
536 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0)
538 continue;
539 __iget(inode);
540 spin_unlock(&inode_lock);
541 /*
542 * We hold a reference to 'inode' so it couldn't have
543 * been removed from s_inodes list while we dropped the
544 * inode_lock. We cannot iput the inode now as we can
545 * be holding the last reference and we cannot iput it
546 * under inode_lock. So we keep the reference and iput
547 * it later.
548 */
549 iput(old_inode);
550 old_inode = inode;
551
552 filemap_fdatawait(mapping);
553
554 cond_resched();
555
556 spin_lock(&inode_lock);
557 }
558 spin_unlock(&inode_lock);
559 iput(old_inode);
560 } else
561 spin_unlock(&inode_lock);
562
527 return; /* Leave any unwritten inodes on s_io */ 563 return; /* Leave any unwritten inodes on s_io */
528} 564}
529EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); 565EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
@@ -588,8 +624,7 @@ restart:
588 624
589/* 625/*
590 * writeback and wait upon the filesystem's dirty inodes. The caller will 626 * writeback and wait upon the filesystem's dirty inodes. The caller will
591 * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is 627 * do this in two passes - one to write, and one to wait.
592 * used to park the written inodes on sb->s_dirty for the wait pass.
593 * 628 *
594 * A finite limit is set on the number of pages which will be written. 629 * A finite limit is set on the number of pages which will be written.
595 * To prevent infinite livelock of sys_sync(). 630 * To prevent infinite livelock of sys_sync().
@@ -600,30 +635,21 @@ restart:
600void sync_inodes_sb(struct super_block *sb, int wait) 635void sync_inodes_sb(struct super_block *sb, int wait)
601{ 636{
602 struct writeback_control wbc = { 637 struct writeback_control wbc = {
603 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, 638 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
604 .range_start = 0, 639 .range_start = 0,
605 .range_end = LLONG_MAX, 640 .range_end = LLONG_MAX,
606 }; 641 };
607 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
608 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
609 642
610 wbc.nr_to_write = nr_dirty + nr_unstable + 643 if (!wait) {
611 (inodes_stat.nr_inodes - inodes_stat.nr_unused) + 644 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
612 nr_dirty + nr_unstable; 645 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
613 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
614 sync_sb_inodes(sb, &wbc);
615}
616 646
617/* 647 wbc.nr_to_write = nr_dirty + nr_unstable +
618 * Rather lame livelock avoidance. 648 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
619 */ 649 } else
620static void set_sb_syncing(int val) 650 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
621{ 651
622 struct super_block *sb; 652 sync_sb_inodes(sb, &wbc);
623 spin_lock(&sb_lock);
624 list_for_each_entry_reverse(sb, &super_blocks, s_list)
625 sb->s_syncing = val;
626 spin_unlock(&sb_lock);
627} 653}
628 654
629/** 655/**
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait)
652 spin_lock(&sb_lock); 678 spin_lock(&sb_lock);
653restart: 679restart:
654 list_for_each_entry(sb, &super_blocks, s_list) { 680 list_for_each_entry(sb, &super_blocks, s_list) {
655 if (sb->s_syncing)
656 continue;
657 sb->s_syncing = 1;
658 sb->s_count++; 681 sb->s_count++;
659 spin_unlock(&sb_lock); 682 spin_unlock(&sb_lock);
660 down_read(&sb->s_umount); 683 down_read(&sb->s_umount);
@@ -672,13 +695,10 @@ restart:
672 695
673void sync_inodes(int wait) 696void sync_inodes(int wait)
674{ 697{
675 set_sb_syncing(0);
676 __sync_inodes(0); 698 __sync_inodes(0);
677 699
678 if (wait) { 700 if (wait)
679 set_sb_syncing(0);
680 __sync_inodes(1); 701 __sync_inodes(1);
681 }
682} 702}
683 703
684/** 704/**
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab321415..99c99dfb0373 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
48 size_t size; 48 size_t size;
49 49
50 if (!*ppos) { 50 if (!*ppos) {
51 long value;
51 struct fuse_conn *fc = fuse_ctl_file_conn_get(file); 52 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
52 if (!fc) 53 if (!fc)
53 return 0; 54 return 0;
54 55
55 file->private_data=(void *)(long)atomic_read(&fc->num_waiting); 56 value = atomic_read(&fc->num_waiting);
57 file->private_data = (void *)value;
56 fuse_conn_put(fc); 58 fuse_conn_put(fc);
57 } 59 }
58 size = sprintf(tmp, "%ld\n", (long)file->private_data); 60 size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fba571648a8e..e0c7ada08a1f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
269 * Called with fc->lock, unlocks it 269 * Called with fc->lock, unlocks it
270 */ 270 */
271static void request_end(struct fuse_conn *fc, struct fuse_req *req) 271static void request_end(struct fuse_conn *fc, struct fuse_req *req)
272 __releases(fc->lock) 272__releases(&fc->lock)
273{ 273{
274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
275 req->end = NULL; 275 req->end = NULL;
@@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
293 wake_up(&req->waitq); 293 wake_up(&req->waitq);
294 if (end) 294 if (end)
295 end(fc, req); 295 end(fc, req);
296 else 296 fuse_put_request(fc, req);
297 fuse_put_request(fc, req);
298} 297}
299 298
300static void wait_answer_interruptible(struct fuse_conn *fc, 299static void wait_answer_interruptible(struct fuse_conn *fc,
301 struct fuse_req *req) 300 struct fuse_req *req)
302 __releases(fc->lock) __acquires(fc->lock) 301__releases(&fc->lock)
302__acquires(&fc->lock)
303{ 303{
304 if (signal_pending(current)) 304 if (signal_pending(current))
305 return; 305 return;
@@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
317} 317}
318 318
319static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 319static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
320 __releases(fc->lock) __acquires(fc->lock) 320__releases(&fc->lock)
321__acquires(&fc->lock)
321{ 322{
322 if (!fc->no_interrupt) { 323 if (!fc->no_interrupt) {
323 /* Any signal may interrupt this */ 324 /* Any signal may interrupt this */
@@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
380 } 381 }
381} 382}
382 383
383void request_send(struct fuse_conn *fc, struct fuse_req *req) 384void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
384{ 385{
385 req->isreply = 1; 386 req->isreply = 1;
386 spin_lock(&fc->lock); 387 spin_lock(&fc->lock);
@@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
399 spin_unlock(&fc->lock); 400 spin_unlock(&fc->lock);
400} 401}
401 402
402static void request_send_nowait_locked(struct fuse_conn *fc, 403static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
403 struct fuse_req *req) 404 struct fuse_req *req)
404{ 405{
405 req->background = 1; 406 req->background = 1;
406 fc->num_background++; 407 fc->num_background++;
@@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
414 flush_bg_queue(fc); 415 flush_bg_queue(fc);
415} 416}
416 417
417static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) 418static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
418{ 419{
419 spin_lock(&fc->lock); 420 spin_lock(&fc->lock);
420 if (fc->connected) { 421 if (fc->connected) {
421 request_send_nowait_locked(fc, req); 422 fuse_request_send_nowait_locked(fc, req);
422 spin_unlock(&fc->lock); 423 spin_unlock(&fc->lock);
423 } else { 424 } else {
424 req->out.h.error = -ENOTCONN; 425 req->out.h.error = -ENOTCONN;
@@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
426 } 427 }
427} 428}
428 429
429void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) 430void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
430{ 431{
431 req->isreply = 0; 432 req->isreply = 0;
432 request_send_nowait(fc, req); 433 fuse_request_send_nowait(fc, req);
433} 434}
434 435
435void request_send_background(struct fuse_conn *fc, struct fuse_req *req) 436void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
436{ 437{
437 req->isreply = 1; 438 req->isreply = 1;
438 request_send_nowait(fc, req); 439 fuse_request_send_nowait(fc, req);
439} 440}
440 441
441/* 442/*
@@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
443 * 444 *
444 * fc->connected must have been checked previously 445 * fc->connected must have been checked previously
445 */ 446 */
446void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) 447void fuse_request_send_background_locked(struct fuse_conn *fc,
448 struct fuse_req *req)
447{ 449{
448 req->isreply = 1; 450 req->isreply = 1;
449 request_send_nowait_locked(fc, req); 451 fuse_request_send_nowait_locked(fc, req);
450} 452}
451 453
452/* 454/*
@@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
539 BUG_ON(!cs->nr_segs); 541 BUG_ON(!cs->nr_segs);
540 cs->seglen = cs->iov[0].iov_len; 542 cs->seglen = cs->iov[0].iov_len;
541 cs->addr = (unsigned long) cs->iov[0].iov_base; 543 cs->addr = (unsigned long) cs->iov[0].iov_base;
542 cs->iov ++; 544 cs->iov++;
543 cs->nr_segs --; 545 cs->nr_segs--;
544 } 546 }
545 down_read(&current->mm->mmap_sem); 547 down_read(&current->mm->mmap_sem);
546 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, 548 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
589 kunmap_atomic(mapaddr, KM_USER1); 591 kunmap_atomic(mapaddr, KM_USER1);
590 } 592 }
591 while (count) { 593 while (count) {
592 int err; 594 if (!cs->len) {
593 if (!cs->len && (err = fuse_copy_fill(cs))) 595 int err = fuse_copy_fill(cs);
594 return err; 596 if (err)
597 return err;
598 }
595 if (page) { 599 if (page) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 600 void *mapaddr = kmap_atomic(page, KM_USER1);
597 void *buf = mapaddr + offset; 601 void *buf = mapaddr + offset;
@@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
631static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) 635static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
632{ 636{
633 while (size) { 637 while (size) {
634 int err; 638 if (!cs->len) {
635 if (!cs->len && (err = fuse_copy_fill(cs))) 639 int err = fuse_copy_fill(cs);
636 return err; 640 if (err)
641 return err;
642 }
637 fuse_copy_do(cs, &val, &size); 643 fuse_copy_do(cs, &val, &size);
638 } 644 }
639 return 0; 645 return 0;
@@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc)
664 670
665/* Wait until a request is available on the pending list */ 671/* Wait until a request is available on the pending list */
666static void request_wait(struct fuse_conn *fc) 672static void request_wait(struct fuse_conn *fc)
673__releases(&fc->lock)
674__acquires(&fc->lock)
667{ 675{
668 DECLARE_WAITQUEUE(wait, current); 676 DECLARE_WAITQUEUE(wait, current);
669 677
@@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc)
691 */ 699 */
692static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 700static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
693 const struct iovec *iov, unsigned long nr_segs) 701 const struct iovec *iov, unsigned long nr_segs)
694 __releases(fc->lock) 702__releases(&fc->lock)
695{ 703{
696 struct fuse_copy_state cs; 704 struct fuse_copy_state cs;
697 struct fuse_in_header ih; 705 struct fuse_in_header ih;
@@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
813 return err; 821 return err;
814} 822}
815 823
824static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
825 struct fuse_copy_state *cs)
826{
827 struct fuse_notify_poll_wakeup_out outarg;
828 int err;
829
830 if (size != sizeof(outarg))
831 return -EINVAL;
832
833 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
834 if (err)
835 return err;
836
837 return fuse_notify_poll_wakeup(fc, &outarg);
838}
839
840static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
841 unsigned int size, struct fuse_copy_state *cs)
842{
843 switch (code) {
844 case FUSE_NOTIFY_POLL:
845 return fuse_notify_poll(fc, size, cs);
846
847 default:
848 return -EINVAL;
849 }
850}
851
816/* Look up request on processing list by unique ID */ 852/* Look up request on processing list by unique ID */
817static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) 853static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
818{ 854{
@@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
876 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 912 err = fuse_copy_one(&cs, &oh, sizeof(oh));
877 if (err) 913 if (err)
878 goto err_finish; 914 goto err_finish;
915
916 err = -EINVAL;
917 if (oh.len != nbytes)
918 goto err_finish;
919
920 /*
921 * Zero oh.unique indicates unsolicited notification message
922 * and error contains notification code.
923 */
924 if (!oh.unique) {
925 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
926 fuse_copy_finish(&cs);
927 return err ? err : nbytes;
928 }
929
879 err = -EINVAL; 930 err = -EINVAL;
880 if (!oh.unique || oh.error <= -1000 || oh.error > 0 || 931 if (oh.error <= -1000 || oh.error > 0)
881 oh.len != nbytes)
882 goto err_finish; 932 goto err_finish;
883 933
884 spin_lock(&fc->lock); 934 spin_lock(&fc->lock);
@@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
966 * This function releases and reacquires fc->lock 1016 * This function releases and reacquires fc->lock
967 */ 1017 */
968static void end_requests(struct fuse_conn *fc, struct list_head *head) 1018static void end_requests(struct fuse_conn *fc, struct list_head *head)
1019__releases(&fc->lock)
1020__acquires(&fc->lock)
969{ 1021{
970 while (!list_empty(head)) { 1022 while (!list_empty(head)) {
971 struct fuse_req *req; 1023 struct fuse_req *req;
@@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
988 * locked). 1040 * locked).
989 */ 1041 */
990static void end_io_requests(struct fuse_conn *fc) 1042static void end_io_requests(struct fuse_conn *fc)
991 __releases(fc->lock) __acquires(fc->lock) 1043__releases(&fc->lock)
1044__acquires(&fc->lock)
992{ 1045{
993 while (!list_empty(&fc->io)) { 1046 while (!list_empty(&fc->io)) {
994 struct fuse_req *req = 1047 struct fuse_req *req =
@@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc)
1002 wake_up(&req->waitq); 1055 wake_up(&req->waitq);
1003 if (end) { 1056 if (end) {
1004 req->end = NULL; 1057 req->end = NULL;
1005 /* The end function will consume this reference */
1006 __fuse_get_request(req); 1058 __fuse_get_request(req);
1007 spin_unlock(&fc->lock); 1059 spin_unlock(&fc->lock);
1008 wait_event(req->waitq, !req->locked); 1060 wait_event(req->waitq, !req->locked);
1009 end(fc, req); 1061 end(fc, req);
1062 fuse_put_request(fc, req);
1010 spin_lock(&fc->lock); 1063 spin_lock(&fc->lock);
1011 } 1064 }
1012 } 1065 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 95bc22bdd060..fdff346e96fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
189 parent = dget_parent(entry); 189 parent = dget_parent(entry);
190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode), 190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
191 &entry->d_name, &outarg); 191 &entry->d_name, &outarg);
192 request_send(fc, req); 192 fuse_request_send(fc, req);
193 dput(parent); 193 dput(parent);
194 err = req->out.h.error; 194 err = req->out.h.error;
195 fuse_put_request(fc, req); 195 fuse_put_request(fc, req);
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
204 return 0; 204 return 0;
205 } 205 }
206 spin_lock(&fc->lock); 206 spin_lock(&fc->lock);
207 fi->nlookup ++; 207 fi->nlookup++;
208 spin_unlock(&fc->lock); 208 spin_unlock(&fc->lock);
209 } 209 }
210 fuse_put_request(fc, forget_req); 210 fuse_put_request(fc, forget_req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
283 attr_version = fuse_get_attr_version(fc); 283 attr_version = fuse_get_attr_version(fc);
284 284
285 fuse_lookup_init(fc, req, nodeid, name, outarg); 285 fuse_lookup_init(fc, req, nodeid, name, outarg);
286 request_send(fc, req); 286 fuse_request_send(fc, req);
287 err = req->out.h.error; 287 err = req->out.h.error;
288 fuse_put_request(fc, req); 288 fuse_put_request(fc, req);
289 /* Zero nodeid is same as -ENOENT, but with valid timeout */ 289 /* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
369{ 369{
370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); 370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
371 ff->reserved_req->force = 1; 371 ff->reserved_req->force = 1;
372 request_send(fc, ff->reserved_req); 372 fuse_request_send(fc, ff->reserved_req);
373 fuse_put_request(fc, ff->reserved_req); 373 fuse_put_request(fc, ff->reserved_req);
374 kfree(ff); 374 kfree(ff);
375} 375}
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
408 goto out_put_forget_req; 408 goto out_put_forget_req;
409 409
410 err = -ENOMEM; 410 err = -ENOMEM;
411 ff = fuse_file_alloc(); 411 ff = fuse_file_alloc(fc);
412 if (!ff) 412 if (!ff)
413 goto out_put_request; 413 goto out_put_request;
414 414
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
432 req->out.args[0].value = &outentry; 432 req->out.args[0].value = &outentry;
433 req->out.args[1].size = sizeof(outopen); 433 req->out.args[1].size = sizeof(outopen);
434 req->out.args[1].value = &outopen; 434 req->out.args[1].value = &outopen;
435 request_send(fc, req); 435 fuse_request_send(fc, req);
436 err = req->out.h.error; 436 err = req->out.h.error;
437 if (err) { 437 if (err) {
438 if (err == -ENOSYS) 438 if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
502 else 502 else
503 req->out.args[0].size = sizeof(outarg); 503 req->out.args[0].size = sizeof(outarg);
504 req->out.args[0].value = &outarg; 504 req->out.args[0].value = &outarg;
505 request_send(fc, req); 505 fuse_request_send(fc, req);
506 err = req->out.h.error; 506 err = req->out.h.error;
507 fuse_put_request(fc, req); 507 fuse_put_request(fc, req);
508 if (err) 508 if (err)
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
631 req->in.numargs = 1; 631 req->in.numargs = 1;
632 req->in.args[0].size = entry->d_name.len + 1; 632 req->in.args[0].size = entry->d_name.len + 1;
633 req->in.args[0].value = entry->d_name.name; 633 req->in.args[0].value = entry->d_name.name;
634 request_send(fc, req); 634 fuse_request_send(fc, req);
635 err = req->out.h.error; 635 err = req->out.h.error;
636 fuse_put_request(fc, req); 636 fuse_put_request(fc, req);
637 if (!err) { 637 if (!err) {
638 struct inode *inode = entry->d_inode; 638 struct inode *inode = entry->d_inode;
639 639
640 /* Set nlink to zero so the inode can be cleared, if 640 /*
641 the inode does have more links this will be 641 * Set nlink to zero so the inode can be cleared, if the inode
642 discovered at the next lookup/getattr */ 642 * does have more links this will be discovered at the next
643 * lookup/getattr.
644 */
643 clear_nlink(inode); 645 clear_nlink(inode);
644 fuse_invalidate_attr(inode); 646 fuse_invalidate_attr(inode);
645 fuse_invalidate_attr(dir); 647 fuse_invalidate_attr(dir);
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
662 req->in.numargs = 1; 664 req->in.numargs = 1;
663 req->in.args[0].size = entry->d_name.len + 1; 665 req->in.args[0].size = entry->d_name.len + 1;
664 req->in.args[0].value = entry->d_name.name; 666 req->in.args[0].value = entry->d_name.name;
665 request_send(fc, req); 667 fuse_request_send(fc, req);
666 err = req->out.h.error; 668 err = req->out.h.error;
667 fuse_put_request(fc, req); 669 fuse_put_request(fc, req);
668 if (!err) { 670 if (!err) {
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
695 req->in.args[1].value = oldent->d_name.name; 697 req->in.args[1].value = oldent->d_name.name;
696 req->in.args[2].size = newent->d_name.len + 1; 698 req->in.args[2].size = newent->d_name.len + 1;
697 req->in.args[2].value = newent->d_name.name; 699 req->in.args[2].value = newent->d_name.name;
698 request_send(fc, req); 700 fuse_request_send(fc, req);
699 err = req->out.h.error; 701 err = req->out.h.error;
700 fuse_put_request(fc, req); 702 fuse_put_request(fc, req);
701 if (!err) { 703 if (!err) {
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
811 else 813 else
812 req->out.args[0].size = sizeof(outarg); 814 req->out.args[0].size = sizeof(outarg);
813 req->out.args[0].value = &outarg; 815 req->out.args[0].value = &outarg;
814 request_send(fc, req); 816 fuse_request_send(fc, req);
815 err = req->out.h.error; 817 err = req->out.h.error;
816 fuse_put_request(fc, req); 818 fuse_put_request(fc, req);
817 if (!err) { 819 if (!err) {
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask)
911 req->in.numargs = 1; 913 req->in.numargs = 1;
912 req->in.args[0].size = sizeof(inarg); 914 req->in.args[0].size = sizeof(inarg);
913 req->in.args[0].value = &inarg; 915 req->in.args[0].value = &inarg;
914 request_send(fc, req); 916 fuse_request_send(fc, req);
915 err = req->out.h.error; 917 err = req->out.h.error;
916 fuse_put_request(fc, req); 918 fuse_put_request(fc, req);
917 if (err == -ENOSYS) { 919 if (err == -ENOSYS) {
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1033 req->num_pages = 1; 1035 req->num_pages = 1;
1034 req->pages[0] = page; 1036 req->pages[0] = page;
1035 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); 1037 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
1036 request_send(fc, req); 1038 fuse_request_send(fc, req);
1037 nbytes = req->out.args[0].size; 1039 nbytes = req->out.args[0].size;
1038 err = req->out.h.error; 1040 err = req->out.h.error;
1039 fuse_put_request(fc, req); 1041 fuse_put_request(fc, req);
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry)
1067 req->out.numargs = 1; 1069 req->out.numargs = 1;
1068 req->out.args[0].size = PAGE_SIZE - 1; 1070 req->out.args[0].size = PAGE_SIZE - 1;
1069 req->out.args[0].value = link; 1071 req->out.args[0].value = link;
1070 request_send(fc, req); 1072 fuse_request_send(fc, req);
1071 if (req->out.h.error) { 1073 if (req->out.h.error) {
1072 free_page((unsigned long) link); 1074 free_page((unsigned long) link);
1073 link = ERR_PTR(req->out.h.error); 1075 link = ERR_PTR(req->out.h.error);
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1273 else 1275 else
1274 req->out.args[0].size = sizeof(outarg); 1276 req->out.args[0].size = sizeof(outarg);
1275 req->out.args[0].value = &outarg; 1277 req->out.args[0].value = &outarg;
1276 request_send(fc, req); 1278 fuse_request_send(fc, req);
1277 err = req->out.h.error; 1279 err = req->out.h.error;
1278 fuse_put_request(fc, req); 1280 fuse_put_request(fc, req);
1279 if (err) { 1281 if (err) {
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1367 req->in.args[1].value = name; 1369 req->in.args[1].value = name;
1368 req->in.args[2].size = size; 1370 req->in.args[2].size = size;
1369 req->in.args[2].value = value; 1371 req->in.args[2].value = value;
1370 request_send(fc, req); 1372 fuse_request_send(fc, req);
1371 err = req->out.h.error; 1373 err = req->out.h.error;
1372 fuse_put_request(fc, req); 1374 fuse_put_request(fc, req);
1373 if (err == -ENOSYS) { 1375 if (err == -ENOSYS) {
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
1413 req->out.args[0].size = sizeof(outarg); 1415 req->out.args[0].size = sizeof(outarg);
1414 req->out.args[0].value = &outarg; 1416 req->out.args[0].value = &outarg;
1415 } 1417 }
1416 request_send(fc, req); 1418 fuse_request_send(fc, req);
1417 ret = req->out.h.error; 1419 ret = req->out.h.error;
1418 if (!ret) 1420 if (!ret)
1419 ret = size ? req->out.args[0].size : outarg.size; 1421 ret = size ? req->out.args[0].size : outarg.size;
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
1463 req->out.args[0].size = sizeof(outarg); 1465 req->out.args[0].size = sizeof(outarg);
1464 req->out.args[0].value = &outarg; 1466 req->out.args[0].value = &outarg;
1465 } 1467 }
1466 request_send(fc, req); 1468 fuse_request_send(fc, req);
1467 ret = req->out.h.error; 1469 ret = req->out.h.error;
1468 if (!ret) 1470 if (!ret)
1469 ret = size ? req->out.args[0].size : outarg.size; 1471 ret = size ? req->out.args[0].size : outarg.size;
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1496 req->in.numargs = 1; 1498 req->in.numargs = 1;
1497 req->in.args[0].size = strlen(name) + 1; 1499 req->in.args[0].size = strlen(name) + 1;
1498 req->in.args[0].value = name; 1500 req->in.args[0].value = name;
1499 request_send(fc, req); 1501 fuse_request_send(fc, req);
1500 err = req->out.h.error; 1502 err = req->out.h.error;
1501 fuse_put_request(fc, req); 1503 fuse_put_request(fc, req);
1502 if (err == -ENOSYS) { 1504 if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4c9ee7011265..e8162646a9b5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
39 req->out.numargs = 1; 39 req->out.numargs = 1;
40 req->out.args[0].size = sizeof(*outargp); 40 req->out.args[0].size = sizeof(*outargp);
41 req->out.args[0].value = outargp; 41 req->out.args[0].value = outargp;
42 request_send(fc, req); 42 fuse_request_send(fc, req);
43 err = req->out.h.error; 43 err = req->out.h.error;
44 fuse_put_request(fc, req); 44 fuse_put_request(fc, req);
45 45
46 return err; 46 return err;
47} 47}
48 48
49struct fuse_file *fuse_file_alloc(void) 49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50{ 50{
51 struct fuse_file *ff; 51 struct fuse_file *ff;
52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void)
58 } else { 58 } else {
59 INIT_LIST_HEAD(&ff->write_entry); 59 INIT_LIST_HEAD(&ff->write_entry);
60 atomic_set(&ff->count, 0); 60 atomic_set(&ff->count, 0);
61 spin_lock(&fc->lock);
62 ff->kh = ++fc->khctr;
63 spin_unlock(&fc->lock);
61 } 64 }
65 RB_CLEAR_NODE(&ff->polled_node);
66 init_waitqueue_head(&ff->poll_wait);
62 } 67 }
63 return ff; 68 return ff;
64} 69}
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
79{ 84{
80 dput(req->misc.release.dentry); 85 dput(req->misc.release.dentry);
81 mntput(req->misc.release.vfsmount); 86 mntput(req->misc.release.vfsmount);
82 fuse_put_request(fc, req);
83} 87}
84 88
85static void fuse_file_put(struct fuse_file *ff) 89static void fuse_file_put(struct fuse_file *ff)
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
89 struct inode *inode = req->misc.release.dentry->d_inode; 93 struct inode *inode = req->misc.release.dentry->d_inode;
90 struct fuse_conn *fc = get_fuse_conn(inode); 94 struct fuse_conn *fc = get_fuse_conn(inode);
91 req->end = fuse_release_end; 95 req->end = fuse_release_end;
92 request_send_background(fc, req); 96 fuse_request_send_background(fc, req);
93 kfree(ff); 97 kfree(ff);
94 } 98 }
95} 99}
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
109 113
110int fuse_open_common(struct inode *inode, struct file *file, int isdir) 114int fuse_open_common(struct inode *inode, struct file *file, int isdir)
111{ 115{
116 struct fuse_conn *fc = get_fuse_conn(inode);
112 struct fuse_open_out outarg; 117 struct fuse_open_out outarg;
113 struct fuse_file *ff; 118 struct fuse_file *ff;
114 int err; 119 int err;
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
121 if (err) 126 if (err)
122 return err; 127 return err;
123 128
124 ff = fuse_file_alloc(); 129 ff = fuse_file_alloc(fc);
125 if (!ff) 130 if (!ff)
126 return -ENOMEM; 131 return -ENOMEM;
127 132
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
167 172
168 spin_lock(&fc->lock); 173 spin_lock(&fc->lock);
169 list_del(&ff->write_entry); 174 list_del(&ff->write_entry);
175 if (!RB_EMPTY_NODE(&ff->polled_node))
176 rb_erase(&ff->polled_node, &fc->polled_files);
170 spin_unlock(&fc->lock); 177 spin_unlock(&fc->lock);
178
179 wake_up_interruptible_sync(&ff->poll_wait);
171 /* 180 /*
172 * Normally this will send the RELEASE request, 181 * Normally this will send the RELEASE request,
173 * however if some asynchronous READ or WRITE requests 182 * however if some asynchronous READ or WRITE requests
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
280 req->in.args[0].size = sizeof(inarg); 289 req->in.args[0].size = sizeof(inarg);
281 req->in.args[0].value = &inarg; 290 req->in.args[0].value = &inarg;
282 req->force = 1; 291 req->force = 1;
283 request_send(fc, req); 292 fuse_request_send(fc, req);
284 err = req->out.h.error; 293 err = req->out.h.error;
285 fuse_put_request(fc, req); 294 fuse_put_request(fc, req);
286 if (err == -ENOSYS) { 295 if (err == -ENOSYS) {
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
344 req->in.numargs = 1; 353 req->in.numargs = 1;
345 req->in.args[0].size = sizeof(inarg); 354 req->in.args[0].size = sizeof(inarg);
346 req->in.args[0].value = &inarg; 355 req->in.args[0].value = &inarg;
347 request_send(fc, req); 356 fuse_request_send(fc, req);
348 err = req->out.h.error; 357 err = req->out.h.error;
349 fuse_put_request(fc, req); 358 fuse_put_request(fc, req);
350 if (err == -ENOSYS) { 359 if (err == -ENOSYS) {
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
396 inarg->read_flags |= FUSE_READ_LOCKOWNER; 405 inarg->read_flags |= FUSE_READ_LOCKOWNER;
397 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 406 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
398 } 407 }
399 request_send(fc, req); 408 fuse_request_send(fc, req);
400 return req->out.args[0].size; 409 return req->out.args[0].size;
401} 410}
402 411
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
493 } 502 }
494 if (req->ff) 503 if (req->ff)
495 fuse_file_put(req->ff); 504 fuse_file_put(req->ff);
496 fuse_put_request(fc, req);
497} 505}
498 506
499static void fuse_send_readpages(struct fuse_req *req, struct file *file, 507static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
509 struct fuse_file *ff = file->private_data; 517 struct fuse_file *ff = file->private_data;
510 req->ff = fuse_file_get(ff); 518 req->ff = fuse_file_get(ff);
511 req->end = fuse_readpages_end; 519 req->end = fuse_readpages_end;
512 request_send_background(fc, req); 520 fuse_request_send_background(fc, req);
513 } else { 521 } else {
514 request_send(fc, req); 522 fuse_request_send(fc, req);
515 fuse_readpages_end(fc, req); 523 fuse_readpages_end(fc, req);
524 fuse_put_request(fc, req);
516 } 525 }
517} 526}
518 527
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
543 } 552 }
544 } 553 }
545 req->pages[req->num_pages] = page; 554 req->pages[req->num_pages] = page;
546 req->num_pages ++; 555 req->num_pages++;
547 return 0; 556 return 0;
548} 557}
549 558
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
636 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 645 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
637 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 646 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
638 } 647 }
639 request_send(fc, req); 648 fuse_request_send(fc, req);
640 return req->misc.write.out.size; 649 return req->misc.write.out.size;
641} 650}
642 651
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1042{ 1051{
1043 __free_page(req->pages[0]); 1052 __free_page(req->pages[0]);
1044 fuse_file_put(req->ff); 1053 fuse_file_put(req->ff);
1045 fuse_put_request(fc, req);
1046} 1054}
1047 1055
1048static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1056static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1060 1068
1061/* Called under fc->lock, may release and reacquire it */ 1069/* Called under fc->lock, may release and reacquire it */
1062static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1070static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1071__releases(&fc->lock)
1072__acquires(&fc->lock)
1063{ 1073{
1064 struct fuse_inode *fi = get_fuse_inode(req->inode); 1074 struct fuse_inode *fi = get_fuse_inode(req->inode);
1065 loff_t size = i_size_read(req->inode); 1075 loff_t size = i_size_read(req->inode);
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1079 1089
1080 req->in.args[1].size = inarg->size; 1090 req->in.args[1].size = inarg->size;
1081 fi->writectr++; 1091 fi->writectr++;
1082 request_send_background_locked(fc, req); 1092 fuse_request_send_background_locked(fc, req);
1083 return; 1093 return;
1084 1094
1085 out_free: 1095 out_free:
1086 fuse_writepage_finish(fc, req); 1096 fuse_writepage_finish(fc, req);
1087 spin_unlock(&fc->lock); 1097 spin_unlock(&fc->lock);
1088 fuse_writepage_free(fc, req); 1098 fuse_writepage_free(fc, req);
1099 fuse_put_request(fc, req);
1089 spin_lock(&fc->lock); 1100 spin_lock(&fc->lock);
1090} 1101}
1091 1102
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1096 * Called with fc->lock 1107 * Called with fc->lock
1097 */ 1108 */
1098void fuse_flush_writepages(struct inode *inode) 1109void fuse_flush_writepages(struct inode *inode)
1110__releases(&fc->lock)
1111__acquires(&fc->lock)
1099{ 1112{
1100 struct fuse_conn *fc = get_fuse_conn(inode); 1113 struct fuse_conn *fc = get_fuse_conn(inode);
1101 struct fuse_inode *fi = get_fuse_inode(inode); 1114 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
1325 req->out.numargs = 1; 1338 req->out.numargs = 1;
1326 req->out.args[0].size = sizeof(outarg); 1339 req->out.args[0].size = sizeof(outarg);
1327 req->out.args[0].value = &outarg; 1340 req->out.args[0].value = &outarg;
1328 request_send(fc, req); 1341 fuse_request_send(fc, req);
1329 err = req->out.h.error; 1342 err = req->out.h.error;
1330 fuse_put_request(fc, req); 1343 fuse_put_request(fc, req);
1331 if (!err) 1344 if (!err)
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1357 return PTR_ERR(req); 1370 return PTR_ERR(req);
1358 1371
1359 fuse_lk_fill(req, file, fl, opcode, pid, flock); 1372 fuse_lk_fill(req, file, fl, opcode, pid, flock);
1360 request_send(fc, req); 1373 fuse_request_send(fc, req);
1361 err = req->out.h.error; 1374 err = req->out.h.error;
1362 /* locking is restartable */ 1375 /* locking is restartable */
1363 if (err == -EINTR) 1376 if (err == -EINTR)
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1433 req->out.numargs = 1; 1446 req->out.numargs = 1;
1434 req->out.args[0].size = sizeof(outarg); 1447 req->out.args[0].size = sizeof(outarg);
1435 req->out.args[0].value = &outarg; 1448 req->out.args[0].value = &outarg;
1436 request_send(fc, req); 1449 fuse_request_send(fc, req);
1437 err = req->out.h.error; 1450 err = req->out.h.error;
1438 fuse_put_request(fc, req); 1451 fuse_put_request(fc, req);
1439 if (err == -ENOSYS) 1452 if (err == -ENOSYS)
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1470 return retval; 1483 return retval;
1471} 1484}
1472 1485
1486static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1487 unsigned int nr_segs, size_t bytes, bool to_user)
1488{
1489 struct iov_iter ii;
1490 int page_idx = 0;
1491
1492 if (!bytes)
1493 return 0;
1494
1495 iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1496
1497 while (iov_iter_count(&ii)) {
1498 struct page *page = pages[page_idx++];
1499 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1500 void *kaddr, *map;
1501
1502 kaddr = map = kmap(page);
1503
1504 while (todo) {
1505 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1506 size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1507 size_t copy = min(todo, iov_len);
1508 size_t left;
1509
1510 if (!to_user)
1511 left = copy_from_user(kaddr, uaddr, copy);
1512 else
1513 left = copy_to_user(uaddr, kaddr, copy);
1514
1515 if (unlikely(left))
1516 return -EFAULT;
1517
1518 iov_iter_advance(&ii, copy);
1519 todo -= copy;
1520 kaddr += copy;
1521 }
1522
1523 kunmap(map);
1524 }
1525
1526 return 0;
1527}
1528
1529/*
1530 * For ioctls, there is no generic way to determine how much memory
1531 * needs to be read and/or written. Furthermore, ioctls are allowed
1532 * to dereference the passed pointer, so the parameter requires deep
1533 * copying but FUSE has no idea whatsoever about what to copy in or
1534 * out.
1535 *
1536 * This is solved by allowing FUSE server to retry ioctl with
1537 * necessary in/out iovecs. Let's assume the ioctl implementation
1538 * needs to read in the following structure.
1539 *
1540 * struct a {
1541 * char *buf;
1542 * size_t buflen;
1543 * }
1544 *
1545 * On the first callout to FUSE server, inarg->in_size and
1546 * inarg->out_size will be NULL; then, the server completes the ioctl
1547 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
1548 * the actual iov array to
1549 *
1550 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
1551 *
1552 * which tells FUSE to copy in the requested area and retry the ioctl.
1553 * On the second round, the server has access to the structure and
1554 * from that it can tell what to look for next, so on the invocation,
1555 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
1556 *
1557 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
1558 * { .iov_base = a.buf, .iov_len = a.buflen } }
1559 *
1560 * FUSE will copy both struct a and the pointed buffer from the
1561 * process doing the ioctl and retry ioctl with both struct a and the
1562 * buffer.
1563 *
1564 * This time, FUSE server has everything it needs and completes ioctl
1565 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
1566 *
1567 * Copying data out works the same way.
1568 *
1569 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
1570 * automatically initializes in and out iovs by decoding @cmd with
1571 * _IOC_* macros and the server is not allowed to request RETRY. This
1572 * limits ioctl data transfers to well-formed ioctls and is the forced
1573 * behavior for all FUSE servers.
1574 */
1575static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1576 unsigned long arg, unsigned int flags)
1577{
1578 struct inode *inode = file->f_dentry->d_inode;
1579 struct fuse_file *ff = file->private_data;
1580 struct fuse_conn *fc = get_fuse_conn(inode);
1581 struct fuse_ioctl_in inarg = {
1582 .fh = ff->fh,
1583 .cmd = cmd,
1584 .arg = arg,
1585 .flags = flags
1586 };
1587 struct fuse_ioctl_out outarg;
1588 struct fuse_req *req = NULL;
1589 struct page **pages = NULL;
1590 struct page *iov_page = NULL;
1591 struct iovec *in_iov = NULL, *out_iov = NULL;
1592 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1593 size_t in_size, out_size, transferred;
1594 int err;
1595
1596 /* assume all the iovs returned by client always fits in a page */
1597 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1598
1599 if (!fuse_allow_task(fc, current))
1600 return -EACCES;
1601
1602 err = -EIO;
1603 if (is_bad_inode(inode))
1604 goto out;
1605
1606 err = -ENOMEM;
1607 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1608 iov_page = alloc_page(GFP_KERNEL);
1609 if (!pages || !iov_page)
1610 goto out;
1611
1612 /*
1613 * If restricted, initialize IO parameters as encoded in @cmd.
1614 * RETRY from server is not allowed.
1615 */
1616 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1617 struct iovec *iov = page_address(iov_page);
1618
1619 iov->iov_base = (void __user *)arg;
1620 iov->iov_len = _IOC_SIZE(cmd);
1621
1622 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1623 in_iov = iov;
1624 in_iovs = 1;
1625 }
1626
1627 if (_IOC_DIR(cmd) & _IOC_READ) {
1628 out_iov = iov;
1629 out_iovs = 1;
1630 }
1631 }
1632
1633 retry:
1634 inarg.in_size = in_size = iov_length(in_iov, in_iovs);
1635 inarg.out_size = out_size = iov_length(out_iov, out_iovs);
1636
1637 /*
1638 * Out data can be used either for actual out data or iovs,
1639 * make sure there always is at least one page.
1640 */
1641 out_size = max_t(size_t, out_size, PAGE_SIZE);
1642 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
1643
1644 /* make sure there are enough buffer pages and init request with them */
1645 err = -ENOMEM;
1646 if (max_pages > FUSE_MAX_PAGES_PER_REQ)
1647 goto out;
1648 while (num_pages < max_pages) {
1649 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
1650 if (!pages[num_pages])
1651 goto out;
1652 num_pages++;
1653 }
1654
1655 req = fuse_get_req(fc);
1656 if (IS_ERR(req)) {
1657 err = PTR_ERR(req);
1658 req = NULL;
1659 goto out;
1660 }
1661 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
1662 req->num_pages = num_pages;
1663
1664 /* okay, let's send it to the client */
1665 req->in.h.opcode = FUSE_IOCTL;
1666 req->in.h.nodeid = get_node_id(inode);
1667 req->in.numargs = 1;
1668 req->in.args[0].size = sizeof(inarg);
1669 req->in.args[0].value = &inarg;
1670 if (in_size) {
1671 req->in.numargs++;
1672 req->in.args[1].size = in_size;
1673 req->in.argpages = 1;
1674
1675 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
1676 false);
1677 if (err)
1678 goto out;
1679 }
1680
1681 req->out.numargs = 2;
1682 req->out.args[0].size = sizeof(outarg);
1683 req->out.args[0].value = &outarg;
1684 req->out.args[1].size = out_size;
1685 req->out.argpages = 1;
1686 req->out.argvar = 1;
1687
1688 fuse_request_send(fc, req);
1689 err = req->out.h.error;
1690 transferred = req->out.args[1].size;
1691 fuse_put_request(fc, req);
1692 req = NULL;
1693 if (err)
1694 goto out;
1695
1696 /* did it ask for retry? */
1697 if (outarg.flags & FUSE_IOCTL_RETRY) {
1698 char *vaddr;
1699
1700 /* no retry if in restricted mode */
1701 err = -EIO;
1702 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
1703 goto out;
1704
1705 in_iovs = outarg.in_iovs;
1706 out_iovs = outarg.out_iovs;
1707
1708 /*
1709 * Make sure things are in boundary, separate checks
1710 * are to protect against overflow.
1711 */
1712 err = -ENOMEM;
1713 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
1714 out_iovs > FUSE_IOCTL_MAX_IOV ||
1715 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1716 goto out;
1717
1718 err = -EIO;
1719 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1720 goto out;
1721
1722 /* okay, copy in iovs and retry */
1723 vaddr = kmap_atomic(pages[0], KM_USER0);
1724 memcpy(page_address(iov_page), vaddr, transferred);
1725 kunmap_atomic(vaddr, KM_USER0);
1726
1727 in_iov = page_address(iov_page);
1728 out_iov = in_iov + in_iovs;
1729
1730 goto retry;
1731 }
1732
1733 err = -EIO;
1734 if (transferred > inarg.out_size)
1735 goto out;
1736
1737 err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
1738 out:
1739 if (req)
1740 fuse_put_request(fc, req);
1741 if (iov_page)
1742 __free_page(iov_page);
1743 while (num_pages)
1744 __free_page(pages[--num_pages]);
1745 kfree(pages);
1746
1747 return err ? err : outarg.result;
1748}
1749
1750static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1751 unsigned long arg)
1752{
1753 return fuse_file_do_ioctl(file, cmd, arg, 0);
1754}
1755
1756static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1757 unsigned long arg)
1758{
1759 return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
1760}
1761
1762/*
1763 * All files which have been polled are linked to RB tree
1764 * fuse_conn->polled_files which is indexed by kh. Walk the tree and
1765 * find the matching one.
1766 */
1767static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
1768 struct rb_node **parent_out)
1769{
1770 struct rb_node **link = &fc->polled_files.rb_node;
1771 struct rb_node *last = NULL;
1772
1773 while (*link) {
1774 struct fuse_file *ff;
1775
1776 last = *link;
1777 ff = rb_entry(last, struct fuse_file, polled_node);
1778
1779 if (kh < ff->kh)
1780 link = &last->rb_left;
1781 else if (kh > ff->kh)
1782 link = &last->rb_right;
1783 else
1784 return link;
1785 }
1786
1787 if (parent_out)
1788 *parent_out = last;
1789 return link;
1790}
1791
1792/*
1793 * The file is about to be polled. Make sure it's on the polled_files
1794 * RB tree. Note that files once added to the polled_files tree are
1795 * not removed before the file is released. This is because a file
1796 * polled once is likely to be polled again.
1797 */
1798static void fuse_register_polled_file(struct fuse_conn *fc,
1799 struct fuse_file *ff)
1800{
1801 spin_lock(&fc->lock);
1802 if (RB_EMPTY_NODE(&ff->polled_node)) {
1803 struct rb_node **link, *parent;
1804
1805 link = fuse_find_polled_node(fc, ff->kh, &parent);
1806 BUG_ON(*link);
1807 rb_link_node(&ff->polled_node, parent, link);
1808 rb_insert_color(&ff->polled_node, &fc->polled_files);
1809 }
1810 spin_unlock(&fc->lock);
1811}
1812
1813static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1814{
1815 struct inode *inode = file->f_dentry->d_inode;
1816 struct fuse_file *ff = file->private_data;
1817 struct fuse_conn *fc = get_fuse_conn(inode);
1818 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1819 struct fuse_poll_out outarg;
1820 struct fuse_req *req;
1821 int err;
1822
1823 if (fc->no_poll)
1824 return DEFAULT_POLLMASK;
1825
1826 poll_wait(file, &ff->poll_wait, wait);
1827
1828 /*
1829 * Ask for notification iff there's someone waiting for it.
1830 * The client may ignore the flag and always notify.
1831 */
1832 if (waitqueue_active(&ff->poll_wait)) {
1833 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
1834 fuse_register_polled_file(fc, ff);
1835 }
1836
1837 req = fuse_get_req(fc);
1838 if (IS_ERR(req))
1839 return PTR_ERR(req);
1840
1841 req->in.h.opcode = FUSE_POLL;
1842 req->in.h.nodeid = get_node_id(inode);
1843 req->in.numargs = 1;
1844 req->in.args[0].size = sizeof(inarg);
1845 req->in.args[0].value = &inarg;
1846 req->out.numargs = 1;
1847 req->out.args[0].size = sizeof(outarg);
1848 req->out.args[0].value = &outarg;
1849 fuse_request_send(fc, req);
1850 err = req->out.h.error;
1851 fuse_put_request(fc, req);
1852
1853 if (!err)
1854 return outarg.revents;
1855 if (err == -ENOSYS) {
1856 fc->no_poll = 1;
1857 return DEFAULT_POLLMASK;
1858 }
1859 return POLLERR;
1860}
1861
1862/*
1863 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
1864 * wakes up the poll waiters.
1865 */
1866int fuse_notify_poll_wakeup(struct fuse_conn *fc,
1867 struct fuse_notify_poll_wakeup_out *outarg)
1868{
1869 u64 kh = outarg->kh;
1870 struct rb_node **link;
1871
1872 spin_lock(&fc->lock);
1873
1874 link = fuse_find_polled_node(fc, kh, NULL);
1875 if (*link) {
1876 struct fuse_file *ff;
1877
1878 ff = rb_entry(*link, struct fuse_file, polled_node);
1879 wake_up_interruptible_sync(&ff->poll_wait);
1880 }
1881
1882 spin_unlock(&fc->lock);
1883 return 0;
1884}
1885
1473static const struct file_operations fuse_file_operations = { 1886static const struct file_operations fuse_file_operations = {
1474 .llseek = fuse_file_llseek, 1887 .llseek = fuse_file_llseek,
1475 .read = do_sync_read, 1888 .read = do_sync_read,
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = {
1484 .lock = fuse_file_lock, 1897 .lock = fuse_file_lock,
1485 .flock = fuse_file_flock, 1898 .flock = fuse_file_flock,
1486 .splice_read = generic_file_splice_read, 1899 .splice_read = generic_file_splice_read,
1900 .unlocked_ioctl = fuse_file_ioctl,
1901 .compat_ioctl = fuse_file_compat_ioctl,
1902 .poll = fuse_file_poll,
1487}; 1903};
1488 1904
1489static const struct file_operations fuse_direct_io_file_operations = { 1905static const struct file_operations fuse_direct_io_file_operations = {
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = {
1496 .fsync = fuse_fsync, 1912 .fsync = fuse_fsync,
1497 .lock = fuse_file_lock, 1913 .lock = fuse_file_lock,
1498 .flock = fuse_file_flock, 1914 .flock = fuse_file_flock,
1915 .unlocked_ioctl = fuse_file_ioctl,
1916 .compat_ioctl = fuse_file_compat_ioctl,
1917 .poll = fuse_file_poll,
1499 /* no mmap and splice_read */ 1918 /* no mmap and splice_read */
1500}; 1919};
1501 1920
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747f..5e64b815a5a1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -19,6 +19,8 @@
19#include <linux/backing-dev.h> 19#include <linux/backing-dev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h>
23#include <linux/poll.h>
22 24
23/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
24#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
@@ -100,6 +102,9 @@ struct fuse_file {
100 /** Request reserved for flush and release */ 102 /** Request reserved for flush and release */
101 struct fuse_req *reserved_req; 103 struct fuse_req *reserved_req;
102 104
105 /** Kernel file handle guaranteed to be unique */
106 u64 kh;
107
103 /** File handle used by userspace */ 108 /** File handle used by userspace */
104 u64 fh; 109 u64 fh;
105 110
@@ -108,6 +113,12 @@ struct fuse_file {
108 113
109 /** Entry on inode's write_files list */ 114 /** Entry on inode's write_files list */
110 struct list_head write_entry; 115 struct list_head write_entry;
116
117 /** RB node to be linked on fuse_conn->polled_files */
118 struct rb_node polled_node;
119
120 /** Wait queue head for poll */
121 wait_queue_head_t poll_wait;
111}; 122};
112 123
113/** One input argument of a request */ 124/** One input argument of a request */
@@ -322,6 +333,12 @@ struct fuse_conn {
322 /** The list of requests under I/O */ 333 /** The list of requests under I/O */
323 struct list_head io; 334 struct list_head io;
324 335
336 /** The next unique kernel file handle */
337 u64 khctr;
338
339 /** rbtree of fuse_files waiting for poll events indexed by ph */
340 struct rb_root polled_files;
341
325 /** Number of requests currently in the background */ 342 /** Number of requests currently in the background */
326 unsigned num_background; 343 unsigned num_background;
327 344
@@ -355,19 +372,19 @@ struct fuse_conn {
355 /** Connection failed (version mismatch). Cannot race with 372 /** Connection failed (version mismatch). Cannot race with
356 setting other bitfields since it is only set once in INIT 373 setting other bitfields since it is only set once in INIT
357 reply, before any other request, and never cleared */ 374 reply, before any other request, and never cleared */
358 unsigned conn_error : 1; 375 unsigned conn_error:1;
359 376
360 /** Connection successful. Only set in INIT */ 377 /** Connection successful. Only set in INIT */
361 unsigned conn_init : 1; 378 unsigned conn_init:1;
362 379
363 /** Do readpages asynchronously? Only set in INIT */ 380 /** Do readpages asynchronously? Only set in INIT */
364 unsigned async_read : 1; 381 unsigned async_read:1;
365 382
366 /** Do not send separate SETATTR request before open(O_TRUNC) */ 383 /** Do not send separate SETATTR request before open(O_TRUNC) */
367 unsigned atomic_o_trunc : 1; 384 unsigned atomic_o_trunc:1;
368 385
369 /** Filesystem supports NFS exporting. Only set in INIT */ 386 /** Filesystem supports NFS exporting. Only set in INIT */
370 unsigned export_support : 1; 387 unsigned export_support:1;
371 388
372 /* 389 /*
373 * The following bitfields are only for optimization purposes 390 * The following bitfields are only for optimization purposes
@@ -375,43 +392,46 @@ struct fuse_conn {
375 */ 392 */
376 393
377 /** Is fsync not implemented by fs? */ 394 /** Is fsync not implemented by fs? */
378 unsigned no_fsync : 1; 395 unsigned no_fsync:1;
379 396
380 /** Is fsyncdir not implemented by fs? */ 397 /** Is fsyncdir not implemented by fs? */
381 unsigned no_fsyncdir : 1; 398 unsigned no_fsyncdir:1;
382 399
383 /** Is flush not implemented by fs? */ 400 /** Is flush not implemented by fs? */
384 unsigned no_flush : 1; 401 unsigned no_flush:1;
385 402
386 /** Is setxattr not implemented by fs? */ 403 /** Is setxattr not implemented by fs? */
387 unsigned no_setxattr : 1; 404 unsigned no_setxattr:1;
388 405
389 /** Is getxattr not implemented by fs? */ 406 /** Is getxattr not implemented by fs? */
390 unsigned no_getxattr : 1; 407 unsigned no_getxattr:1;
391 408
392 /** Is listxattr not implemented by fs? */ 409 /** Is listxattr not implemented by fs? */
393 unsigned no_listxattr : 1; 410 unsigned no_listxattr:1;
394 411
395 /** Is removexattr not implemented by fs? */ 412 /** Is removexattr not implemented by fs? */
396 unsigned no_removexattr : 1; 413 unsigned no_removexattr:1;
397 414
398 /** Are file locking primitives not implemented by fs? */ 415 /** Are file locking primitives not implemented by fs? */
399 unsigned no_lock : 1; 416 unsigned no_lock:1;
400 417
401 /** Is access not implemented by fs? */ 418 /** Is access not implemented by fs? */
402 unsigned no_access : 1; 419 unsigned no_access:1;
403 420
404 /** Is create not implemented by fs? */ 421 /** Is create not implemented by fs? */
405 unsigned no_create : 1; 422 unsigned no_create:1;
406 423
407 /** Is interrupt not implemented by fs? */ 424 /** Is interrupt not implemented by fs? */
408 unsigned no_interrupt : 1; 425 unsigned no_interrupt:1;
409 426
410 /** Is bmap not implemented by fs? */ 427 /** Is bmap not implemented by fs? */
411 unsigned no_bmap : 1; 428 unsigned no_bmap:1;
429
430 /** Is poll not implemented by fs? */
431 unsigned no_poll:1;
412 432
413 /** Do multi-page cached writes */ 433 /** Do multi-page cached writes */
414 unsigned big_writes : 1; 434 unsigned big_writes:1;
415 435
416 /** The number of requests waiting for completion */ 436 /** The number of requests waiting for completion */
417 atomic_t num_waiting; 437 atomic_t num_waiting;
@@ -445,6 +465,9 @@ struct fuse_conn {
445 465
446 /** Version counter for attribute changes */ 466 /** Version counter for attribute changes */
447 u64 attr_version; 467 u64 attr_version;
468
469 /** Called on final put */
470 void (*release)(struct fuse_conn *);
448}; 471};
449 472
450static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 473static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
499 */ 522 */
500int fuse_open_common(struct inode *inode, struct file *file, int isdir); 523int fuse_open_common(struct inode *inode, struct file *file, int isdir);
501 524
502struct fuse_file *fuse_file_alloc(void); 525struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
503void fuse_file_free(struct fuse_file *ff); 526void fuse_file_free(struct fuse_file *ff);
504void fuse_finish_open(struct inode *inode, struct file *file, 527void fuse_finish_open(struct inode *inode, struct file *file,
505 struct fuse_file *ff, struct fuse_open_out *outarg); 528 struct fuse_file *ff, struct fuse_open_out *outarg);
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
519 int isdir); 542 int isdir);
520 543
521/** 544/**
545 * Notify poll wakeup
546 */
547int fuse_notify_poll_wakeup(struct fuse_conn *fc,
548 struct fuse_notify_poll_wakeup_out *outarg);
549
550/**
522 * Initialize file operations on a regular file 551 * Initialize file operations on a regular file
523 */ 552 */
524void fuse_init_file_inode(struct inode *inode); 553void fuse_init_file_inode(struct inode *inode);
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
593/** 622/**
594 * Send a request (synchronous) 623 * Send a request (synchronous)
595 */ 624 */
596void request_send(struct fuse_conn *fc, struct fuse_req *req); 625void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
597 626
598/** 627/**
599 * Send a request with no reply 628 * Send a request with no reply
600 */ 629 */
601void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); 630void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
602 631
603/** 632/**
604 * Send a request in the background 633 * Send a request in the background
605 */ 634 */
606void request_send_background(struct fuse_conn *fc, struct fuse_req *req); 635void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
607 636
608void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); 637void fuse_request_send_background_locked(struct fuse_conn *fc,
638 struct fuse_req *req);
609 639
610/* Abort all requests */ 640/* Abort all requests */
611void fuse_abort_conn(struct fuse_conn *fc); 641void fuse_abort_conn(struct fuse_conn *fc);
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
623struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); 653struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
624 654
625/** 655/**
656 * Initialize fuse_conn
657 */
658int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
659
660/**
626 * Release reference to fuse_conn 661 * Release reference to fuse_conn
627 */ 662 */
628void fuse_conn_put(struct fuse_conn *fc); 663void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b4435..47c96fdca1ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
37 unsigned rootmode; 37 unsigned rootmode;
38 unsigned user_id; 38 unsigned user_id;
39 unsigned group_id; 39 unsigned group_id;
40 unsigned fd_present : 1; 40 unsigned fd_present:1;
41 unsigned rootmode_present : 1; 41 unsigned rootmode_present:1;
42 unsigned user_id_present : 1; 42 unsigned user_id_present:1;
43 unsigned group_id_present : 1; 43 unsigned group_id_present:1;
44 unsigned flags; 44 unsigned flags;
45 unsigned max_read; 45 unsigned max_read;
46 unsigned blksize; 46 unsigned blksize;
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
94 req->in.numargs = 1; 94 req->in.numargs = 1;
95 req->in.args[0].size = sizeof(struct fuse_forget_in); 95 req->in.args[0].size = sizeof(struct fuse_forget_in);
96 req->in.args[0].value = inarg; 96 req->in.args[0].value = inarg;
97 request_send_noreply(fc, req); 97 fuse_request_send_noreply(fc, req);
98} 98}
99 99
100static void fuse_clear_inode(struct inode *inode) 100static void fuse_clear_inode(struct inode *inode)
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
250 250
251 fi = get_fuse_inode(inode); 251 fi = get_fuse_inode(inode);
252 spin_lock(&fc->lock); 252 spin_lock(&fc->lock);
253 fi->nlookup ++; 253 fi->nlookup++;
254 spin_unlock(&fc->lock); 254 spin_unlock(&fc->lock);
255 fuse_change_attributes(inode, attr, attr_valid, attr_version); 255 fuse_change_attributes(inode, attr, attr_valid, attr_version);
256 256
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
269 fc->destroy_req = NULL; 269 fc->destroy_req = NULL;
270 req->in.h.opcode = FUSE_DESTROY; 270 req->in.h.opcode = FUSE_DESTROY;
271 req->force = 1; 271 req->force = 1;
272 request_send(fc, req); 272 fuse_request_send(fc, req);
273 fuse_put_request(fc, req); 273 fuse_put_request(fc, req);
274 } 274 }
275} 275}
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
334 req->out.args[0].size = 334 req->out.args[0].size =
335 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); 335 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
336 req->out.args[0].value = &outarg; 336 req->out.args[0].value = &outarg;
337 request_send(fc, req); 337 fuse_request_send(fc, req);
338 err = req->out.h.error; 338 err = req->out.h.error;
339 if (!err) 339 if (!err)
340 convert_fuse_statfs(buf, &outarg.st); 340 convert_fuse_statfs(buf, &outarg.st);
@@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
462 return 0; 462 return 0;
463} 463}
464 464
465static struct fuse_conn *new_conn(struct super_block *sb) 465int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
466{ 466{
467 struct fuse_conn *fc;
468 int err; 467 int err;
469 468
470 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 469 memset(fc, 0, sizeof(*fc));
471 if (fc) { 470 spin_lock_init(&fc->lock);
472 spin_lock_init(&fc->lock); 471 mutex_init(&fc->inst_mutex);
473 mutex_init(&fc->inst_mutex); 472 atomic_set(&fc->count, 1);
474 atomic_set(&fc->count, 1); 473 init_waitqueue_head(&fc->waitq);
475 init_waitqueue_head(&fc->waitq); 474 init_waitqueue_head(&fc->blocked_waitq);
476 init_waitqueue_head(&fc->blocked_waitq); 475 init_waitqueue_head(&fc->reserved_req_waitq);
477 init_waitqueue_head(&fc->reserved_req_waitq); 476 INIT_LIST_HEAD(&fc->pending);
478 INIT_LIST_HEAD(&fc->pending); 477 INIT_LIST_HEAD(&fc->processing);
479 INIT_LIST_HEAD(&fc->processing); 478 INIT_LIST_HEAD(&fc->io);
480 INIT_LIST_HEAD(&fc->io); 479 INIT_LIST_HEAD(&fc->interrupts);
481 INIT_LIST_HEAD(&fc->interrupts); 480 INIT_LIST_HEAD(&fc->bg_queue);
482 INIT_LIST_HEAD(&fc->bg_queue); 481 INIT_LIST_HEAD(&fc->entry);
483 atomic_set(&fc->num_waiting, 0); 482 atomic_set(&fc->num_waiting, 0);
484 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 483 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
485 fc->bdi.unplug_io_fn = default_unplug_io_fn; 484 fc->bdi.unplug_io_fn = default_unplug_io_fn;
486 /* fuse does it's own writeback accounting */ 485 /* fuse does it's own writeback accounting */
487 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 486 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
488 fc->dev = sb->s_dev; 487 fc->khctr = 0;
489 err = bdi_init(&fc->bdi); 488 fc->polled_files = RB_ROOT;
490 if (err) 489 fc->dev = sb->s_dev;
491 goto error_kfree; 490 err = bdi_init(&fc->bdi);
492 if (sb->s_bdev) { 491 if (err)
493 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", 492 goto error_mutex_destroy;
494 MAJOR(fc->dev), MINOR(fc->dev)); 493 if (sb->s_bdev) {
495 } else { 494 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
496 err = bdi_register_dev(&fc->bdi, fc->dev); 495 MAJOR(fc->dev), MINOR(fc->dev));
497 } 496 } else {
498 if (err) 497 err = bdi_register_dev(&fc->bdi, fc->dev);
499 goto error_bdi_destroy;
500 /*
501 * For a single fuse filesystem use max 1% of dirty +
502 * writeback threshold.
503 *
504 * This gives about 1M of write buffer for memory maps on a
505 * machine with 1G and 10% dirty_ratio, which should be more
506 * than enough.
507 *
508 * Privileged users can raise it by writing to
509 *
510 * /sys/class/bdi/<bdi>/max_ratio
511 */
512 bdi_set_max_ratio(&fc->bdi, 1);
513 fc->reqctr = 0;
514 fc->blocked = 1;
515 fc->attr_version = 1;
516 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
517 } 498 }
518 return fc; 499 if (err)
500 goto error_bdi_destroy;
501 /*
502 * For a single fuse filesystem use max 1% of dirty +
503 * writeback threshold.
504 *
505 * This gives about 1M of write buffer for memory maps on a
506 * machine with 1G and 10% dirty_ratio, which should be more
507 * than enough.
508 *
509 * Privileged users can raise it by writing to
510 *
511 * /sys/class/bdi/<bdi>/max_ratio
512 */
513 bdi_set_max_ratio(&fc->bdi, 1);
514 fc->reqctr = 0;
515 fc->blocked = 1;
516 fc->attr_version = 1;
517 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
519 518
520error_bdi_destroy: 519 return 0;
520
521 error_bdi_destroy:
521 bdi_destroy(&fc->bdi); 522 bdi_destroy(&fc->bdi);
522error_kfree: 523 error_mutex_destroy:
523 mutex_destroy(&fc->inst_mutex); 524 mutex_destroy(&fc->inst_mutex);
524 kfree(fc); 525 return err;
525 return NULL;
526} 526}
527EXPORT_SYMBOL_GPL(fuse_conn_init);
527 528
528void fuse_conn_put(struct fuse_conn *fc) 529void fuse_conn_put(struct fuse_conn *fc)
529{ 530{
@@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
532 fuse_request_free(fc->destroy_req); 533 fuse_request_free(fc->destroy_req);
533 mutex_destroy(&fc->inst_mutex); 534 mutex_destroy(&fc->inst_mutex);
534 bdi_destroy(&fc->bdi); 535 bdi_destroy(&fc->bdi);
535 kfree(fc); 536 fc->release(fc);
536 } 537 }
537} 538}
538 539
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
542 return fc; 543 return fc;
543} 544}
544 545
545static struct inode *get_root_inode(struct super_block *sb, unsigned mode) 546static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
546{ 547{
547 struct fuse_attr attr; 548 struct fuse_attr attr;
548 memset(&attr, 0, sizeof(attr)); 549 memset(&attr, 0, sizeof(attr));
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
553 return fuse_iget(sb, 1, 0, &attr, 0, 0); 554 return fuse_iget(sb, 1, 0, &attr, 0, 0);
554} 555}
555 556
556struct fuse_inode_handle 557struct fuse_inode_handle {
557{
558 u64 nodeid; 558 u64 nodeid;
559 u32 generation; 559 u32 generation;
560}; 560};
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
761 fc->max_write = max_t(unsigned, 4096, fc->max_write); 761 fc->max_write = max_t(unsigned, 4096, fc->max_write);
762 fc->conn_init = 1; 762 fc->conn_init = 1;
763 } 763 }
764 fuse_put_request(fc, req);
765 fc->blocked = 0; 764 fc->blocked = 0;
766 wake_up_all(&fc->blocked_waitq); 765 wake_up_all(&fc->blocked_waitq);
767} 766}
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
787 req->out.args[0].size = sizeof(struct fuse_init_out); 786 req->out.args[0].size = sizeof(struct fuse_init_out);
788 req->out.args[0].value = &req->misc.init_out; 787 req->out.args[0].value = &req->misc.init_out;
789 req->end = process_init_reply; 788 req->end = process_init_reply;
790 request_send_background(fc, req); 789 fuse_request_send_background(fc, req);
790}
791
792static void fuse_free_conn(struct fuse_conn *fc)
793{
794 kfree(fc);
791} 795}
792 796
793static int fuse_fill_super(struct super_block *sb, void *data, int silent) 797static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
828 if (file->f_op != &fuse_dev_operations) 832 if (file->f_op != &fuse_dev_operations)
829 return -EINVAL; 833 return -EINVAL;
830 834
831 fc = new_conn(sb); 835 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
832 if (!fc) 836 if (!fc)
833 return -ENOMEM; 837 return -ENOMEM;
834 838
839 err = fuse_conn_init(fc, sb);
840 if (err) {
841 kfree(fc);
842 return err;
843 }
844
845 fc->release = fuse_free_conn;
835 fc->flags = d.flags; 846 fc->flags = d.flags;
836 fc->user_id = d.user_id; 847 fc->user_id = d.user_id;
837 fc->group_id = d.group_id; 848 fc->group_id = d.group_id;
@@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
841 sb->s_fs_info = fc; 852 sb->s_fs_info = fc;
842 853
843 err = -ENOMEM; 854 err = -ENOMEM;
844 root = get_root_inode(sb, d.rootmode); 855 root = fuse_get_root_inode(sb, d.rootmode);
845 if (!root) 856 if (!root)
846 goto err; 857 goto err;
847 858
@@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void)
952 963
953static void fuse_inode_init_once(void *foo) 964static void fuse_inode_init_once(void *foo)
954{ 965{
955 struct inode * inode = foo; 966 struct inode *inode = foo;
956 967
957 inode_init_once(inode); 968 inode_init_once(inode);
958} 969}
@@ -1031,7 +1042,7 @@ static int __init fuse_init(void)
1031{ 1042{
1032 int res; 1043 int res;
1033 1044
1034 printk("fuse init (API version %i.%i)\n", 1045 printk(KERN_INFO "fuse init (API version %i.%i)\n",
1035 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); 1046 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
1036 1047
1037 INIT_LIST_HEAD(&fuse_conn_list); 1048 INIT_LIST_HEAD(&fuse_conn_list);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb87..e563a6449811 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) 3 depends on EXPERIMENTAL && (64BIT || LBD)
4 select FS_POSIX_ACL 4 select FS_POSIX_ACL
5 select CRC32 5 select CRC32
6 help 6 help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80a..c1b4ec6a9650 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o 1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e3..e335dceb6a4f 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
91 struct gfs2_ea_location el_this; 91 struct gfs2_ea_location el_this;
92 int error; 92 int error;
93 93
94 if (!ip->i_di.di_eattr) 94 if (!ip->i_eattr)
95 return 0; 95 return 0;
96 96
97 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb0..11ffc56f1f81 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
75 void *kaddr = kmap(page); 75 void *kaddr = kmap(page);
76 76
77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
78 ip->i_di.di_size); 78 ip->i_disksize);
79 memset(kaddr + ip->i_di.di_size, 0, 79 memset(kaddr + ip->i_disksize, 0,
80 PAGE_CACHE_SIZE - ip->i_di.di_size); 80 PAGE_CACHE_SIZE - ip->i_disksize);
81 kunmap(page); 81 kunmap(page);
82 82
83 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 132 if (error)
133 goto out; 133 goto out;
134 134
135 if (ip->i_di.di_size) { 135 if (ip->i_disksize) {
136 /* Get a free block, fill it with the stuffed data, 136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 137 and write it out to disk */
138 138
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
159 di = (struct gfs2_dinode *)dibh->b_data; 159 di = (struct gfs2_dinode *)dibh->b_data;
160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
161 161
162 if (ip->i_di.di_size) { 162 if (ip->i_disksize) {
163 *(__be64 *)(di + 1) = cpu_to_be64(block); 163 *(__be64 *)(di + 1) = cpu_to_be64(block);
164 gfs2_add_inode_blocks(&ip->i_inode, 1); 164 gfs2_add_inode_blocks(&ip->i_inode, 1);
165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
926 } 926 }
927 } 927 }
928 928
929 ip->i_di.di_size = size; 929 ip->i_disksize = size;
930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
931 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 931 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
932 gfs2_dinode_out(ip, dibh->b_data); 932 gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1033 goto out; 1033 goto out;
1034 1034
1035 if (gfs2_is_stuffed(ip)) { 1035 if (gfs2_is_stuffed(ip)) {
1036 ip->i_di.di_size = size; 1036 ip->i_disksize = size;
1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1039 gfs2_dinode_out(ip, dibh->b_data); 1039 gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
1046 1046
1047 if (!error) { 1047 if (!error) {
1048 ip->i_di.di_size = size; 1048 ip->i_disksize = size;
1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1050 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; 1050 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1052 gfs2_dinode_out(ip, dibh->b_data); 1052 gfs2_dinode_out(ip, dibh->b_data);
1053 } 1053 }
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip)
1114 if (error) 1114 if (error)
1115 goto out; 1115 goto out;
1116 1116
1117 if (!ip->i_di.di_size) { 1117 if (!ip->i_disksize) {
1118 ip->i_height = 0; 1118 ip->i_height = 0;
1119 ip->i_goal = ip->i_no_addr; 1119 ip->i_goal = ip->i_no_addr;
1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1121 } 1121 }
1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1123 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; 1123 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1124 1124
1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1126 gfs2_dinode_out(ip, dibh->b_data); 1126 gfs2_dinode_out(ip, dibh->b_data);
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
1206 return -EINVAL; 1206 return -EINVAL;
1207 1207
1208 if (size > ip->i_di.di_size) 1208 if (size > ip->i_disksize)
1209 error = do_grow(ip, size); 1209 error = do_grow(ip, size);
1210 else if (size < ip->i_di.di_size) 1210 else if (size < ip->i_disksize)
1211 error = do_shrink(ip, size); 1211 error = do_shrink(ip, size);
1212 else 1212 else
1213 /* update time stamps */ 1213 /* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1219int gfs2_truncatei_resume(struct gfs2_inode *ip) 1219int gfs2_truncatei_resume(struct gfs2_inode *ip)
1220{ 1220{
1221 int error; 1221 int error;
1222 error = trunc_dealloc(ip, ip->i_di.di_size); 1222 error = trunc_dealloc(ip, ip->i_disksize);
1223 if (!error) 1223 if (!error)
1224 error = trunc_end(ip); 1224 error = trunc_end(ip);
1225 return error; 1225 return error;
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1231} 1231}
1232 1232
1233/** 1233/**
1234 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1235 * @ip: the file
1236 * @len: the number of bytes to be written to the file
1237 * @data_blocks: returns the number of data blocks required
1238 * @ind_blocks: returns the number of indirect blocks required
1239 *
1240 */
1241
1242void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1243 unsigned int *data_blocks, unsigned int *ind_blocks)
1244{
1245 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1246 unsigned int tmp;
1247
1248 if (gfs2_is_dir(ip)) {
1249 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1250 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1251 } else {
1252 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1253 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1254 }
1255
1256 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1257 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1258 *ind_blocks += tmp;
1259 }
1260}
1261
1262/**
1263 * gfs2_write_alloc_required - figure out if a write will require an allocation 1234 * gfs2_write_alloc_required - figure out if a write will require an allocation
1264 * @ip: the file being written to 1235 * @ip: the file being written to
1265 * @offset: the offset to write to 1236 * @offset: the offset to write to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1276 struct buffer_head bh; 1247 struct buffer_head bh;
1277 unsigned int shift; 1248 unsigned int shift;
1278 u64 lblock, lblock_stop, size; 1249 u64 lblock, lblock_stop, size;
1250 u64 end_of_file;
1279 1251
1280 *alloc_required = 0; 1252 *alloc_required = 0;
1281 1253
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1291 1263
1292 *alloc_required = 1; 1264 *alloc_required = 1;
1293 shift = sdp->sd_sb.sb_bsize_shift; 1265 shift = sdp->sd_sb.sb_bsize_shift;
1294 if (gfs2_is_dir(ip)) { 1266 BUG_ON(gfs2_is_dir(ip));
1295 unsigned int bsize = sdp->sd_jbsize; 1267 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
1296 lblock = offset; 1268 lblock = offset >> shift;
1297 do_div(lblock, bsize); 1269 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1298 lblock_stop = offset + len + bsize - 1; 1270 if (lblock_stop > end_of_file)
1299 do_div(lblock_stop, bsize); 1271 return 0;
1300 } else {
1301 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
1302 lblock = offset >> shift;
1303 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1304 if (lblock_stop > end_of_file)
1305 return 0;
1306 }
1307 1272
1308 size = (lblock_stop - lblock) << shift; 1273 size = (lblock_stop - lblock) << shift;
1309 do { 1274 do {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943bd..c983177e05ac 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
10#ifndef __BMAP_DOT_H__ 10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__ 11#define __BMAP_DOT_H__
12 12
13#include "inode.h"
14
13struct inode; 15struct inode;
14struct gfs2_inode; 16struct gfs2_inode;
15struct page; 17struct page;
16 18
19
20/**
21 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
22 * @ip: the file
23 * @len: the number of bytes to be written to the file
24 * @data_blocks: returns the number of data blocks required
25 * @ind_blocks: returns the number of indirect blocks required
26 *
27 */
28
29static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
30 unsigned int len,
31 unsigned int *data_blocks,
32 unsigned int *ind_blocks)
33{
34 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
35 unsigned int tmp;
36
37 BUG_ON(gfs2_is_dir(ip));
38 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
39 *ind_blocks = 3 * (sdp->sd_max_height - 1);
40
41 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
42 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
43 *ind_blocks += tmp;
44 }
45}
46
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
22int gfs2_truncatei_resume(struct gfs2_inode *ip); 52int gfs2_truncatei_resume(struct gfs2_inode *ip);
23int gfs2_file_dealloc(struct gfs2_inode *ip); 53int gfs2_file_dealloc(struct gfs2_inode *ip);
24
25void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
26 unsigned int *data_blocks,
27 unsigned int *ind_blocks);
28int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
29 unsigned int len, int *alloc_required); 55 unsigned int len, int *alloc_required);
30 56
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index e51991947d2c..000000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <linux/freezer.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "daemon.h"
24#include "glock.h"
25#include "log.h"
26#include "quota.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30
31/* This uses schedule_timeout() instead of msleep() because it's good for
32 the daemons to wake up more often than the timeout when unmounting so
33 the user's unmount doesn't sit there forever.
34
35 The kthread functions used to start these daemons block and flush signals. */
36
37/**
38 * gfs2_glockd - Reclaim unused glock structures
39 * @sdp: Pointer to GFS2 superblock
40 *
41 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
42 * Number of daemons can be set by user, with num_glockd mount option.
43 */
44
45int gfs2_glockd(void *data)
46{
47 struct gfs2_sbd *sdp = data;
48
49 while (!kthread_should_stop()) {
50 while (atomic_read(&sdp->sd_reclaim_count))
51 gfs2_reclaim_glock(sdp);
52
53 wait_event_interruptible(sdp->sd_reclaim_wq,
54 (atomic_read(&sdp->sd_reclaim_count) ||
55 kthread_should_stop()));
56 if (freezing(current))
57 refrigerator();
58 }
59
60 return 0;
61}
62
63/**
64 * gfs2_recoverd - Recover dead machine's journals
65 * @sdp: Pointer to GFS2 superblock
66 *
67 */
68
69int gfs2_recoverd(void *data)
70{
71 struct gfs2_sbd *sdp = data;
72 unsigned long t;
73
74 while (!kthread_should_stop()) {
75 gfs2_check_journals(sdp);
76 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
77 if (freezing(current))
78 refrigerator();
79 schedule_timeout_interruptible(t);
80 }
81
82 return 0;
83}
84
85/**
86 * gfs2_quotad - Write cached quota changes into the quota file
87 * @sdp: Pointer to GFS2 superblock
88 *
89 */
90
91int gfs2_quotad(void *data)
92{
93 struct gfs2_sbd *sdp = data;
94 unsigned long t;
95 int error;
96
97 while (!kthread_should_stop()) {
98 /* Update the master statfs file */
99
100 t = sdp->sd_statfs_sync_time +
101 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
102
103 if (time_after_eq(jiffies, t)) {
104 error = gfs2_statfs_sync(sdp);
105 if (error &&
106 error != -EROFS &&
107 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
108 fs_err(sdp, "quotad: (1) error=%d\n", error);
109 sdp->sd_statfs_sync_time = jiffies;
110 }
111
112 /* Update quota file */
113
114 t = sdp->sd_quota_sync_time +
115 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
116
117 if (time_after_eq(jiffies, t)) {
118 error = gfs2_quota_sync(sdp);
119 if (error &&
120 error != -EROFS &&
121 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
122 fs_err(sdp, "quotad: (2) error=%d\n", error);
123 sdp->sd_quota_sync_time = jiffies;
124 }
125
126 gfs2_quota_scan(sdp);
127
128 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
129 if (freezing(current))
130 refrigerator();
131 schedule_timeout_interruptible(t);
132 }
133
134 return 0;
135}
136
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 4be084fb6a62..000000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_glockd(void *data);
14int gfs2_recoverd(void *data);
15int gfs2_quotad(void *data);
16
17#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3a..b7c8e5c70791 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the 36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when 37 * beginning of the leaf block. The dirents reside in leaves when
38 * 38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true 39 * dip->i_diskflags & GFS2_DIF_EXHASH is true
40 * 40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block. 41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 * 42 *
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
128 128
129 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 129 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
131 if (ip->i_di.di_size < offset + size) 131 if (ip->i_disksize < offset + size)
132 ip->i_di.di_size = offset + size; 132 ip->i_disksize = offset + size;
133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
134 gfs2_dinode_out(ip, dibh->b_data); 134 gfs2_dinode_out(ip, dibh->b_data);
135 135
@@ -226,8 +226,8 @@ out:
226 if (error) 226 if (error)
227 return error; 227 return error;
228 228
229 if (ip->i_di.di_size < offset + copied) 229 if (ip->i_disksize < offset + copied)
230 ip->i_di.di_size = offset + copied; 230 ip->i_disksize = offset + copied;
231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
232 232
233 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 233 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
277 int copied = 0; 277 int copied = 0;
278 int error = 0; 278 int error = 0;
279 279
280 if (offset >= ip->i_di.di_size) 280 if (offset >= ip->i_disksize)
281 return 0; 281 return 0;
282 282
283 if (offset + size > ip->i_di.di_size) 283 if (offset + size > ip->i_disksize)
284 size = ip->i_di.di_size - offset; 284 size = ip->i_disksize - offset;
285 285
286 if (!size) 286 if (!size)
287 return 0; 287 return 0;
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
755 struct gfs2_inode *ip = GFS2_I(inode); 755 struct gfs2_inode *ip = GFS2_I(inode);
756 int error; 756 int error;
757 757
758 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 758 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
759 struct gfs2_leaf *leaf; 759 struct gfs2_leaf *leaf;
760 unsigned hsize = 1 << ip->i_depth; 760 unsigned hsize = 1 << ip->i_depth;
761 unsigned index; 761 unsigned index;
762 u64 ln; 762 u64 ln;
763 if (hsize * sizeof(u64) != ip->i_di.di_size) { 763 if (hsize * sizeof(u64) != ip->i_disksize) {
764 gfs2_consist_inode(ip); 764 gfs2_consist_inode(ip);
765 return ERR_PTR(-EIO); 765 return ERR_PTR(-EIO);
766 } 766 }
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
858 return -ENOSPC; 858 return -ENOSPC;
859 bn = bh->b_blocknr; 859 bn = bh->b_blocknr;
860 860
861 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); 861 gfs2_assert(sdp, dip->i_entries < (1 << 16));
862 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); 862 leaf->lf_entries = cpu_to_be16(dip->i_entries);
863 863
864 /* Copy dirents */ 864 /* Copy dirents */
865 865
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode)
905 for (x = sdp->sd_hash_ptrs; x--; lp++) 905 for (x = sdp->sd_hash_ptrs; x--; lp++)
906 *lp = cpu_to_be64(bn); 906 *lp = cpu_to_be64(bn);
907 907
908 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; 908 dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
909 gfs2_add_inode_blocks(&dip->i_inode, 1); 909 gfs2_add_inode_blocks(&dip->i_inode, 1);
910 dip->i_di.di_flags |= GFS2_DIF_EXHASH; 910 dip->i_diskflags |= GFS2_DIF_EXHASH;
911 911
912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; 912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
913 dip->i_depth = y; 913 dip->i_depth = y;
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1082 int error = 0; 1082 int error = 0;
1083 1083
1084 hsize = 1 << dip->i_depth; 1084 hsize = 1 << dip->i_depth;
1085 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1085 if (hsize * sizeof(u64) != dip->i_disksize) {
1086 gfs2_consist_inode(dip); 1086 gfs2_consist_inode(dip);
1087 return -EIO; 1087 return -EIO;
1088 } 1088 }
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1091 1091
1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); 1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
1093 1093
1094 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { 1094 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
1095 error = gfs2_dir_read_data(dip, (char *)buf, 1095 error = gfs2_dir_read_data(dip, (char *)buf,
1096 block * sdp->sd_hash_bsize, 1096 block * sdp->sd_hash_bsize,
1097 sdp->sd_hash_bsize, 1); 1097 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1370 unsigned depth = 0;
1371 1371
1372 hsize = 1 << dip->i_depth; 1372 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1373 if (hsize * sizeof(u64) != dip->i_disksize) {
1374 gfs2_consist_inode(dip); 1374 gfs2_consist_inode(dip);
1375 return -EIO; 1375 return -EIO;
1376 } 1376 }
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1426 int copied = 0; 1426 int copied = 0;
1427 int error; 1427 int error;
1428 1428
1429 if (!dip->i_di.di_entries) 1429 if (!dip->i_entries)
1430 return 0; 1430 return 0;
1431 1431
1432 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) 1432 if (dip->i_diskflags & GFS2_DIF_EXHASH)
1433 return dir_e_read(inode, offset, opaque, filldir); 1433 return dir_e_read(inode, offset, opaque, filldir);
1434 1434
1435 if (!gfs2_is_stuffed(dip)) { 1435 if (!gfs2_is_stuffed(dip)) {
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1453 error = PTR_ERR(dent); 1453 error = PTR_ERR(dent);
1454 goto out; 1454 goto out;
1455 } 1455 }
1456 if (dip->i_di.di_entries != g.offset) { 1456 if (dip->i_entries != g.offset) {
1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, " 1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n", 1458 "ip->i_entries (%u) != g.offset (%u)\n",
1459 (unsigned long long)dip->i_no_addr, 1459 (unsigned long long)dip->i_no_addr,
1460 dip->i_di.di_entries, 1460 dip->i_entries,
1461 g.offset); 1461 g.offset);
1462 error = -EIO; 1462 error = -EIO;
1463 goto out; 1463 goto out;
1464 } 1464 }
1465 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1465 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1466 dip->i_di.di_entries, &copied); 1466 dip->i_entries, &copied);
1467out: 1467out:
1468 kfree(darr); 1468 kfree(darr);
1469 } 1469 }
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1612 dent = gfs2_init_dirent(inode, dent, name, bh); 1612 dent = gfs2_init_dirent(inode, dent, name, bh);
1613 gfs2_inum_out(nip, dent); 1613 gfs2_inum_out(nip, dent);
1614 dent->de_type = cpu_to_be16(type); 1614 dent->de_type = cpu_to_be16(type);
1615 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 1615 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1616 leaf = (struct gfs2_leaf *)bh->b_data; 1616 leaf = (struct gfs2_leaf *)bh->b_data;
1617 be16_add_cpu(&leaf->lf_entries, 1); 1617 be16_add_cpu(&leaf->lf_entries, 1);
1618 } 1618 }
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1621 if (error) 1621 if (error)
1622 break; 1622 break;
1623 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1623 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1624 ip->i_di.di_entries++; 1624 ip->i_entries++;
1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1626 gfs2_dinode_out(ip, bh->b_data); 1626 gfs2_dinode_out(ip, bh->b_data);
1627 brelse(bh); 1627 brelse(bh);
1628 error = 0; 1628 error = 0;
1629 break; 1629 break;
1630 } 1630 }
1631 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 1631 if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
1632 error = dir_make_exhash(inode); 1632 error = dir_make_exhash(inode);
1633 if (error) 1633 if (error)
1634 break; 1634 break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1691 } 1691 }
1692 1692
1693 dirent_del(dip, bh, prev, dent); 1693 dirent_del(dip, bh, prev, dent);
1694 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1694 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; 1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1696 u16 entries = be16_to_cpu(leaf->lf_entries); 1696 u16 entries = be16_to_cpu(leaf->lf_entries);
1697 if (!entries) 1697 if (!entries)
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1704 if (error) 1704 if (error)
1705 return error; 1705 return error;
1706 1706
1707 if (!dip->i_di.di_entries) 1707 if (!dip->i_entries)
1708 gfs2_consist_inode(dip); 1708 gfs2_consist_inode(dip);
1709 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1709 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1710 dip->i_di.di_entries--; 1710 dip->i_entries--;
1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1712 gfs2_dinode_out(dip, bh->b_data); 1712 gfs2_dinode_out(dip, bh->b_data);
1713 brelse(bh); 1713 brelse(bh);
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1748 gfs2_inum_out(nip, dent); 1748 gfs2_inum_out(nip, dent);
1749 dent->de_type = cpu_to_be16(new_type); 1749 dent->de_type = cpu_to_be16(new_type);
1750 1750
1751 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1751 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1752 brelse(bh); 1752 brelse(bh);
1753 error = gfs2_meta_inode_buffer(dip, &bh); 1753 error = gfs2_meta_inode_buffer(dip, &bh);
1754 if (error) 1754 if (error)
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1784 int error = 0;
1785 1785
1786 hsize = 1 << dip->i_depth; 1786 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1787 if (hsize * sizeof(u64) != dip->i_disksize) {
1788 gfs2_consist_inode(dip); 1788 gfs2_consist_inode(dip);
1789 return -EIO; 1789 return -EIO;
1790 } 1790 }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac9328..4f919440c3be 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
11#define __DIR_DOT_H__ 11#define __DIR_DOT_H__
12 12
13#include <linux/dcache.h> 13#include <linux/dcache.h>
14#include <linux/crc32.h>
14 15
15struct inode; 16struct inode;
16struct gfs2_inode; 17struct gfs2_inode;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0a..0d1c76d906ae 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
114 __be64 *eablk, *end; 114 __be64 *eablk, *end;
115 int error; 115 int error;
116 116
117 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); 117 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
118 if (error) 118 if (error)
119 return error; 119 return error;
120 120
121 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { 121 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
122 error = ea_foreach_i(ip, bh, ea_call, data); 122 error = ea_foreach_i(ip, bh, ea_call, data);
123 goto out; 123 goto out;
124 } 124 }
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
414 if (error) 414 if (error)
415 return error; 415 return error;
416 416
417 if (ip->i_di.di_eattr) { 417 if (ip->i_eattr) {
418 struct ea_list ei = { .ei_er = er, .ei_size = 0 }; 418 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
419 419
420 error = ea_foreach(ip, ea_list_i, &ei); 420 error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
514 struct gfs2_ea_location el; 514 struct gfs2_ea_location el;
515 int error; 515 int error;
516 516
517 if (!ip->i_di.di_eattr) 517 if (!ip->i_eattr)
518 return -ENODATA; 518 return -ENODATA;
519 519
520 error = gfs2_ea_find(ip, er, &el); 520 error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
741 if (error) 741 if (error)
742 return error; 742 return error;
743 743
744 ip->i_di.di_eattr = bh->b_blocknr; 744 ip->i_eattr = bh->b_blocknr;
745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); 745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
746 746
747 brelse(bh); 747 brelse(bh);
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
935 int error; 935 int error;
936 int mh_size = sizeof(struct gfs2_meta_header); 936 int mh_size = sizeof(struct gfs2_meta_header);
937 937
938 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 938 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
939 __be64 *end; 939 __be64 *end;
940 940
941 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, 941 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
942 &indbh); 942 &indbh);
943 if (error) 943 if (error)
944 return error; 944 return error;
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
972 gfs2_buffer_clear_tail(indbh, mh_size); 972 gfs2_buffer_clear_tail(indbh, mh_size);
973 973
974 eablk = (__be64 *)(indbh->b_data + mh_size); 974 eablk = (__be64 *)(indbh->b_data + mh_size);
975 *eablk = cpu_to_be64(ip->i_di.di_eattr); 975 *eablk = cpu_to_be64(ip->i_eattr);
976 ip->i_di.di_eattr = blk; 976 ip->i_eattr = blk;
977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; 977 ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
978 gfs2_add_inode_blocks(&ip->i_inode, 1); 978 gfs2_add_inode_blocks(&ip->i_inode, 1);
979 979
980 eablk++; 980 eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1015 if (error) 1015 if (error)
1016 return error; 1016 return error;
1017 1017
1018 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) 1018 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
1019 blks++; 1019 blks++;
1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) 1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); 1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1040 struct gfs2_ea_location el; 1040 struct gfs2_ea_location el;
1041 int error; 1041 int error;
1042 1042
1043 if (!ip->i_di.di_eattr) { 1043 if (!ip->i_eattr) {
1044 if (er->er_flags & XATTR_REPLACE) 1044 if (er->er_flags & XATTR_REPLACE)
1045 return -ENODATA; 1045 return -ENODATA;
1046 return ea_init(ip, er); 1046 return ea_init(ip, er);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1051 return error; 1051 return error;
1052 1052
1053 if (el.el_ea) { 1053 if (el.el_ea) {
1054 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { 1054 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1055 brelse(el.el_bh); 1055 brelse(el.el_bh);
1056 return -EPERM; 1056 return -EPERM;
1057 } 1057 }
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1145 struct gfs2_ea_location el; 1145 struct gfs2_ea_location el;
1146 int error; 1146 int error;
1147 1147
1148 if (!ip->i_di.di_eattr) 1148 if (!ip->i_eattr)
1149 return -ENODATA; 1149 return -ENODATA;
1150 1150
1151 error = gfs2_ea_find(ip, er, &el); 1151 error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1309 1309
1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1311 1311
1312 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); 1312 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
1313 if (error) 1313 if (error)
1314 return error; 1314 return error;
1315 1315
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1388 if (bstart) 1388 if (bstart)
1389 gfs2_free_meta(ip, bstart, blen); 1389 gfs2_free_meta(ip, bstart, blen);
1390 1390
1391 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; 1391 ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
1392 1392
1393 error = gfs2_meta_inode_buffer(ip, &dibh); 1393 error = gfs2_meta_inode_buffer(ip, &dibh);
1394 if (!error) { 1394 if (!error) {
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1416 struct buffer_head *dibh; 1416 struct buffer_head *dibh;
1417 int error; 1417 int error;
1418 1418
1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); 1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
1420 if (!rgd) { 1420 if (!rgd) {
1421 gfs2_consist_inode(ip); 1421 gfs2_consist_inode(ip);
1422 return -EIO; 1422 return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1432 if (error) 1432 if (error)
1433 goto out_gunlock; 1433 goto out_gunlock;
1434 1434
1435 gfs2_free_meta(ip, ip->i_di.di_eattr, 1); 1435 gfs2_free_meta(ip, ip->i_eattr, 1);
1436 1436
1437 ip->i_di.di_eattr = 0; 1437 ip->i_eattr = 0;
1438 gfs2_add_inode_blocks(&ip->i_inode, -1); 1438 gfs2_add_inode_blocks(&ip->i_inode, -1);
1439 1439
1440 error = gfs2_meta_inode_buffer(ip, &dibh); 1440 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
1479 if (error) 1479 if (error)
1480 goto out_rindex; 1480 goto out_rindex;
1481 1481
1482 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 1482 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
1483 error = ea_dealloc_indirect(ip); 1483 error = ea_dealloc_indirect(ip);
1484 if (error) 1484 if (error)
1485 goto out_rindex; 1485 goto out_rindex;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7f..6b983aef785d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
40#include "quota.h" 40#include "quota.h"
41#include "super.h" 41#include "super.h"
42#include "util.h" 42#include "util.h"
43#include "bmap.h"
43 44
44struct gfs2_gl_hash_bucket { 45struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list; 46 struct hlist_head hb_list;
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
61 62
62static DECLARE_RWSEM(gfs2_umount_flush_sem); 63static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root; 64static struct dentry *gfs2_root;
64static struct task_struct *scand_process;
65static unsigned int scand_secs = 5;
66static struct workqueue_struct *glock_workqueue; 65static struct workqueue_struct *glock_workqueue;
66static LIST_HEAD(lru_list);
67static atomic_t lru_count = ATOMIC_INIT(0);
68static DEFINE_SPINLOCK(lru_lock);
67 69
68#define GFS2_GL_HASH_SHIFT 15 70#define GFS2_GL_HASH_SHIFT 15
69#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 71#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
174} 176}
175 177
176/** 178/**
179 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
180 * @gl: the glock
181 *
182 */
183
184static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
185{
186 spin_lock(&lru_lock);
187 if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
188 list_add_tail(&gl->gl_lru, &lru_list);
189 atomic_inc(&lru_count);
190 }
191 spin_unlock(&lru_lock);
192}
193
194/**
177 * gfs2_glock_put() - Decrement reference count on glock 195 * gfs2_glock_put() - Decrement reference count on glock
178 * @gl: The glock to put 196 * @gl: The glock to put
179 * 197 *
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
187 if (atomic_dec_and_test(&gl->gl_ref)) { 205 if (atomic_dec_and_test(&gl->gl_ref)) {
188 hlist_del(&gl->gl_list); 206 hlist_del(&gl->gl_list);
189 write_unlock(gl_lock_addr(gl->gl_hash)); 207 write_unlock(gl_lock_addr(gl->gl_hash));
208 spin_lock(&lru_lock);
209 if (!list_empty(&gl->gl_lru)) {
210 list_del_init(&gl->gl_lru);
211 atomic_dec(&lru_count);
212 }
213 spin_unlock(&lru_lock);
190 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); 214 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
191 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); 215 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
192 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 216 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
193 glock_free(gl); 217 glock_free(gl);
194 rv = 1; 218 rv = 1;
195 goto out; 219 goto out;
196 } 220 }
197 write_unlock(gl_lock_addr(gl->gl_hash)); 221 write_unlock(gl_lock_addr(gl->gl_hash));
222 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
223 if (atomic_read(&gl->gl_ref) == 2)
224 gfs2_glock_schedule_for_reclaim(gl);
198out: 225out:
199 return rv; 226 return rv;
200} 227}
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
289 * do_promote - promote as many requests as possible on the current queue 316 * do_promote - promote as many requests as possible on the current queue
290 * @gl: The glock 317 * @gl: The glock
291 * 318 *
292 * Returns: true if there is a blocked holder at the head of the list 319 * Returns: 1 if there is a blocked holder at the head of the list, or 2
320 * if a type specific operation is underway.
293 */ 321 */
294 322
295static int do_promote(struct gfs2_glock *gl) 323static int do_promote(struct gfs2_glock *gl)
324__releases(&gl->gl_spin)
325__acquires(&gl->gl_spin)
296{ 326{
297 const struct gfs2_glock_operations *glops = gl->gl_ops; 327 const struct gfs2_glock_operations *glops = gl->gl_ops;
298 struct gfs2_holder *gh, *tmp; 328 struct gfs2_holder *gh, *tmp;
@@ -310,6 +340,8 @@ restart:
310 ret = glops->go_lock(gh); 340 ret = glops->go_lock(gh);
311 spin_lock(&gl->gl_spin); 341 spin_lock(&gl->gl_spin);
312 if (ret) { 342 if (ret) {
343 if (ret == 1)
344 return 2;
313 gh->gh_error = ret; 345 gh->gh_error = ret;
314 list_del_init(&gh->gh_list); 346 list_del_init(&gh->gh_list);
315 gfs2_holder_wake(gh); 347 gfs2_holder_wake(gh);
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
414 const struct gfs2_glock_operations *glops = gl->gl_ops; 446 const struct gfs2_glock_operations *glops = gl->gl_ops;
415 struct gfs2_holder *gh; 447 struct gfs2_holder *gh;
416 unsigned state = ret & LM_OUT_ST_MASK; 448 unsigned state = ret & LM_OUT_ST_MASK;
449 int rv;
417 450
418 spin_lock(&gl->gl_spin); 451 spin_lock(&gl->gl_spin);
419 state_change(gl, state); 452 state_change(gl, state);
@@ -468,7 +501,6 @@ retry:
468 gfs2_demote_wake(gl); 501 gfs2_demote_wake(gl);
469 if (state != LM_ST_UNLOCKED) { 502 if (state != LM_ST_UNLOCKED) {
470 if (glops->go_xmote_bh) { 503 if (glops->go_xmote_bh) {
471 int rv;
472 spin_unlock(&gl->gl_spin); 504 spin_unlock(&gl->gl_spin);
473 rv = glops->go_xmote_bh(gl, gh); 505 rv = glops->go_xmote_bh(gl, gh);
474 if (rv == -EAGAIN) 506 if (rv == -EAGAIN)
@@ -479,10 +511,13 @@ retry:
479 goto out; 511 goto out;
480 } 512 }
481 } 513 }
482 do_promote(gl); 514 rv = do_promote(gl);
515 if (rv == 2)
516 goto out_locked;
483 } 517 }
484out: 518out:
485 clear_bit(GLF_LOCK, &gl->gl_flags); 519 clear_bit(GLF_LOCK, &gl->gl_flags);
520out_locked:
486 spin_unlock(&gl->gl_spin); 521 spin_unlock(&gl->gl_spin);
487 gfs2_glock_put(gl); 522 gfs2_glock_put(gl);
488} 523}
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
511 */ 546 */
512 547
513static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) 548static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
549__releases(&gl->gl_spin)
550__acquires(&gl->gl_spin)
514{ 551{
515 const struct gfs2_glock_operations *glops = gl->gl_ops; 552 const struct gfs2_glock_operations *glops = gl->gl_ops;
516 struct gfs2_sbd *sdp = gl->gl_sbd; 553 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
576 */ 613 */
577 614
578static void run_queue(struct gfs2_glock *gl, const int nonblock) 615static void run_queue(struct gfs2_glock *gl, const int nonblock)
616__releases(&gl->gl_spin)
617__acquires(&gl->gl_spin)
579{ 618{
580 struct gfs2_holder *gh = NULL; 619 struct gfs2_holder *gh = NULL;
620 int ret;
581 621
582 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) 622 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
583 return; 623 return;
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
596 } else { 636 } else {
597 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) 637 if (test_bit(GLF_DEMOTE, &gl->gl_flags))
598 gfs2_demote_wake(gl); 638 gfs2_demote_wake(gl);
599 if (do_promote(gl) == 0) 639 ret = do_promote(gl);
640 if (ret == 0)
600 goto out; 641 goto out;
642 if (ret == 2)
643 return;
601 gh = find_first_waiter(gl); 644 gh = find_first_waiter(gl);
602 gl->gl_target = gh->gh_state; 645 gl->gl_target = gh->gh_state;
603 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 646 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
820 */ 863 */
821 864
822static void handle_callback(struct gfs2_glock *gl, unsigned int state, 865static void handle_callback(struct gfs2_glock *gl, unsigned int state,
823 int remote, unsigned long delay) 866 unsigned long delay)
824{ 867{
825 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; 868 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
826 869
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
828 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { 871 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
829 gl->gl_demote_state = state; 872 gl->gl_demote_state = state;
830 gl->gl_demote_time = jiffies; 873 gl->gl_demote_time = jiffies;
831 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
832 gl->gl_object)
833 gfs2_glock_schedule_for_reclaim(gl);
834 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 874 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
835 gl->gl_demote_state != state) { 875 gl->gl_demote_state != state) {
836 gl->gl_demote_state = LM_ST_UNLOCKED; 876 gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
877 */ 917 */
878 918
879static inline void add_to_queue(struct gfs2_holder *gh) 919static inline void add_to_queue(struct gfs2_holder *gh)
920__releases(&gl->gl_spin)
921__acquires(&gl->gl_spin)
880{ 922{
881 struct gfs2_glock *gl = gh->gh_gl; 923 struct gfs2_glock *gl = gh->gh_gl;
882 struct gfs2_sbd *sdp = gl->gl_sbd; 924 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
998 1040
999 spin_lock(&gl->gl_spin); 1041 spin_lock(&gl->gl_spin);
1000 if (gh->gh_flags & GL_NOCACHE) 1042 if (gh->gh_flags & GL_NOCACHE)
1001 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1043 handle_callback(gl, LM_ST_UNLOCKED, 0);
1002 1044
1003 list_del_init(&gh->gh_list); 1045 list_del_init(&gh->gh_list);
1004 if (find_first_holder(gl) == NULL) { 1046 if (find_first_holder(gl) == NULL) {
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1269 delay = gl->gl_ops->go_min_hold_time; 1311 delay = gl->gl_ops->go_min_hold_time;
1270 1312
1271 spin_lock(&gl->gl_spin); 1313 spin_lock(&gl->gl_spin);
1272 handle_callback(gl, state, 1, delay); 1314 handle_callback(gl, state, delay);
1273 spin_unlock(&gl->gl_spin); 1315 spin_unlock(&gl->gl_spin);
1274 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 1316 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1275 gfs2_glock_put(gl); 1317 gfs2_glock_put(gl);
1276} 1318}
1277 1319
1320static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
1321{
1322 struct gfs2_jdesc *jd;
1323
1324 spin_lock(&sdp->sd_jindex_spin);
1325 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
1326 if (jd->jd_jid != jid)
1327 continue;
1328 jd->jd_dirty = 1;
1329 break;
1330 }
1331 spin_unlock(&sdp->sd_jindex_spin);
1332}
1333
1278/** 1334/**
1279 * gfs2_glock_cb - Callback used by locking module 1335 * gfs2_glock_cb - Callback used by locking module
1280 * @sdp: Pointer to the superblock 1336 * @sdp: Pointer to the superblock
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1338 * Returns: 1 if it's ok 1394 * Returns: 1 if it's ok
1339 */ 1395 */
1340 1396
1341static int demote_ok(struct gfs2_glock *gl) 1397static int demote_ok(const struct gfs2_glock *gl)
1342{ 1398{
1343 const struct gfs2_glock_operations *glops = gl->gl_ops; 1399 const struct gfs2_glock_operations *glops = gl->gl_ops;
1344 int demote = 1;
1345
1346 if (test_bit(GLF_STICKY, &gl->gl_flags))
1347 demote = 0;
1348 else if (glops->go_demote_ok)
1349 demote = glops->go_demote_ok(gl);
1350
1351 return demote;
1352}
1353
1354/**
1355 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1356 * @gl: the glock
1357 *
1358 */
1359
1360void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1361{
1362 struct gfs2_sbd *sdp = gl->gl_sbd;
1363 1400
1364 spin_lock(&sdp->sd_reclaim_lock); 1401 if (gl->gl_state == LM_ST_UNLOCKED)
1365 if (list_empty(&gl->gl_reclaim)) { 1402 return 0;
1366 gfs2_glock_hold(gl); 1403 if (!list_empty(&gl->gl_holders))
1367 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); 1404 return 0;
1368 atomic_inc(&sdp->sd_reclaim_count); 1405 if (glops->go_demote_ok)
1369 spin_unlock(&sdp->sd_reclaim_lock); 1406 return glops->go_demote_ok(gl);
1370 wake_up(&sdp->sd_reclaim_wq); 1407 return 1;
1371 } else
1372 spin_unlock(&sdp->sd_reclaim_lock);
1373} 1408}
1374 1409
1375/**
1376 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1377 * @sdp: the filesystem
1378 *
1379 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1380 * different glock and we notice that there are a lot of glocks in the
1381 * reclaim list.
1382 *
1383 */
1384 1410
1385void gfs2_reclaim_glock(struct gfs2_sbd *sdp) 1411static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1386{ 1412{
1387 struct gfs2_glock *gl; 1413 struct gfs2_glock *gl;
1388 int done_callback = 0; 1414 int may_demote;
1415 int nr_skipped = 0;
1416 int got_ref = 0;
1417 LIST_HEAD(skipped);
1389 1418
1390 spin_lock(&sdp->sd_reclaim_lock); 1419 if (nr == 0)
1391 if (list_empty(&sdp->sd_reclaim_list)) { 1420 goto out;
1392 spin_unlock(&sdp->sd_reclaim_lock);
1393 return;
1394 }
1395 gl = list_entry(sdp->sd_reclaim_list.next,
1396 struct gfs2_glock, gl_reclaim);
1397 list_del_init(&gl->gl_reclaim);
1398 spin_unlock(&sdp->sd_reclaim_lock);
1399 1421
1400 atomic_dec(&sdp->sd_reclaim_count); 1422 if (!(gfp_mask & __GFP_FS))
1401 atomic_inc(&sdp->sd_reclaimed); 1423 return -1;
1402 1424
1403 spin_lock(&gl->gl_spin); 1425 spin_lock(&lru_lock);
1404 if (find_first_holder(gl) == NULL && 1426 while(nr && !list_empty(&lru_list)) {
1405 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { 1427 gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
1406 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1428 list_del_init(&gl->gl_lru);
1407 done_callback = 1; 1429 atomic_dec(&lru_count);
1430
1431 /* Test for being demotable */
1432 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1433 gfs2_glock_hold(gl);
1434 got_ref = 1;
1435 spin_unlock(&lru_lock);
1436 spin_lock(&gl->gl_spin);
1437 may_demote = demote_ok(gl);
1438 spin_unlock(&gl->gl_spin);
1439 clear_bit(GLF_LOCK, &gl->gl_flags);
1440 if (may_demote) {
1441 handle_callback(gl, LM_ST_UNLOCKED, 0);
1442 nr--;
1443 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1444 gfs2_glock_put(gl);
1445 }
1446 spin_lock(&lru_lock);
1447 if (may_demote)
1448 continue;
1449 }
1450 if (list_empty(&gl->gl_lru) &&
1451 (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
1452 nr_skipped++;
1453 list_add(&gl->gl_lru, &skipped);
1454 }
1455 if (got_ref) {
1456 spin_unlock(&lru_lock);
1457 gfs2_glock_put(gl);
1458 spin_lock(&lru_lock);
1459 got_ref = 0;
1460 }
1408 } 1461 }
1409 spin_unlock(&gl->gl_spin); 1462 list_splice(&skipped, &lru_list);
1410 if (!done_callback || 1463 atomic_add(nr_skipped, &lru_count);
1411 queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1464 spin_unlock(&lru_lock);
1412 gfs2_glock_put(gl); 1465out:
1466 return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
1413} 1467}
1414 1468
1469static struct shrinker glock_shrinker = {
1470 .shrink = gfs2_shrink_glock_memory,
1471 .seeks = DEFAULT_SEEKS,
1472};
1473
1415/** 1474/**
1416 * examine_bucket - Call a function for glock in a hash bucket 1475 * examine_bucket - Call a function for glock in a hash bucket
1417 * @examiner: the function 1476 * @examiner: the function
@@ -1457,26 +1516,6 @@ out:
1457} 1516}
1458 1517
1459/** 1518/**
1460 * scan_glock - look at a glock and see if we can reclaim it
1461 * @gl: the glock to look at
1462 *
1463 */
1464
1465static void scan_glock(struct gfs2_glock *gl)
1466{
1467 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
1468 return;
1469 if (test_bit(GLF_LOCK, &gl->gl_flags))
1470 return;
1471
1472 spin_lock(&gl->gl_spin);
1473 if (find_first_holder(gl) == NULL &&
1474 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1475 gfs2_glock_schedule_for_reclaim(gl);
1476 spin_unlock(&gl->gl_spin);
1477}
1478
1479/**
1480 * clear_glock - look at a glock and see if we can free it from glock cache 1519 * clear_glock - look at a glock and see if we can free it from glock cache
1481 * @gl: the glock to look at 1520 * @gl: the glock to look at
1482 * 1521 *
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl)
1484 1523
1485static void clear_glock(struct gfs2_glock *gl) 1524static void clear_glock(struct gfs2_glock *gl)
1486{ 1525{
1487 struct gfs2_sbd *sdp = gl->gl_sbd; 1526 spin_lock(&lru_lock);
1488 int released; 1527 if (!list_empty(&gl->gl_lru)) {
1489 1528 list_del_init(&gl->gl_lru);
1490 spin_lock(&sdp->sd_reclaim_lock); 1529 atomic_dec(&lru_count);
1491 if (!list_empty(&gl->gl_reclaim)) {
1492 list_del_init(&gl->gl_reclaim);
1493 atomic_dec(&sdp->sd_reclaim_count);
1494 spin_unlock(&sdp->sd_reclaim_lock);
1495 released = gfs2_glock_put(gl);
1496 gfs2_assert(sdp, !released);
1497 } else {
1498 spin_unlock(&sdp->sd_reclaim_lock);
1499 } 1530 }
1531 spin_unlock(&lru_lock);
1500 1532
1501 spin_lock(&gl->gl_spin); 1533 spin_lock(&gl->gl_spin);
1502 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1534 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
1503 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1535 handle_callback(gl, LM_ST_UNLOCKED, 0);
1504 spin_unlock(&gl->gl_spin); 1536 spin_unlock(&gl->gl_spin);
1505 gfs2_glock_hold(gl); 1537 gfs2_glock_hold(gl);
1506 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1538 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1548 } 1580 }
1549} 1581}
1550 1582
1583void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
1584{
1585 struct gfs2_glock *gl = ip->i_gl;
1586 int ret;
1587
1588 ret = gfs2_truncatei_resume(ip);
1589 gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
1590
1591 spin_lock(&gl->gl_spin);
1592 clear_bit(GLF_LOCK, &gl->gl_flags);
1593 run_queue(gl, 1);
1594 spin_unlock(&gl->gl_spin);
1595}
1596
1551static const char *state2str(unsigned state) 1597static const char *state2str(unsigned state)
1552{ 1598{
1553 switch(state) { 1599 switch(state) {
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1623 char *p = buf; 1669 char *p = buf;
1624 if (test_bit(GLF_LOCK, gflags)) 1670 if (test_bit(GLF_LOCK, gflags))
1625 *p++ = 'l'; 1671 *p++ = 'l';
1626 if (test_bit(GLF_STICKY, gflags))
1627 *p++ = 's';
1628 if (test_bit(GLF_DEMOTE, gflags)) 1672 if (test_bit(GLF_DEMOTE, gflags))
1629 *p++ = 'D'; 1673 *p++ = 'D';
1630 if (test_bit(GLF_PENDING_DEMOTE, gflags)) 1674 if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1743 return error; 1787 return error;
1744} 1788}
1745 1789
1746/**
1747 * gfs2_scand - Look for cached glocks and inodes to toss from memory
1748 * @sdp: Pointer to GFS2 superblock
1749 *
1750 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
1751 * See gfs2_glockd()
1752 */
1753
1754static int gfs2_scand(void *data)
1755{
1756 unsigned x;
1757 unsigned delay;
1758
1759 while (!kthread_should_stop()) {
1760 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1761 examine_bucket(scan_glock, NULL, x);
1762 if (freezing(current))
1763 refrigerator();
1764 delay = scand_secs;
1765 if (delay < 1)
1766 delay = 1;
1767 schedule_timeout_interruptible(delay * HZ);
1768 }
1769
1770 return 0;
1771}
1772
1773
1774 1790
1775int __init gfs2_glock_init(void) 1791int __init gfs2_glock_init(void)
1776{ 1792{
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void)
1784 } 1800 }
1785#endif 1801#endif
1786 1802
1787 scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
1788 if (IS_ERR(scand_process))
1789 return PTR_ERR(scand_process);
1790
1791 glock_workqueue = create_workqueue("glock_workqueue"); 1803 glock_workqueue = create_workqueue("glock_workqueue");
1792 if (IS_ERR(glock_workqueue)) { 1804 if (IS_ERR(glock_workqueue))
1793 kthread_stop(scand_process);
1794 return PTR_ERR(glock_workqueue); 1805 return PTR_ERR(glock_workqueue);
1795 } 1806
1807 register_shrinker(&glock_shrinker);
1796 1808
1797 return 0; 1809 return 0;
1798} 1810}
1799 1811
1800void gfs2_glock_exit(void) 1812void gfs2_glock_exit(void)
1801{ 1813{
1814 unregister_shrinker(&glock_shrinker);
1802 destroy_workqueue(glock_workqueue); 1815 destroy_workqueue(glock_workqueue);
1803 kthread_stop(scand_process);
1804} 1816}
1805 1817
1806module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
1807MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
1808
1809static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1818static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1810{ 1819{
1811 struct gfs2_glock *gl; 1820 struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b193611..543ec7ecfbda 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
129void gfs2_lvb_unhold(struct gfs2_glock *gl); 129void gfs2_lvb_unhold(struct gfs2_glock *gl);
130 130
131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
132void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
133void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 132void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
134void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 133void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
134void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
135 135
136int __init gfs2_glock_init(void); 136int __init gfs2_glock_init(void);
137void gfs2_glock_exit(void); 137void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f6..8522d3aa64fc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
201 * Returns: 1 if it's ok 201 * Returns: 1 if it's ok
202 */ 202 */
203 203
204static int inode_go_demote_ok(struct gfs2_glock *gl) 204static int inode_go_demote_ok(const struct gfs2_glock *gl)
205{ 205{
206 struct gfs2_sbd *sdp = gl->gl_sbd; 206 struct gfs2_sbd *sdp = gl->gl_sbd;
207 int demote = 0; 207 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
208 208 return 0;
209 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) 209 return 1;
210 demote = 1;
211 else if (!sdp->sd_args.ar_localcaching &&
212 time_after_eq(jiffies, gl->gl_stamp +
213 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
214 demote = 1;
215
216 return demote;
217} 210}
218 211
219/** 212/**
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
227static int inode_go_lock(struct gfs2_holder *gh) 220static int inode_go_lock(struct gfs2_holder *gh)
228{ 221{
229 struct gfs2_glock *gl = gh->gh_gl; 222 struct gfs2_glock *gl = gh->gh_gl;
223 struct gfs2_sbd *sdp = gl->gl_sbd;
230 struct gfs2_inode *ip = gl->gl_object; 224 struct gfs2_inode *ip = gl->gl_object;
231 int error = 0; 225 int error = 0;
232 226
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh)
239 return error; 233 return error;
240 } 234 }
241 235
242 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && 236 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
243 (gl->gl_state == LM_ST_EXCLUSIVE) && 237 (gl->gl_state == LM_ST_EXCLUSIVE) &&
244 (gh->gh_state == LM_ST_EXCLUSIVE)) 238 (gh->gh_state == LM_ST_EXCLUSIVE)) {
245 error = gfs2_truncatei_resume(ip); 239 spin_lock(&sdp->sd_trunc_lock);
240 if (list_empty(&ip->i_trunc_list))
241 list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
242 spin_unlock(&sdp->sd_trunc_lock);
243 wake_up(&sdp->sd_quota_wait);
244 return 1;
245 }
246 246
247 return error; 247 return error;
248} 248}
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
260 const struct gfs2_inode *ip = gl->gl_object; 260 const struct gfs2_inode *ip = gl->gl_object;
261 if (ip == NULL) 261 if (ip == NULL)
262 return 0; 262 return 0;
263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", 263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
264 (unsigned long long)ip->i_no_formal_ino, 264 (unsigned long long)ip->i_no_formal_ino,
265 (unsigned long long)ip->i_no_addr, 265 (unsigned long long)ip->i_no_addr,
266 IF2DT(ip->i_inode.i_mode), ip->i_flags); 266 IF2DT(ip->i_inode.i_mode), ip->i_flags,
267 (unsigned int)ip->i_diskflags,
268 (unsigned long long)ip->i_inode.i_size,
269 (unsigned long long)ip->i_disksize);
267 return 0; 270 return 0;
268} 271}
269 272
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
274 * Returns: 1 if it's ok 277 * Returns: 1 if it's ok
275 */ 278 */
276 279
277static int rgrp_go_demote_ok(struct gfs2_glock *gl) 280static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
278{ 281{
279 return !gl->gl_aspace->i_mapping->nrpages; 282 return !gl->gl_aspace->i_mapping->nrpages;
280} 283}
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
318 const struct gfs2_rgrpd *rgd = gl->gl_object; 321 const struct gfs2_rgrpd *rgd = gl->gl_object;
319 if (rgd == NULL) 322 if (rgd == NULL)
320 return 0; 323 return 0;
321 gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); 324 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
325 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
326 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
322 return 0; 327 return 0;
323} 328}
324 329
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
374} 379}
375 380
376/** 381/**
382 * trans_go_demote_ok
383 * @gl: the glock
384 *
385 * Always returns 0
386 */
387
388static int trans_go_demote_ok(const struct gfs2_glock *gl)
389{
390 return 0;
391}
392
393/**
377 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock 394 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
378 * @gl: the glock 395 * @gl: the glock
379 * 396 *
380 * Returns: 1 if it's ok 397 * Returns: 1 if it's ok
381 */ 398 */
382 399
383static int quota_go_demote_ok(struct gfs2_glock *gl) 400static int quota_go_demote_ok(const struct gfs2_glock *gl)
384{ 401{
385 return !atomic_read(&gl->gl_lvb_count); 402 return !atomic_read(&gl->gl_lvb_count);
386} 403}
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
414const struct gfs2_glock_operations gfs2_trans_glops = { 431const struct gfs2_glock_operations gfs2_trans_glops = {
415 .go_xmote_th = trans_go_sync, 432 .go_xmote_th = trans_go_sync,
416 .go_xmote_bh = trans_go_xmote_bh, 433 .go_xmote_bh = trans_go_xmote_bh,
434 .go_demote_ok = trans_go_demote_ok,
417 .go_type = LM_TYPE_NONDISK, 435 .go_type = LM_TYPE_NONDISK,
418}; 436};
419 437
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8e..608849d00021 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,12 +68,6 @@ struct gfs2_bitmap {
68 u32 bi_len; 68 u32 bi_len;
69}; 69};
70 70
71struct gfs2_rgrp_host {
72 u32 rg_free;
73 u32 rg_dinodes;
74 u64 rg_igeneration;
75};
76
77struct gfs2_rgrpd { 71struct gfs2_rgrpd {
78 struct list_head rd_list; /* Link with superblock */ 72 struct list_head rd_list; /* Link with superblock */
79 struct list_head rd_list_mru; 73 struct list_head rd_list_mru;
@@ -83,14 +77,16 @@ struct gfs2_rgrpd {
83 u32 rd_length; /* length of rgrp header in fs blocks */ 77 u32 rd_length; /* length of rgrp header in fs blocks */
84 u32 rd_data; /* num of data blocks in rgrp */ 78 u32 rd_data; /* num of data blocks in rgrp */
85 u32 rd_bitbytes; /* number of bytes in data bitmaps */ 79 u32 rd_bitbytes; /* number of bytes in data bitmaps */
86 struct gfs2_rgrp_host rd_rg; 80 u32 rd_free;
81 u32 rd_free_clone;
82 u32 rd_dinodes;
83 u64 rd_igeneration;
87 struct gfs2_bitmap *rd_bits; 84 struct gfs2_bitmap *rd_bits;
88 unsigned int rd_bh_count;
89 struct mutex rd_mutex; 85 struct mutex rd_mutex;
90 u32 rd_free_clone;
91 struct gfs2_log_element rd_le; 86 struct gfs2_log_element rd_le;
92 u32 rd_last_alloc;
93 struct gfs2_sbd *rd_sbd; 87 struct gfs2_sbd *rd_sbd;
88 unsigned int rd_bh_count;
89 u32 rd_last_alloc;
94 unsigned char rd_flags; 90 unsigned char rd_flags;
95#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ 91#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */
96#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ 92#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */
@@ -129,7 +125,7 @@ struct gfs2_glock_operations {
129 void (*go_xmote_th) (struct gfs2_glock *gl); 125 void (*go_xmote_th) (struct gfs2_glock *gl);
130 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 126 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
131 void (*go_inval) (struct gfs2_glock *gl, int flags); 127 void (*go_inval) (struct gfs2_glock *gl, int flags);
132 int (*go_demote_ok) (struct gfs2_glock *gl); 128 int (*go_demote_ok) (const struct gfs2_glock *gl);
133 int (*go_lock) (struct gfs2_holder *gh); 129 int (*go_lock) (struct gfs2_holder *gh);
134 void (*go_unlock) (struct gfs2_holder *gh); 130 void (*go_unlock) (struct gfs2_holder *gh);
135 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 131 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -159,7 +155,6 @@ struct gfs2_holder {
159 155
160enum { 156enum {
161 GLF_LOCK = 1, 157 GLF_LOCK = 1,
162 GLF_STICKY = 2,
163 GLF_DEMOTE = 3, 158 GLF_DEMOTE = 3,
164 GLF_PENDING_DEMOTE = 4, 159 GLF_PENDING_DEMOTE = 4,
165 GLF_DEMOTE_IN_PROGRESS = 5, 160 GLF_DEMOTE_IN_PROGRESS = 5,
@@ -194,7 +189,7 @@ struct gfs2_glock {
194 unsigned long gl_tchange; 189 unsigned long gl_tchange;
195 void *gl_object; 190 void *gl_object;
196 191
197 struct list_head gl_reclaim; 192 struct list_head gl_lru;
198 193
199 struct gfs2_sbd *gl_sbd; 194 struct gfs2_sbd *gl_sbd;
200 195
@@ -233,29 +228,24 @@ enum {
233 GIF_USER = 4, /* user inode, not metadata addr space */ 228 GIF_USER = 4, /* user inode, not metadata addr space */
234}; 229};
235 230
236struct gfs2_dinode_host {
237 u64 di_size; /* number of bytes in file */
238 u64 di_generation; /* generation number for NFS */
239 u32 di_flags; /* GFS2_DIF_... */
240 /* These only apply to directories */
241 u32 di_entries; /* The number of entries in the directory */
242 u64 di_eattr; /* extended attribute block number */
243};
244 231
245struct gfs2_inode { 232struct gfs2_inode {
246 struct inode i_inode; 233 struct inode i_inode;
247 u64 i_no_addr; 234 u64 i_no_addr;
248 u64 i_no_formal_ino; 235 u64 i_no_formal_ino;
236 u64 i_generation;
237 u64 i_eattr;
238 loff_t i_disksize;
249 unsigned long i_flags; /* GIF_... */ 239 unsigned long i_flags; /* GIF_... */
250
251 struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
252
253 struct gfs2_glock *i_gl; /* Move into i_gh? */ 240 struct gfs2_glock *i_gl; /* Move into i_gh? */
254 struct gfs2_holder i_iopen_gh; 241 struct gfs2_holder i_iopen_gh;
255 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 242 struct gfs2_holder i_gh; /* for prepare/commit_write only */
256 struct gfs2_alloc *i_alloc; 243 struct gfs2_alloc *i_alloc;
257 u64 i_goal; /* goal block for allocations */ 244 u64 i_goal; /* goal block for allocations */
258 struct rw_semaphore i_rw_mutex; 245 struct rw_semaphore i_rw_mutex;
246 struct list_head i_trunc_list;
247 u32 i_entries;
248 u32 i_diskflags;
259 u8 i_height; 249 u8 i_height;
260 u8 i_depth; 250 u8 i_depth;
261}; 251};
@@ -406,13 +396,11 @@ struct gfs2_args {
406struct gfs2_tune { 396struct gfs2_tune {
407 spinlock_t gt_spin; 397 spinlock_t gt_spin;
408 398
409 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
410 unsigned int gt_incore_log_blocks; 399 unsigned int gt_incore_log_blocks;
411 unsigned int gt_log_flush_secs; 400 unsigned int gt_log_flush_secs;
412 401
413 unsigned int gt_recoverd_secs; 402 unsigned int gt_recoverd_secs;
414 unsigned int gt_logd_secs; 403 unsigned int gt_logd_secs;
415 unsigned int gt_quotad_secs;
416 404
417 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 405 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
418 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ 406 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -488,10 +476,6 @@ struct gfs2_sbd {
488 /* Lock Stuff */ 476 /* Lock Stuff */
489 477
490 struct lm_lockstruct sd_lockstruct; 478 struct lm_lockstruct sd_lockstruct;
491 struct list_head sd_reclaim_list;
492 spinlock_t sd_reclaim_lock;
493 wait_queue_head_t sd_reclaim_wq;
494 atomic_t sd_reclaim_count;
495 struct gfs2_holder sd_live_gh; 479 struct gfs2_holder sd_live_gh;
496 struct gfs2_glock *sd_rename_gl; 480 struct gfs2_glock *sd_rename_gl;
497 struct gfs2_glock *sd_trans_gl; 481 struct gfs2_glock *sd_trans_gl;
@@ -519,7 +503,6 @@ struct gfs2_sbd {
519 spinlock_t sd_statfs_spin; 503 spinlock_t sd_statfs_spin;
520 struct gfs2_statfs_change_host sd_statfs_master; 504 struct gfs2_statfs_change_host sd_statfs_master;
521 struct gfs2_statfs_change_host sd_statfs_local; 505 struct gfs2_statfs_change_host sd_statfs_local;
522 unsigned long sd_statfs_sync_time;
523 506
524 /* Resource group stuff */ 507 /* Resource group stuff */
525 508
@@ -552,8 +535,6 @@ struct gfs2_sbd {
552 struct task_struct *sd_recoverd_process; 535 struct task_struct *sd_recoverd_process;
553 struct task_struct *sd_logd_process; 536 struct task_struct *sd_logd_process;
554 struct task_struct *sd_quotad_process; 537 struct task_struct *sd_quotad_process;
555 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
556 unsigned int sd_glockd_num;
557 538
558 /* Quota stuff */ 539 /* Quota stuff */
559 540
@@ -561,13 +542,15 @@ struct gfs2_sbd {
561 atomic_t sd_quota_count; 542 atomic_t sd_quota_count;
562 spinlock_t sd_quota_spin; 543 spinlock_t sd_quota_spin;
563 struct mutex sd_quota_mutex; 544 struct mutex sd_quota_mutex;
545 wait_queue_head_t sd_quota_wait;
546 struct list_head sd_trunc_list;
547 spinlock_t sd_trunc_lock;
564 548
565 unsigned int sd_quota_slots; 549 unsigned int sd_quota_slots;
566 unsigned int sd_quota_chunks; 550 unsigned int sd_quota_chunks;
567 unsigned char **sd_quota_bitmap; 551 unsigned char **sd_quota_bitmap;
568 552
569 u64 sd_quota_sync_gen; 553 u64 sd_quota_sync_gen;
570 unsigned long sd_quota_sync_time;
571 554
572 /* Log stuff */ 555 /* Log stuff */
573 556
@@ -624,10 +607,6 @@ struct gfs2_sbd {
624 struct mutex sd_freeze_lock; 607 struct mutex sd_freeze_lock;
625 unsigned int sd_freeze_count; 608 unsigned int sd_freeze_count;
626 609
627 /* Counters */
628
629 atomic_t sd_reclaimed;
630
631 char sd_fsname[GFS2_FSNAME_LEN]; 610 char sd_fsname[GFS2_FSNAME_LEN];
632 char sd_table_name[GFS2_FSNAME_LEN]; 611 char sd_table_name[GFS2_FSNAME_LEN];
633 char sd_proto_name[GFS2_FSNAME_LEN]; 612 char sd_proto_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e89..3b87c188da41 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
32#include "log.h" 32#include "log.h"
33#include "meta_io.h" 33#include "meta_io.h"
34#include "ops_address.h" 34#include "ops_address.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
@@ -248,7 +247,6 @@ fail:
248 247
249static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 248static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
250{ 249{
251 struct gfs2_dinode_host *di = &ip->i_di;
252 const struct gfs2_dinode *str = buf; 250 const struct gfs2_dinode *str = buf;
253 struct timespec atime; 251 struct timespec atime;
254 u16 height, depth; 252 u16 height, depth;
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
274 * to do that. 272 * to do that.
275 */ 273 */
276 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 274 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
277 di->di_size = be64_to_cpu(str->di_size); 275 ip->i_disksize = be64_to_cpu(str->di_size);
278 i_size_write(&ip->i_inode, di->di_size); 276 i_size_write(&ip->i_inode, ip->i_disksize);
279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
280 atime.tv_sec = be64_to_cpu(str->di_atime); 278 atime.tv_sec = be64_to_cpu(str->di_atime);
281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 279 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
287 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); 285 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
288 286
289 ip->i_goal = be64_to_cpu(str->di_goal_meta); 287 ip->i_goal = be64_to_cpu(str->di_goal_meta);
290 di->di_generation = be64_to_cpu(str->di_generation); 288 ip->i_generation = be64_to_cpu(str->di_generation);
291 289
292 di->di_flags = be32_to_cpu(str->di_flags); 290 ip->i_diskflags = be32_to_cpu(str->di_flags);
293 gfs2_set_inode_flags(&ip->i_inode); 291 gfs2_set_inode_flags(&ip->i_inode);
294 height = be16_to_cpu(str->di_height); 292 height = be16_to_cpu(str->di_height);
295 if (unlikely(height > GFS2_MAX_META_HEIGHT)) 293 if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
300 if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) 298 if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
301 goto corrupt; 299 goto corrupt;
302 ip->i_depth = (u8)depth; 300 ip->i_depth = (u8)depth;
303 di->di_entries = be32_to_cpu(str->di_entries); 301 ip->i_entries = be32_to_cpu(str->di_entries);
304 302
305 di->di_eattr = be64_to_cpu(str->di_eattr); 303 ip->i_eattr = be64_to_cpu(str->di_eattr);
306 if (S_ISREG(ip->i_inode.i_mode)) 304 if (S_ISREG(ip->i_inode.i_mode))
307 gfs2_set_aops(&ip->i_inode); 305 gfs2_set_aops(&ip->i_inode);
308 306
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
388 gfs2_free_di(rgd, ip); 386 gfs2_free_di(rgd, ip);
389 387
390 gfs2_trans_end(sdp); 388 gfs2_trans_end(sdp);
391 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
392 389
393out_rg_gunlock: 390out_rg_gunlock:
394 gfs2_glock_dq_uninit(&al->al_rgd_gh); 391 gfs2_glock_dq_uninit(&al->al_rgd_gh);
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
690 return error; 687 return error;
691 } 688 }
692 689
693 if (dip->i_di.di_entries == (u32)-1) 690 if (dip->i_entries == (u32)-1)
694 return -EFBIG; 691 return -EFBIG;
695 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) 692 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
696 return -EMLINK; 693 return -EMLINK;
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
790 di->di_flags = 0; 787 di->di_flags = 0;
791 788
792 if (S_ISREG(mode)) { 789 if (S_ISREG(mode)) {
793 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || 790 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
794 gfs2_tune_get(sdp, gt_new_files_jdata)) 791 gfs2_tune_get(sdp, gt_new_files_jdata))
795 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); 792 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
796 } else if (S_ISDIR(mode)) { 793 } else if (S_ISDIR(mode)) {
797 di->di_flags |= cpu_to_be32(dip->i_di.di_flags & 794 di->di_flags |= cpu_to_be32(dip->i_diskflags &
798 GFS2_DIF_INHERIT_JDATA); 795 GFS2_DIF_INHERIT_JDATA);
799 } 796 }
800 797
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
1068 struct qstr dotname; 1065 struct qstr dotname;
1069 int error; 1066 int error;
1070 1067
1071 if (ip->i_di.di_entries != 2) { 1068 if (ip->i_entries != 2) {
1072 if (gfs2_consist_inode(ip)) 1069 if (gfs2_consist_inode(ip))
1073 gfs2_dinode_print(ip); 1070 gfs2_dinode_print(ip);
1074 return -EIO; 1071 return -EIO;
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1168 return error; 1165 return error;
1169 } 1166 }
1170 1167
1171 if (!ip->i_di.di_size) { 1168 if (!ip->i_disksize) {
1172 gfs2_consist_inode(ip); 1169 gfs2_consist_inode(ip);
1173 error = -EIO; 1170 error = -EIO;
1174 goto out; 1171 goto out;
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1178 if (error) 1175 if (error)
1179 goto out; 1176 goto out;
1180 1177
1181 x = ip->i_di.di_size + 1; 1178 x = ip->i_disksize + 1;
1182 if (x > *len) { 1179 if (x > *len) {
1183 *buf = kmalloc(x, GFP_NOFS); 1180 *buf = kmalloc(x, GFP_NOFS);
1184 if (!*buf) { 1181 if (!*buf) {
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1242 1239
1243void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) 1240void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1244{ 1241{
1245 const struct gfs2_dinode_host *di = &ip->i_di;
1246 struct gfs2_dinode *str = buf; 1242 struct gfs2_dinode *str = buf;
1247 1243
1248 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 1244 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1256 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1252 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1257 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1253 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1258 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1254 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1259 str->di_size = cpu_to_be64(di->di_size); 1255 str->di_size = cpu_to_be64(ip->i_disksize);
1260 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1256 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1261 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1257 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1262 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1258 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1264 1260
1265 str->di_goal_meta = cpu_to_be64(ip->i_goal); 1261 str->di_goal_meta = cpu_to_be64(ip->i_goal);
1266 str->di_goal_data = cpu_to_be64(ip->i_goal); 1262 str->di_goal_data = cpu_to_be64(ip->i_goal);
1267 str->di_generation = cpu_to_be64(di->di_generation); 1263 str->di_generation = cpu_to_be64(ip->i_generation);
1268 1264
1269 str->di_flags = cpu_to_be32(di->di_flags); 1265 str->di_flags = cpu_to_be32(ip->i_diskflags);
1270 str->di_height = cpu_to_be16(ip->i_height); 1266 str->di_height = cpu_to_be16(ip->i_height);
1271 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && 1267 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
1272 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? 1268 !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
1273 GFS2_FORMAT_DE : 0); 1269 GFS2_FORMAT_DE : 0);
1274 str->di_depth = cpu_to_be16(ip->i_depth); 1270 str->di_depth = cpu_to_be16(ip->i_depth);
1275 str->di_entries = cpu_to_be32(di->di_entries); 1271 str->di_entries = cpu_to_be32(ip->i_entries);
1276 1272
1277 str->di_eattr = cpu_to_be64(di->di_eattr); 1273 str->di_eattr = cpu_to_be64(ip->i_eattr);
1278 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); 1274 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1279 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); 1275 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
1280 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); 1276 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1282 1278
1283void gfs2_dinode_print(const struct gfs2_inode *ip) 1279void gfs2_dinode_print(const struct gfs2_inode *ip)
1284{ 1280{
1285 const struct gfs2_dinode_host *di = &ip->i_di;
1286
1287 printk(KERN_INFO " no_formal_ino = %llu\n", 1281 printk(KERN_INFO " no_formal_ino = %llu\n",
1288 (unsigned long long)ip->i_no_formal_ino); 1282 (unsigned long long)ip->i_no_formal_ino);
1289 printk(KERN_INFO " no_addr = %llu\n", 1283 printk(KERN_INFO " no_addr = %llu\n",
1290 (unsigned long long)ip->i_no_addr); 1284 (unsigned long long)ip->i_no_addr);
1291 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); 1285 printk(KERN_INFO " i_disksize = %llu\n",
1286 (unsigned long long)ip->i_disksize);
1292 printk(KERN_INFO " blocks = %llu\n", 1287 printk(KERN_INFO " blocks = %llu\n",
1293 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1288 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1294 printk(KERN_INFO " i_goal = %llu\n", 1289 printk(KERN_INFO " i_goal = %llu\n",
1295 (unsigned long long)ip->i_goal); 1290 (unsigned long long)ip->i_goal);
1296 printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); 1291 printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags);
1297 printk(KERN_INFO " i_height = %u\n", ip->i_height); 1292 printk(KERN_INFO " i_height = %u\n", ip->i_height);
1298 printk(KERN_INFO " i_depth = %u\n", ip->i_depth); 1293 printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
1299 printk(KERN_INFO " di_entries = %u\n", di->di_entries); 1294 printk(KERN_INFO " i_entries = %u\n", ip->i_entries);
1300 printk(KERN_INFO " di_eattr = %llu\n", 1295 printk(KERN_INFO " i_eattr = %llu\n",
1301 (unsigned long long)di->di_eattr); 1296 (unsigned long long)ip->i_eattr);
1302} 1297}
1303 1298
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a0..d5329364cdff 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
10#ifndef __INODE_DOT_H__ 10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include <linux/fs.h>
13#include "util.h" 14#include "util.h"
14 15
15static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 16static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
19 20
20static inline int gfs2_is_jdata(const struct gfs2_inode *ip) 21static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
21{ 22{
22 return ip->i_di.di_flags & GFS2_DIF_JDATA; 23 return ip->i_diskflags & GFS2_DIF_JDATA;
23} 24}
24 25
25static inline int gfs2_is_writeback(const struct gfs2_inode *ip) 26static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 98void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
98void gfs2_dinode_print(const struct gfs2_inode *ip); 99void gfs2_dinode_print(const struct gfs2_inode *ip);
99 100
101extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops;
103extern const struct inode_operations gfs2_symlink_iops;
104extern const struct file_operations gfs2_file_fops;
105extern const struct file_operations gfs2_dir_fops;
106extern const struct file_operations gfs2_file_fops_nolock;
107extern const struct file_operations gfs2_dir_fops_nolock;
108
109extern void gfs2_set_inode_flags(struct inode *inode);
110
100#endif /* __INODE_DOT_H__ */ 111#endif /* __INODE_DOT_H__ */
101 112
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c8285..1aa7eb6a0226 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
194static void gdlm_recovery_done(void *lockspace, unsigned int jid, 194static void gdlm_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message) 195 unsigned int message)
196{ 196{
197 char env_jid[20];
198 char env_status[20];
199 char *envp[] = { env_jid, env_status, NULL };
197 struct gdlm_ls *ls = lockspace; 200 struct gdlm_ls *ls = lockspace;
198 ls->recover_jid_done = jid; 201 ls->recover_jid_done = jid;
199 ls->recover_jid_status = message; 202 ls->recover_jid_status = message;
200 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 203 sprintf(env_jid, "JID=%d", jid);
204 sprintf(env_status, "RECOVERY=%s",
205 message == LM_RD_SUCCESS ? "Done" : "Failed");
206 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
201} 207}
202 208
203static void gdlm_others_may_mount(void *lockspace) 209static void gdlm_others_may_mount(void *lockspace)
204{ 210{
211 char *message = "FIRSTMOUNT=Done";
212 char *envp[] = { message, NULL };
205 struct gdlm_ls *ls = lockspace; 213 struct gdlm_ls *ls = lockspace;
206 ls->first_done = 1; 214 ls->first_done = 1;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 215 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
208} 216}
209 217
210/* Userspace gets the offline uevent, blocks new gfs locks on 218/* Userspace gets the offline uevent, blocks new gfs locks on
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a9..9b7edcf7bd49 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
195 kobject_put(&ls->kobj); 195 kobject_put(&ls->kobj);
196} 196}
197 197
198static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
199 struct kobj_uevent_env *env)
200{
201 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
202 add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
203 add_uevent_var(env, "LOCKPROTO=lock_dlm");
204 return 0;
205}
206
207static struct kset_uevent_ops gdlm_uevent_ops = {
208 .uevent = gdlm_uevent,
209};
210
211
198int gdlm_sysfs_init(void) 212int gdlm_sysfs_init(void)
199{ 213{
200 gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); 214 gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
201 if (!gdlm_kset) { 215 if (!gdlm_kset) {
202 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
203 return -ENOMEM; 217 return -ENOMEM;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac29..7cacfde32194 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
19 19
20#include "gfs2.h" 20#include "gfs2.h"
21#include "incore.h" 21#include "incore.h"
22#include "ops_fstype.h" 22#include "super.h"
23#include "sys.h" 23#include "sys.h"
24#include "util.h" 24#include "util.h"
25#include "glock.h" 25#include "glock.h"
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
30 30
31 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
32 init_rwsem(&ip->i_rw_mutex); 32 init_rwsem(&ip->i_rw_mutex);
33 INIT_LIST_HEAD(&ip->i_trunc_list);
33 ip->i_alloc = NULL; 34 ip->i_alloc = NULL;
34} 35}
35 36
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
42 INIT_LIST_HEAD(&gl->gl_holders); 43 INIT_LIST_HEAD(&gl->gl_holders);
43 gl->gl_lvb = NULL; 44 gl->gl_lvb = NULL;
44 atomic_set(&gl->gl_lvb_count, 0); 45 atomic_set(&gl->gl_lvb_count, 0);
45 INIT_LIST_HEAD(&gl->gl_reclaim); 46 INIT_LIST_HEAD(&gl->gl_lru);
46 INIT_LIST_HEAD(&gl->gl_ail_list); 47 INIT_LIST_HEAD(&gl->gl_ail_list);
47 atomic_set(&gl->gl_ail_count, 0); 48 atomic_set(&gl->gl_ail_count, 0);
48} 49}
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void)
93 if (!gfs2_rgrpd_cachep) 94 if (!gfs2_rgrpd_cachep)
94 goto fail; 95 goto fail;
95 96
97 gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
98 sizeof(struct gfs2_quota_data),
99 0, 0, NULL);
100 if (!gfs2_quotad_cachep)
101 goto fail;
102
96 error = register_filesystem(&gfs2_fs_type); 103 error = register_filesystem(&gfs2_fs_type);
97 if (error) 104 if (error)
98 goto fail; 105 goto fail;
@@ -112,6 +119,9 @@ fail_unregister:
112fail: 119fail:
113 gfs2_glock_exit(); 120 gfs2_glock_exit();
114 121
122 if (gfs2_quotad_cachep)
123 kmem_cache_destroy(gfs2_quotad_cachep);
124
115 if (gfs2_rgrpd_cachep) 125 if (gfs2_rgrpd_cachep)
116 kmem_cache_destroy(gfs2_rgrpd_cachep); 126 kmem_cache_destroy(gfs2_rgrpd_cachep);
117 127
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void)
140 unregister_filesystem(&gfs2_fs_type); 150 unregister_filesystem(&gfs2_fs_type);
141 unregister_filesystem(&gfs2meta_fs_type); 151 unregister_filesystem(&gfs2meta_fs_type);
142 152
153 kmem_cache_destroy(gfs2_quotad_cachep);
143 kmem_cache_destroy(gfs2_rgrpd_cachep); 154 kmem_cache_destroy(gfs2_rgrpd_cachep);
144 kmem_cache_destroy(gfs2_bufdata_cachep); 155 kmem_cache_destroy(gfs2_bufdata_cachep);
145 kmem_cache_destroy(gfs2_inode_cachep); 156 kmem_cache_destroy(gfs2_inode_cachep);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cfa..3cb0a44ba023 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
32 Opt_debug, 32 Opt_debug,
33 Opt_nodebug, 33 Opt_nodebug,
34 Opt_upgrade, 34 Opt_upgrade,
35 Opt_num_glockd,
36 Opt_acl, 35 Opt_acl,
37 Opt_noacl, 36 Opt_noacl,
38 Opt_quota_off, 37 Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
57 {Opt_debug, "debug"}, 56 {Opt_debug, "debug"},
58 {Opt_nodebug, "nodebug"}, 57 {Opt_nodebug, "nodebug"},
59 {Opt_upgrade, "upgrade"}, 58 {Opt_upgrade, "upgrade"},
60 {Opt_num_glockd, "num_glockd=%d"},
61 {Opt_acl, "acl"}, 59 {Opt_acl, "acl"},
62 {Opt_noacl, "noacl"}, 60 {Opt_noacl, "noacl"},
63 {Opt_quota_off, "quota=off"}, 61 {Opt_quota_off, "quota=off"},
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
87 int error = 0; 85 int error = 0;
88 86
89 if (!remount) { 87 if (!remount) {
90 /* If someone preloaded options, use those instead */
91 spin_lock(&gfs2_sys_margs_lock);
92 if (gfs2_sys_margs) {
93 data = gfs2_sys_margs;
94 gfs2_sys_margs = NULL;
95 }
96 spin_unlock(&gfs2_sys_margs_lock);
97
98 /* Set some defaults */ 88 /* Set some defaults */
99 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
100 args->ar_quota = GFS2_QUOTA_DEFAULT; 89 args->ar_quota = GFS2_QUOTA_DEFAULT;
101 args->ar_data = GFS2_DATA_DEFAULT; 90 args->ar_data = GFS2_DATA_DEFAULT;
102 } 91 }
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
105 process them */ 94 process them */
106 95
107 for (options = data; (o = strsep(&options, ",")); ) { 96 for (options = data; (o = strsep(&options, ",")); ) {
108 int token, option; 97 int token;
109 substring_t tmp[MAX_OPT_ARGS]; 98 substring_t tmp[MAX_OPT_ARGS];
110 99
111 if (!*o) 100 if (!*o)
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
196 goto cant_remount; 185 goto cant_remount;
197 args->ar_upgrade = 1; 186 args->ar_upgrade = 1;
198 break; 187 break;
199 case Opt_num_glockd:
200 if ((error = match_int(&tmp[0], &option))) {
201 fs_info(sdp, "problem getting num_glockd\n");
202 goto out_error;
203 }
204
205 if (remount && option != args->ar_num_glockd)
206 goto cant_remount;
207 if (!option || option > GFS2_GLOCKD_MAX) {
208 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
209 GFS2_GLOCKD_MAX, option);
210 error = -EINVAL;
211 goto out_error;
212 }
213 args->ar_num_glockd = option;
214 break;
215 case Opt_acl: 188 case Opt_acl:
216 args->ar_posix_acl = 1; 189 args->ar_posix_acl = 1;
217 sdp->sd_vfs->s_flags |= MS_POSIXACL; 190 sdp->sd_vfs->s_flags |= MS_POSIXACL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 15f710f2d4da..4ddab67867eb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
210{ 210{
211 struct inode *inode = page->mapping->host; 211 struct inode *inode = page->mapping->host;
212 struct gfs2_sbd *sdp = GFS2_SB(inode); 212 struct gfs2_sbd *sdp = GFS2_SB(inode);
213 int error; 213 int ret;
214 int done_trans = 0; 214 int done_trans = 0;
215 215
216 error = gfs2_writepage_common(page, wbc);
217 if (error <= 0)
218 return error;
219
220 if (PageChecked(page)) { 216 if (PageChecked(page)) {
221 if (wbc->sync_mode != WB_SYNC_ALL) 217 if (wbc->sync_mode != WB_SYNC_ALL)
222 goto out_ignore; 218 goto out_ignore;
223 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); 219 ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
224 if (error) 220 if (ret)
225 goto out_ignore; 221 goto out_ignore;
226 done_trans = 1; 222 done_trans = 1;
227 } 223 }
228 error = __gfs2_jdata_writepage(page, wbc); 224 ret = gfs2_writepage_common(page, wbc);
225 if (ret > 0)
226 ret = __gfs2_jdata_writepage(page, wbc);
229 if (done_trans) 227 if (done_trans)
230 gfs2_trans_end(sdp); 228 gfs2_trans_end(sdp);
231 return error; 229 return ret;
232 230
233out_ignore: 231out_ignore:
234 redirty_page_for_writepage(wbc, page); 232 redirty_page_for_writepage(wbc, page);
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
453 451
454 kaddr = kmap_atomic(page, KM_USER0); 452 kaddr = kmap_atomic(page, KM_USER0);
455 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 453 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
456 ip->i_di.di_size); 454 ip->i_disksize);
457 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); 455 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
458 kunmap_atomic(kaddr, KM_USER0); 456 kunmap_atomic(kaddr, KM_USER0);
459 flush_dcache_page(page); 457 flush_dcache_page(page);
460 brelse(dibh); 458 brelse(dibh);
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
627{ 625{
628 struct gfs2_inode *ip = GFS2_I(mapping->host); 626 struct gfs2_inode *ip = GFS2_I(mapping->host);
629 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 627 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
630 unsigned int data_blocks, ind_blocks, rblocks; 628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
631 int alloc_required; 629 int alloc_required;
632 int error = 0; 630 int error = 0;
633 struct gfs2_alloc *al; 631 struct gfs2_alloc *al;
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
641 if (unlikely(error)) 639 if (unlikely(error))
642 goto out_uninit; 640 goto out_uninit;
643 641
644 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
645 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 642 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
646 if (error) 643 if (error)
647 goto out_unlock; 644 goto out_unlock;
648 645
646 if (alloc_required || gfs2_is_jdata(ip))
647 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
648
649 if (alloc_required) { 649 if (alloc_required) {
650 al = gfs2_alloc_get(ip); 650 al = gfs2_alloc_get(ip);
651 if (!al) { 651 if (!al) {
@@ -675,6 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
675 goto out_trans_fail; 675 goto out_trans_fail;
676 676
677 error = -ENOMEM; 677 error = -ENOMEM;
678 flags |= AOP_FLAG_NOFS;
678 page = grab_cache_page_write_begin(mapping, index, flags); 679 page = grab_cache_page_write_begin(mapping, index, flags);
679 *pagep = page; 680 *pagep = page;
680 if (unlikely(!page)) 681 if (unlikely(!page))
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
782 783
783 if (inode->i_size < to) { 784 if (inode->i_size < to) {
784 i_size_write(inode, to); 785 i_size_write(inode, to);
785 ip->i_di.di_size = inode->i_size; 786 ip->i_disksize = inode->i_size;
786 di->di_size = cpu_to_be64(inode->i_size); 787 di->di_size = cpu_to_be64(inode->i_size);
787 mark_inode_dirty(inode); 788 mark_inode_dirty(inode);
788 } 789 }
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
847 848
848 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
849 850
850 if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { 851 if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
851 di = (struct gfs2_dinode *)dibh->b_data; 852 di = (struct gfs2_dinode *)dibh->b_data;
852 ip->i_di.di_size = inode->i_size; 853 ip->i_disksize = inode->i_size;
853 di->di_size = cpu_to_be64(inode->i_size); 854 di->di_size = cpu_to_be64(inode->i_size);
854 mark_inode_dirty(inode); 855 mark_inode_dirty(inode);
855 } 856 }
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b4420..c2ad36330ca3 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
19#include "incore.h" 19#include "incore.h"
20#include "dir.h" 20#include "dir.h"
21#include "glock.h" 21#include "glock.h"
22#include "ops_dentry.h" 22#include "super.h"
23#include "util.h" 23#include "util.h"
24#include "inode.h" 24#include "inode.h"
25 25
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f5..000000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13#include <linux/dcache.h>
14
15extern struct dentry_operations gfs2_dops;
16
17#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a9..7fdeb14ddd1a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
22#include "glock.h" 22#include "glock.h"
23#include "glops.h" 23#include "glops.h"
24#include "inode.h" 24#include "inode.h"
25#include "ops_dentry.h" 25#include "super.h"
26#include "ops_fstype.h"
27#include "rgrp.h" 26#include "rgrp.h"
28#include "util.h" 27#include "util.h"
29 28
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
214 } 213 }
215 214
216 error = -EIO; 215 error = -EIO;
217 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { 216 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
218 iput(inode); 217 iput(inode);
219 goto fail; 218 goto fail;
220 } 219 }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e2188..93fe41b67f97 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
39#include "util.h" 39#include "util.h"
40#include "eaops.h" 40#include "eaops.h"
41#include "ops_address.h" 41#include "ops_address.h"
42#include "ops_inode.h"
43 42
44/** 43/**
45 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
158 if (error) 157 if (error)
159 return error; 158 return error;
160 159
161 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); 160 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
162 if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) 161 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
163 fsflags |= FS_JOURNAL_DATA_FL; 162 fsflags |= FS_JOURNAL_DATA_FL;
164 if (put_user(fsflags, ptr)) 163 if (put_user(fsflags, ptr))
165 error = -EFAULT; 164 error = -EFAULT;
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
172void gfs2_set_inode_flags(struct inode *inode) 171void gfs2_set_inode_flags(struct inode *inode)
173{ 172{
174 struct gfs2_inode *ip = GFS2_I(inode); 173 struct gfs2_inode *ip = GFS2_I(inode);
175 struct gfs2_dinode_host *di = &ip->i_di;
176 unsigned int flags = inode->i_flags; 174 unsigned int flags = inode->i_flags;
177 175
178 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 176 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
179 if (di->di_flags & GFS2_DIF_IMMUTABLE) 177 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
180 flags |= S_IMMUTABLE; 178 flags |= S_IMMUTABLE;
181 if (di->di_flags & GFS2_DIF_APPENDONLY) 179 if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
182 flags |= S_APPEND; 180 flags |= S_APPEND;
183 if (di->di_flags & GFS2_DIF_NOATIME) 181 if (ip->i_diskflags & GFS2_DIF_NOATIME)
184 flags |= S_NOATIME; 182 flags |= S_NOATIME;
185 if (di->di_flags & GFS2_DIF_SYNC) 183 if (ip->i_diskflags & GFS2_DIF_SYNC)
186 flags |= S_SYNC; 184 flags |= S_SYNC;
187 inode->i_flags = flags; 185 inode->i_flags = flags;
188} 186}
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 if (error) 219 if (error)
222 goto out_drop_write; 220 goto out_drop_write;
223 221
224 flags = ip->i_di.di_flags; 222 flags = ip->i_diskflags;
225 new_flags = (flags & ~mask) | (reqflags & mask); 223 new_flags = (flags & ~mask) | (reqflags & mask);
226 if ((new_flags ^ flags) == 0) 224 if ((new_flags ^ flags) == 0)
227 goto out; 225 goto out;
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
260 if (error) 258 if (error)
261 goto out_trans_end; 259 goto out_trans_end;
262 gfs2_trans_add_bh(ip->i_gl, bh, 1); 260 gfs2_trans_add_bh(ip->i_gl, bh, 1);
263 ip->i_di.di_flags = new_flags; 261 ip->i_diskflags = new_flags;
264 gfs2_dinode_out(ip, bh->b_data); 262 gfs2_dinode_out(ip, bh->b_data);
265 brelse(bh); 263 brelse(bh);
266 gfs2_set_inode_flags(inode); 264 gfs2_set_inode_flags(inode);
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
344 struct gfs2_inode *ip = GFS2_I(inode); 342 struct gfs2_inode *ip = GFS2_I(inode);
345 struct gfs2_sbd *sdp = GFS2_SB(inode); 343 struct gfs2_sbd *sdp = GFS2_SB(inode);
346 unsigned long last_index; 344 unsigned long last_index;
347 u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); 345 u64 pos = page->index << PAGE_CACHE_SHIFT;
348 unsigned int data_blocks, ind_blocks, rblocks; 346 unsigned int data_blocks, ind_blocks, rblocks;
349 int alloc_required = 0; 347 int alloc_required = 0;
350 struct gfs2_holder gh; 348 struct gfs2_holder gh;
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
357 goto out; 355 goto out;
358 356
359 set_bit(GIF_SW_PAGED, &ip->i_flags); 357 set_bit(GIF_SW_PAGED, &ip->i_flags);
360 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
361 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 358 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
362 if (ret || !alloc_required) 359 if (ret || !alloc_required)
363 goto out_unlock; 360 goto out_unlock;
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
369 ret = gfs2_quota_lock_check(ip); 366 ret = gfs2_quota_lock_check(ip);
370 if (ret) 367 if (ret)
371 goto out_alloc_put; 368 goto out_alloc_put;
369 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
372 al->al_requested = data_blocks + ind_blocks; 370 al->al_requested = data_blocks + ind_blocks;
373 ret = gfs2_inplace_reserve(ip); 371 ret = gfs2_inplace_reserve(ip);
374 if (ret) 372 if (ret)
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
479 goto fail; 477 goto fail;
480 478
481 if (!(file->f_flags & O_LARGEFILE) && 479 if (!(file->f_flags & O_LARGEFILE) &&
482 ip->i_di.di_size > MAX_NON_LFS) { 480 ip->i_disksize > MAX_NON_LFS) {
483 error = -EOVERFLOW; 481 error = -EOVERFLOW;
484 goto fail_gunlock; 482 goto fail_gunlock;
485 } 483 }
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f5..f91eebdde581 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,20 +22,18 @@
22#include "gfs2.h" 22#include "gfs2.h"
23#include "incore.h" 23#include "incore.h"
24#include "bmap.h" 24#include "bmap.h"
25#include "daemon.h"
26#include "glock.h" 25#include "glock.h"
27#include "glops.h" 26#include "glops.h"
28#include "inode.h" 27#include "inode.h"
29#include "mount.h" 28#include "mount.h"
30#include "ops_fstype.h"
31#include "ops_dentry.h"
32#include "ops_super.h"
33#include "recovery.h" 29#include "recovery.h"
34#include "rgrp.h" 30#include "rgrp.h"
35#include "super.h" 31#include "super.h"
36#include "sys.h" 32#include "sys.h"
37#include "util.h" 33#include "util.h"
38#include "log.h" 34#include "log.h"
35#include "quota.h"
36#include "dir.h"
39 37
40#define DO 0 38#define DO 0
41#define UNDO 1 39#define UNDO 1
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
58{ 56{
59 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
60 58
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024; 59 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60; 60 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60; 61 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1; 62 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64; 63 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10; 64 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1; 65 gt->gt_quota_scale_num = 1;
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
91 87
92 gfs2_tune_init(&sdp->sd_tune); 88 gfs2_tune_init(&sdp->sd_tune);
93 89
94 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
95 spin_lock_init(&sdp->sd_reclaim_lock);
96 init_waitqueue_head(&sdp->sd_reclaim_wq);
97
98 mutex_init(&sdp->sd_inum_mutex); 90 mutex_init(&sdp->sd_inum_mutex);
99 spin_lock_init(&sdp->sd_statfs_spin); 91 spin_lock_init(&sdp->sd_statfs_spin);
100 92
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
110 INIT_LIST_HEAD(&sdp->sd_quota_list); 102 INIT_LIST_HEAD(&sdp->sd_quota_list);
111 spin_lock_init(&sdp->sd_quota_spin); 103 spin_lock_init(&sdp->sd_quota_spin);
112 mutex_init(&sdp->sd_quota_mutex); 104 mutex_init(&sdp->sd_quota_mutex);
105 init_waitqueue_head(&sdp->sd_quota_wait);
106 INIT_LIST_HEAD(&sdp->sd_trunc_list);
107 spin_lock_init(&sdp->sd_trunc_lock);
113 108
114 spin_lock_init(&sdp->sd_log_lock); 109 spin_lock_init(&sdp->sd_log_lock);
115 110
@@ -443,24 +438,11 @@ out:
443static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, 438static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
444 int undo) 439 int undo)
445{ 440{
446 struct task_struct *p;
447 int error = 0; 441 int error = 0;
448 442
449 if (undo) 443 if (undo)
450 goto fail_trans; 444 goto fail_trans;
451 445
452 for (sdp->sd_glockd_num = 0;
453 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
454 sdp->sd_glockd_num++) {
455 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
456 error = IS_ERR(p);
457 if (error) {
458 fs_err(sdp, "can't start glockd thread: %d\n", error);
459 goto fail;
460 }
461 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
462 }
463
464 error = gfs2_glock_nq_num(sdp, 446 error = gfs2_glock_nq_num(sdp,
465 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, 447 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
466 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, 448 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
493 fs_err(sdp, "can't create transaction glock: %d\n", error); 475 fs_err(sdp, "can't create transaction glock: %d\n", error);
494 goto fail_rename; 476 goto fail_rename;
495 } 477 }
496 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
497 478
498 return 0; 479 return 0;
499 480
@@ -506,9 +487,6 @@ fail_live:
506fail_mount: 487fail_mount:
507 gfs2_glock_dq_uninit(mount_gh); 488 gfs2_glock_dq_uninit(mount_gh);
508fail: 489fail:
509 while (sdp->sd_glockd_num--)
510 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
511
512 return error; 490 return error;
513} 491}
514 492
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
620 598
621 prev_db = 0; 599 prev_db = 0;
622 600
623 for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { 601 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
624 bh.b_state = 0; 602 bh.b_state = 0;
625 bh.b_blocknr = 0; 603 bh.b_blocknr = 0;
626 bh.b_size = 1 << ip->i_inode.i_blkbits; 604 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
661 sdp->sd_lockstruct.ls_lockspace); 639 sdp->sd_lockstruct.ls_lockspace);
662} 640}
663 641
642/**
643 * gfs2_jindex_hold - Grab a lock on the jindex
644 * @sdp: The GFS2 superblock
645 * @ji_gh: the holder for the jindex glock
646 *
647 * Returns: errno
648 */
649
650static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
651{
652 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
653 struct qstr name;
654 char buf[20];
655 struct gfs2_jdesc *jd;
656 int error;
657
658 name.name = buf;
659
660 mutex_lock(&sdp->sd_jindex_mutex);
661
662 for (;;) {
663 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
664 if (error)
665 break;
666
667 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
668 name.hash = gfs2_disk_hash(name.name, name.len);
669
670 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
671 if (error == -ENOENT) {
672 error = 0;
673 break;
674 }
675
676 gfs2_glock_dq_uninit(ji_gh);
677
678 if (error)
679 break;
680
681 error = -ENOMEM;
682 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
683 if (!jd)
684 break;
685
686 INIT_LIST_HEAD(&jd->extent_list);
687 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
688 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
689 if (!jd->jd_inode)
690 error = -ENOENT;
691 else
692 error = PTR_ERR(jd->jd_inode);
693 kfree(jd);
694 break;
695 }
696
697 spin_lock(&sdp->sd_jindex_spin);
698 jd->jd_jid = sdp->sd_journals++;
699 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
700 spin_unlock(&sdp->sd_jindex_spin);
701 }
702
703 mutex_unlock(&sdp->sd_jindex_mutex);
704
705 return error;
706}
707
664static int init_journal(struct gfs2_sbd *sdp, int undo) 708static int init_journal(struct gfs2_sbd *sdp, int undo)
665{ 709{
666 struct inode *master = sdp->sd_master_dir->d_inode; 710 struct inode *master = sdp->sd_master_dir->d_inode;
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
681 return PTR_ERR(sdp->sd_jindex); 725 return PTR_ERR(sdp->sd_jindex);
682 } 726 }
683 ip = GFS2_I(sdp->sd_jindex); 727 ip = GFS2_I(sdp->sd_jindex);
684 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
685 728
686 /* Load in the journal index special file */ 729 /* Load in the journal index special file */
687 730
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
832 goto fail_statfs; 875 goto fail_statfs;
833 } 876 }
834 ip = GFS2_I(sdp->sd_rindex); 877 ip = GFS2_I(sdp->sd_rindex);
835 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
836 sdp->sd_rindex_uptodate = 0; 878 sdp->sd_rindex_uptodate = 0;
837 879
838 /* Read in the quota inode */ 880 /* Read in the quota inode */
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
973 } 1015 }
974 sdp->sd_logd_process = p; 1016 sdp->sd_logd_process = p;
975 1017
976 sdp->sd_statfs_sync_time = jiffies;
977 sdp->sd_quota_sync_time = jiffies;
978
979 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); 1018 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
980 error = IS_ERR(p); 1019 error = IS_ERR(p);
981 if (error) { 1020 if (error) {
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1224static void gfs2_kill_sb(struct super_block *sb) 1263static void gfs2_kill_sb(struct super_block *sb)
1225{ 1264{
1226 struct gfs2_sbd *sdp = sb->s_fs_info; 1265 struct gfs2_sbd *sdp = sb->s_fs_info;
1227 if (sdp) { 1266
1228 gfs2_meta_syncfs(sdp); 1267 if (sdp == NULL) {
1229 dput(sdp->sd_root_dir); 1268 kill_block_super(sb);
1230 dput(sdp->sd_master_dir); 1269 return;
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1233 } 1270 }
1271
1272 gfs2_meta_syncfs(sdp);
1273 dput(sdp->sd_root_dir);
1274 dput(sdp->sd_master_dir);
1275 sdp->sd_root_dir = NULL;
1276 sdp->sd_master_dir = NULL;
1234 shrink_dcache_sb(sb); 1277 shrink_dcache_sb(sb);
1235 kill_block_super(sb); 1278 kill_block_super(sb);
1236 if (sdp) 1279 gfs2_delete_debugfs_file(sdp);
1237 gfs2_delete_debugfs_file(sdp); 1280 kfree(sdp);
1238} 1281}
1239 1282
1240struct file_system_type gfs2_fs_type = { 1283struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da8490511836..000000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type;
17extern const struct export_operations gfs2_export_ops;
18
19#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b9046..49877546beb9 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/crc32.h> 20#include <linux/crc32.h>
21#include <linux/lm_interface.h> 21#include <linux/lm_interface.h>
22#include <linux/fiemap.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
@@ -31,12 +32,11 @@
31#include "glock.h" 32#include "glock.h"
32#include "inode.h" 33#include "inode.h"
33#include "meta_io.h" 34#include "meta_io.h"
34#include "ops_dentry.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
39#include "util.h" 38#include "util.h"
39#include "super.h"
40 40
41/** 41/**
42 * gfs2_create - Create a file 42 * gfs2_create - Create a file
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
185 if (!dip->i_inode.i_nlink) 185 if (!dip->i_inode.i_nlink)
186 goto out_gunlock; 186 goto out_gunlock;
187 error = -EFBIG; 187 error = -EFBIG;
188 if (dip->i_di.di_entries == (u32)-1) 188 if (dip->i_entries == (u32)-1)
189 goto out_gunlock; 189 goto out_gunlock;
190 error = -EPERM; 190 error = -EPERM;
191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
371 371
372 ip = ghs[1].gh_gl->gl_object; 372 ip = ghs[1].gh_gl->gl_object;
373 373
374 ip->i_di.di_size = size; 374 ip->i_disksize = size;
375 375
376 error = gfs2_meta_inode_buffer(ip, &dibh); 376 error = gfs2_meta_inode_buffer(ip, &dibh);
377 377
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
425 ip = ghs[1].gh_gl->gl_object; 425 ip = ghs[1].gh_gl->gl_object;
426 426
427 ip->i_inode.i_nlink = 2; 427 ip->i_inode.i_nlink = 2;
428 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 428 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
429 ip->i_di.di_flags |= GFS2_DIF_JDATA; 429 ip->i_diskflags |= GFS2_DIF_JDATA;
430 ip->i_di.di_entries = 2; 430 ip->i_entries = 2;
431 431
432 error = gfs2_meta_inode_buffer(ip, &dibh); 432 error = gfs2_meta_inode_buffer(ip, &dibh);
433 433
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
517 if (error) 517 if (error)
518 goto out_gunlock; 518 goto out_gunlock;
519 519
520 if (ip->i_di.di_entries < 2) { 520 if (ip->i_entries < 2) {
521 if (gfs2_consist_inode(ip)) 521 if (gfs2_consist_inode(ip))
522 gfs2_dinode_print(ip); 522 gfs2_dinode_print(ip);
523 error = -EIO; 523 error = -EIO;
524 goto out_gunlock; 524 goto out_gunlock;
525 } 525 }
526 if (ip->i_di.di_entries > 2) { 526 if (ip->i_entries > 2) {
527 error = -ENOTEMPTY; 527 error = -ENOTEMPTY;
528 goto out_gunlock; 528 goto out_gunlock;
529 } 529 }
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
726 goto out_gunlock; 726 goto out_gunlock;
727 727
728 if (S_ISDIR(nip->i_inode.i_mode)) { 728 if (S_ISDIR(nip->i_inode.i_mode)) {
729 if (nip->i_di.di_entries < 2) { 729 if (nip->i_entries < 2) {
730 if (gfs2_consist_inode(nip)) 730 if (gfs2_consist_inode(nip))
731 gfs2_dinode_print(nip); 731 gfs2_dinode_print(nip);
732 error = -EIO; 732 error = -EIO;
733 goto out_gunlock; 733 goto out_gunlock;
734 } 734 }
735 if (nip->i_di.di_entries > 2) { 735 if (nip->i_entries > 2) {
736 error = -ENOTEMPTY; 736 error = -ENOTEMPTY;
737 goto out_gunlock; 737 goto out_gunlock;
738 } 738 }
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 error = -EINVAL; 758 error = -EINVAL;
759 goto out_gunlock; 759 goto out_gunlock;
760 } 760 }
761 if (ndip->i_di.di_entries == (u32)-1) { 761 if (ndip->i_entries == (u32)-1) {
762 error = -EFBIG; 762 error = -EFBIG;
763 goto out_gunlock; 763 goto out_gunlock;
764 } 764 }
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
990 struct gfs2_sbd *sdp = GFS2_SB(inode); 990 struct gfs2_sbd *sdp = GFS2_SB(inode);
991 int error; 991 int error;
992 992
993 if (attr->ia_size != ip->i_di.di_size) { 993 if (attr->ia_size != ip->i_disksize) {
994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
995 if (error) 995 if (error)
996 return error; 996 return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1001 } 1001 }
1002 1002
1003 error = gfs2_truncatei(ip, attr->ia_size); 1003 error = gfs2_truncatei(ip, attr->ia_size);
1004 if (error && (inode->i_size != ip->i_di.di_size)) 1004 if (error && (inode->i_size != ip->i_disksize))
1005 i_size_write(inode, ip->i_di.di_size); 1005 i_size_write(inode, ip->i_disksize);
1006 1006
1007 return error; 1007 return error;
1008} 1008}
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); 1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1213} 1213}
1214 1214
1215static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1216 u64 start, u64 len)
1217{
1218 struct gfs2_inode *ip = GFS2_I(inode);
1219 struct gfs2_holder gh;
1220 int ret;
1221
1222 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
1223 if (ret)
1224 return ret;
1225
1226 mutex_lock(&inode->i_mutex);
1227
1228 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1229 if (ret)
1230 goto out;
1231
1232 if (gfs2_is_stuffed(ip)) {
1233 u64 phys = ip->i_no_addr << inode->i_blkbits;
1234 u64 size = i_size_read(inode);
1235 u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
1236 FIEMAP_EXTENT_DATA_INLINE;
1237 phys += sizeof(struct gfs2_dinode);
1238 phys += start;
1239 if (start + len > size)
1240 len = size - start;
1241 if (start < size)
1242 ret = fiemap_fill_next_extent(fieinfo, start, phys,
1243 len, flags);
1244 if (ret == 1)
1245 ret = 0;
1246 } else {
1247 ret = __generic_block_fiemap(inode, fieinfo, start, len,
1248 gfs2_block_map);
1249 }
1250
1251 gfs2_glock_dq_uninit(&gh);
1252out:
1253 mutex_unlock(&inode->i_mutex);
1254 return ret;
1255}
1256
1215const struct inode_operations gfs2_file_iops = { 1257const struct inode_operations gfs2_file_iops = {
1216 .permission = gfs2_permission, 1258 .permission = gfs2_permission,
1217 .setattr = gfs2_setattr, 1259 .setattr = gfs2_setattr,
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = {
1220 .getxattr = gfs2_getxattr, 1262 .getxattr = gfs2_getxattr,
1221 .listxattr = gfs2_listxattr, 1263 .listxattr = gfs2_listxattr,
1222 .removexattr = gfs2_removexattr, 1264 .removexattr = gfs2_removexattr,
1265 .fiemap = gfs2_fiemap,
1223}; 1266};
1224 1267
1225const struct inode_operations gfs2_dir_iops = { 1268const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = {
1239 .getxattr = gfs2_getxattr, 1282 .getxattr = gfs2_getxattr,
1240 .listxattr = gfs2_listxattr, 1283 .listxattr = gfs2_listxattr,
1241 .removexattr = gfs2_removexattr, 1284 .removexattr = gfs2_removexattr,
1285 .fiemap = gfs2_fiemap,
1242}; 1286};
1243 1287
1244const struct inode_operations gfs2_symlink_iops = { 1288const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = {
1251 .getxattr = gfs2_getxattr, 1295 .getxattr = gfs2_getxattr,
1252 .listxattr = gfs2_listxattr, 1296 .listxattr = gfs2_listxattr,
1253 .removexattr = gfs2_removexattr, 1297 .removexattr = gfs2_removexattr,
1298 .fiemap = gfs2_fiemap,
1254}; 1299};
1255 1300
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622a..000000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct inode_operations gfs2_file_iops;
16extern const struct inode_operations gfs2_dir_iops;
17extern const struct inode_operations gfs2_symlink_iops;
18extern const struct file_operations gfs2_file_fops;
19extern const struct file_operations gfs2_dir_fops;
20extern const struct file_operations gfs2_file_fops_nolock;
21extern const struct file_operations gfs2_dir_fops_nolock;
22
23extern void gfs2_set_inode_flags(struct inode *inode);
24
25#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b5926..777783deddcb 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
28#include "inode.h" 28#include "inode.h"
29#include "log.h" 29#include "log.h"
30#include "mount.h" 30#include "mount.h"
31#include "ops_super.h"
32#include "quota.h" 31#include "quota.h"
33#include "recovery.h" 32#include "recovery.h"
34#include "rgrp.h" 33#include "rgrp.h"
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
143 kthread_stop(sdp->sd_quotad_process); 142 kthread_stop(sdp->sd_quotad_process);
144 kthread_stop(sdp->sd_logd_process); 143 kthread_stop(sdp->sd_logd_process);
145 kthread_stop(sdp->sd_recoverd_process); 144 kthread_stop(sdp->sd_recoverd_process);
146 while (sdp->sd_glockd_num--)
147 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
148 145
149 if (!(sb->s_flags & MS_RDONLY)) { 146 if (!(sb->s_flags & MS_RDONLY)) {
150 error = gfs2_make_fs_ro(sdp); 147 error = gfs2_make_fs_ro(sdp);
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
185 182
186 /* At this point, we're through participating in the lockspace */ 183 /* At this point, we're through participating in the lockspace */
187 gfs2_sys_fs_del(sdp); 184 gfs2_sys_fs_del(sdp);
188 kfree(sdp);
189} 185}
190 186
191/** 187/**
@@ -260,6 +256,137 @@ static void gfs2_unlockfs(struct super_block *sb)
260} 256}
261 257
262/** 258/**
259 * statfs_fill - fill in the sg for a given RG
260 * @rgd: the RG
261 * @sc: the sc structure
262 *
263 * Returns: 0 on success, -ESTALE if the LVB is invalid
264 */
265
266static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
267 struct gfs2_statfs_change_host *sc)
268{
269 gfs2_rgrp_verify(rgd);
270 sc->sc_total += rgd->rd_data;
271 sc->sc_free += rgd->rd_free;
272 sc->sc_dinodes += rgd->rd_dinodes;
273 return 0;
274}
275
276/**
277 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
278 * @sdp: the filesystem
279 * @sc: the sc info that will be returned
280 *
281 * Any error (other than a signal) will cause this routine to fall back
282 * to the synchronous version.
283 *
284 * FIXME: This really shouldn't busy wait like this.
285 *
286 * Returns: errno
287 */
288
289static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
290{
291 struct gfs2_holder ri_gh;
292 struct gfs2_rgrpd *rgd_next;
293 struct gfs2_holder *gha, *gh;
294 unsigned int slots = 64;
295 unsigned int x;
296 int done;
297 int error = 0, err;
298
299 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
300 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
301 if (!gha)
302 return -ENOMEM;
303
304 error = gfs2_rindex_hold(sdp, &ri_gh);
305 if (error)
306 goto out;
307
308 rgd_next = gfs2_rgrpd_get_first(sdp);
309
310 for (;;) {
311 done = 1;
312
313 for (x = 0; x < slots; x++) {
314 gh = gha + x;
315
316 if (gh->gh_gl && gfs2_glock_poll(gh)) {
317 err = gfs2_glock_wait(gh);
318 if (err) {
319 gfs2_holder_uninit(gh);
320 error = err;
321 } else {
322 if (!error)
323 error = statfs_slow_fill(
324 gh->gh_gl->gl_object, sc);
325 gfs2_glock_dq_uninit(gh);
326 }
327 }
328
329 if (gh->gh_gl)
330 done = 0;
331 else if (rgd_next && !error) {
332 error = gfs2_glock_nq_init(rgd_next->rd_gl,
333 LM_ST_SHARED,
334 GL_ASYNC,
335 gh);
336 rgd_next = gfs2_rgrpd_get_next(rgd_next);
337 done = 0;
338 }
339
340 if (signal_pending(current))
341 error = -ERESTARTSYS;
342 }
343
344 if (done)
345 break;
346
347 yield();
348 }
349
350 gfs2_glock_dq_uninit(&ri_gh);
351
352out:
353 kfree(gha);
354 return error;
355}
356
357/**
358 * gfs2_statfs_i - Do a statfs
359 * @sdp: the filesystem
360 * @sg: the sg structure
361 *
362 * Returns: errno
363 */
364
365static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
366{
367 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
368 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
369
370 spin_lock(&sdp->sd_statfs_spin);
371
372 *sc = *m_sc;
373 sc->sc_total += l_sc->sc_total;
374 sc->sc_free += l_sc->sc_free;
375 sc->sc_dinodes += l_sc->sc_dinodes;
376
377 spin_unlock(&sdp->sd_statfs_spin);
378
379 if (sc->sc_free < 0)
380 sc->sc_free = 0;
381 if (sc->sc_free > sc->sc_total)
382 sc->sc_free = sc->sc_total;
383 if (sc->sc_dinodes < 0)
384 sc->sc_dinodes = 0;
385
386 return 0;
387}
388
389/**
263 * gfs2_statfs - Gather and return stats about the filesystem 390 * gfs2_statfs - Gather and return stats about the filesystem
264 * @sb: The superblock 391 * @sb: The superblock
265 * @statfsbuf: The buffer 392 * @statfsbuf: The buffer
@@ -370,7 +497,6 @@ static void gfs2_clear_inode(struct inode *inode)
370 */ 497 */
371 if (test_bit(GIF_USER, &ip->i_flags)) { 498 if (test_bit(GIF_USER, &ip->i_flags)) {
372 ip->i_gl->gl_object = NULL; 499 ip->i_gl->gl_object = NULL;
373 gfs2_glock_schedule_for_reclaim(ip->i_gl);
374 gfs2_glock_put(ip->i_gl); 500 gfs2_glock_put(ip->i_gl);
375 ip->i_gl = NULL; 501 ip->i_gl = NULL;
376 if (ip->i_iopen_gh.gh_gl) { 502 if (ip->i_iopen_gh.gh_gl) {
@@ -423,8 +549,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
423 seq_printf(s, ",debug"); 549 seq_printf(s, ",debug");
424 if (args->ar_upgrade) 550 if (args->ar_upgrade)
425 seq_printf(s, ",upgrade"); 551 seq_printf(s, ",upgrade");
426 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
427 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
428 if (args->ar_posix_acl) 552 if (args->ar_posix_acl)
429 seq_printf(s, ",acl"); 553 seq_printf(s, ",acl");
430 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 554 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -494,16 +618,16 @@ static void gfs2_delete_inode(struct inode *inode)
494 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 618 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
495 error = gfs2_glock_nq(&ip->i_iopen_gh); 619 error = gfs2_glock_nq(&ip->i_iopen_gh);
496 if (error) 620 if (error)
497 goto out_uninit; 621 goto out_truncate;
498 622
499 if (S_ISDIR(inode->i_mode) && 623 if (S_ISDIR(inode->i_mode) &&
500 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 624 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
501 error = gfs2_dir_exhash_dealloc(ip); 625 error = gfs2_dir_exhash_dealloc(ip);
502 if (error) 626 if (error)
503 goto out_unlock; 627 goto out_unlock;
504 } 628 }
505 629
506 if (ip->i_di.di_eattr) { 630 if (ip->i_eattr) {
507 error = gfs2_ea_dealloc(ip); 631 error = gfs2_ea_dealloc(ip);
508 if (error) 632 if (error)
509 goto out_unlock; 633 goto out_unlock;
@@ -519,6 +643,7 @@ static void gfs2_delete_inode(struct inode *inode)
519 if (error) 643 if (error)
520 goto out_unlock; 644 goto out_unlock;
521 645
646out_truncate:
522 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 647 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
523 if (error) 648 if (error)
524 goto out_unlock; 649 goto out_unlock;
@@ -527,8 +652,8 @@ static void gfs2_delete_inode(struct inode *inode)
527 gfs2_trans_end(sdp); 652 gfs2_trans_end(sdp);
528 653
529out_unlock: 654out_unlock:
530 gfs2_glock_dq(&ip->i_iopen_gh); 655 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
531out_uninit: 656 gfs2_glock_dq(&ip->i_iopen_gh);
532 gfs2_holder_uninit(&ip->i_iopen_gh); 657 gfs2_holder_uninit(&ip->i_iopen_gh);
533 gfs2_glock_dq_uninit(&gh); 658 gfs2_glock_dq_uninit(&gh);
534 if (error && error != GLR_TRYFAILED) 659 if (error && error != GLR_TRYFAILED)
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c6272..000000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct super_operations gfs2_super_ops;
16
17#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144fa..b08d09696b3e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
46#include <linux/bio.h> 46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h> 48#include <linux/lm_interface.h>
49#include <linux/kthread.h>
50#include <linux/freezer.h>
49 51
50#include "gfs2.h" 52#include "gfs2.h"
51#include "incore.h" 53#include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
94 struct gfs2_quota_data *qd; 96 struct gfs2_quota_data *qd;
95 int error; 97 int error;
96 98
97 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); 99 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
98 if (!qd) 100 if (!qd)
99 return -ENOMEM; 101 return -ENOMEM;
100 102
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
119 return 0; 121 return 0;
120 122
121fail: 123fail:
122 kfree(qd); 124 kmem_cache_free(gfs2_quotad_cachep, qd);
123 return error; 125 return error;
124} 126}
125 127
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
158 if (qd || !create) { 160 if (qd || !create) {
159 if (new_qd) { 161 if (new_qd) {
160 gfs2_lvb_unhold(new_qd->qd_gl); 162 gfs2_lvb_unhold(new_qd->qd_gl);
161 kfree(new_qd); 163 kmem_cache_free(gfs2_quotad_cachep, new_qd);
162 } 164 }
163 *qdp = qd; 165 *qdp = qd;
164 return 0; 166 return 0;
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1013 1015
1014 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) 1016 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
1015 return; 1017 return;
1016 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) 1018 if (ip->i_diskflags & GFS2_DIF_SYSTEM)
1017 return; 1019 return;
1018 1020
1019 for (x = 0; x < al->al_qd_num; x++) { 1021 for (x = 0; x < al->al_qd_num; x++) {
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1100int gfs2_quota_init(struct gfs2_sbd *sdp) 1102int gfs2_quota_init(struct gfs2_sbd *sdp)
1101{ 1103{
1102 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1104 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1103 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 1105 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
1104 unsigned int x, slot = 0; 1106 unsigned int x, slot = 0;
1105 unsigned int found = 0; 1107 unsigned int found = 0;
1106 u64 dblock; 1108 u64 dblock;
1107 u32 extlen = 0; 1109 u32 extlen = 0;
1108 int error; 1110 int error;
1109 1111
1110 if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || 1112 if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
1111 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { 1113 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1112 gfs2_consist_inode(ip); 1114 gfs2_consist_inode(ip);
1113 return -EIO; 1115 return -EIO;
1114 } 1116 }
@@ -1195,7 +1197,7 @@ fail:
1195 return error; 1197 return error;
1196} 1198}
1197 1199
1198void gfs2_quota_scan(struct gfs2_sbd *sdp) 1200static void gfs2_quota_scan(struct gfs2_sbd *sdp)
1199{ 1201{
1200 struct gfs2_quota_data *qd, *safe; 1202 struct gfs2_quota_data *qd, *safe;
1201 LIST_HEAD(dead); 1203 LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
1222 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1224 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1223 1225
1224 gfs2_lvb_unhold(qd->qd_gl); 1226 gfs2_lvb_unhold(qd->qd_gl);
1225 kfree(qd); 1227 kmem_cache_free(gfs2_quotad_cachep, qd);
1226 } 1228 }
1227} 1229}
1228 1230
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1257 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1259 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1258 1260
1259 gfs2_lvb_unhold(qd->qd_gl); 1261 gfs2_lvb_unhold(qd->qd_gl);
1260 kfree(qd); 1262 kmem_cache_free(gfs2_quotad_cachep, qd);
1261 1263
1262 spin_lock(&sdp->sd_quota_spin); 1264 spin_lock(&sdp->sd_quota_spin);
1263 } 1265 }
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1272 } 1274 }
1273} 1275}
1274 1276
1277static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
1278{
1279 if (error == 0 || error == -EROFS)
1280 return;
1281 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
1282 fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
1283}
1284
1285static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
1286 int (*fxn)(struct gfs2_sbd *sdp),
1287 unsigned long t, unsigned long *timeo,
1288 unsigned int *new_timeo)
1289{
1290 if (t >= *timeo) {
1291 int error = fxn(sdp);
1292 quotad_error(sdp, msg, error);
1293 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
1294 } else {
1295 *timeo -= t;
1296 }
1297}
1298
1299static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
1300{
1301 struct gfs2_inode *ip;
1302
1303 while(1) {
1304 ip = NULL;
1305 spin_lock(&sdp->sd_trunc_lock);
1306 if (!list_empty(&sdp->sd_trunc_list)) {
1307 ip = list_entry(sdp->sd_trunc_list.next,
1308 struct gfs2_inode, i_trunc_list);
1309 list_del_init(&ip->i_trunc_list);
1310 }
1311 spin_unlock(&sdp->sd_trunc_lock);
1312 if (ip == NULL)
1313 return;
1314 gfs2_glock_finish_truncate(ip);
1315 }
1316}
1317
1318/**
1319 * gfs2_quotad - Write cached quota changes into the quota file
1320 * @sdp: Pointer to GFS2 superblock
1321 *
1322 */
1323
1324int gfs2_quotad(void *data)
1325{
1326 struct gfs2_sbd *sdp = data;
1327 struct gfs2_tune *tune = &sdp->sd_tune;
1328 unsigned long statfs_timeo = 0;
1329 unsigned long quotad_timeo = 0;
1330 unsigned long t = 0;
1331 DEFINE_WAIT(wait);
1332 int empty;
1333
1334 while (!kthread_should_stop()) {
1335
1336 /* Update the master statfs file */
1337 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
1338 &statfs_timeo, &tune->gt_statfs_quantum);
1339
1340 /* Update quota file */
1341 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1342 &quotad_timeo, &tune->gt_quota_quantum);
1343
1344 /* FIXME: This should be turned into a shrinker */
1345 gfs2_quota_scan(sdp);
1346
1347 /* Check for & recover partially truncated inodes */
1348 quotad_check_trunc_list(sdp);
1349
1350 if (freezing(current))
1351 refrigerator();
1352 t = min(quotad_timeo, statfs_timeo);
1353
1354 prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
1355 spin_lock(&sdp->sd_trunc_lock);
1356 empty = list_empty(&sdp->sd_trunc_list);
1357 spin_unlock(&sdp->sd_trunc_lock);
1358 if (empty)
1359 t -= schedule_timeout(t);
1360 else
1361 t = 0;
1362 finish_wait(&sdp->sd_quota_wait, &wait);
1363 }
1364
1365 return 0;
1366}
1367
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5dfe..cec9032be97d 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,22 +15,22 @@ struct gfs2_sbd;
15 15
16#define NO_QUOTA_CHANGE ((u32)-1) 16#define NO_QUOTA_CHANGE ((u32)-1)
17 17
18int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); 18extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
19void gfs2_quota_unhold(struct gfs2_inode *ip); 19extern void gfs2_quota_unhold(struct gfs2_inode *ip);
20 20
21int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); 21extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
22void gfs2_quota_unlock(struct gfs2_inode *ip); 22extern void gfs2_quota_unlock(struct gfs2_inode *ip);
23 23
24int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); 24extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28int gfs2_quota_sync(struct gfs2_sbd *sdp); 28extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
29int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp); 32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp); 33extern int gfs2_quotad(void *data);
34 34
35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) 35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
36{ 36{
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0b..efd09c3d2b26 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h> 16#include <linux/lm_interface.h>
17#include <linux/kthread.h>
18#include <linux/freezer.h>
17 19
18#include "gfs2.h" 20#include "gfs2.h"
19#include "incore.h" 21#include "incore.h"
@@ -583,13 +585,35 @@ fail:
583 return error; 585 return error;
584} 586}
585 587
588static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
589{
590 struct gfs2_jdesc *jd;
591 int found = 0;
592
593 spin_lock(&sdp->sd_jindex_spin);
594
595 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
596 if (jd->jd_dirty) {
597 jd->jd_dirty = 0;
598 found = 1;
599 break;
600 }
601 }
602 spin_unlock(&sdp->sd_jindex_spin);
603
604 if (!found)
605 jd = NULL;
606
607 return jd;
608}
609
586/** 610/**
587 * gfs2_check_journals - Recover any dirty journals 611 * gfs2_check_journals - Recover any dirty journals
588 * @sdp: the filesystem 612 * @sdp: the filesystem
589 * 613 *
590 */ 614 */
591 615
592void gfs2_check_journals(struct gfs2_sbd *sdp) 616static void gfs2_check_journals(struct gfs2_sbd *sdp)
593{ 617{
594 struct gfs2_jdesc *jd; 618 struct gfs2_jdesc *jd;
595 619
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
603 } 627 }
604} 628}
605 629
630/**
631 * gfs2_recoverd - Recover dead machine's journals
632 * @sdp: Pointer to GFS2 superblock
633 *
634 */
635
636int gfs2_recoverd(void *data)
637{
638 struct gfs2_sbd *sdp = data;
639 unsigned long t;
640
641 while (!kthread_should_stop()) {
642 gfs2_check_journals(sdp);
643 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
644 if (freezing(current))
645 refrigerator();
646 schedule_timeout_interruptible(t);
647 }
648
649 return 0;
650}
651
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c723..a8218ea15b57 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
18 *blk = 0; 18 *blk = 0;
19} 19}
20 20
21int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 21extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
22 struct buffer_head **bh); 22 struct buffer_head **bh);
23 23
24int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 24extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
25int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 25extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
26void gfs2_revoke_clean(struct gfs2_sbd *sdp); 26extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27 27
28int gfs2_find_jhead(struct gfs2_jdesc *jd, 28extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 29 struct gfs2_log_header_host *head);
30int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31void gfs2_check_journals(struct gfs2_sbd *sdp); 31extern int gfs2_recoverd(void *data);
32 32
33#endif /* __RECOVERY_DOT_H__ */ 33#endif /* __RECOVERY_DOT_H__ */
34 34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb253505..8b01c635d925 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
269 bi->bi_len, x); 269 bi->bi_len, x);
270 } 270 }
271 271
272 if (count[0] != rgd->rd_rg.rg_free) { 272 if (count[0] != rgd->rd_free) {
273 if (gfs2_consist_rgrpd(rgd)) 273 if (gfs2_consist_rgrpd(rgd))
274 fs_err(sdp, "free data mismatch: %u != %u\n", 274 fs_err(sdp, "free data mismatch: %u != %u\n",
275 count[0], rgd->rd_rg.rg_free); 275 count[0], rgd->rd_free);
276 return; 276 return;
277 } 277 }
278 278
279 tmp = rgd->rd_data - 279 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
280 rgd->rd_rg.rg_free -
281 rgd->rd_rg.rg_dinodes;
282 if (count[1] + count[2] != tmp) { 280 if (count[1] + count[2] != tmp) {
283 if (gfs2_consist_rgrpd(rgd)) 281 if (gfs2_consist_rgrpd(rgd))
284 fs_err(sdp, "used data mismatch: %u != %u\n", 282 fs_err(sdp, "used data mismatch: %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
286 return; 284 return;
287 } 285 }
288 286
289 if (count[3] != rgd->rd_rg.rg_dinodes) { 287 if (count[3] != rgd->rd_dinodes) {
290 if (gfs2_consist_rgrpd(rgd)) 288 if (gfs2_consist_rgrpd(rgd))
291 fs_err(sdp, "used metadata mismatch: %u != %u\n", 289 fs_err(sdp, "used metadata mismatch: %u != %u\n",
292 count[3], rgd->rd_rg.rg_dinodes); 290 count[3], rgd->rd_dinodes);
293 return; 291 return;
294 } 292 }
295 293
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
501 for (rgrps = 0;; rgrps++) { 499 for (rgrps = 0;; rgrps++) {
502 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 500 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
503 501
504 if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) 502 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
505 break; 503 break;
506 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 504 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
507 sizeof(struct gfs2_rindex)); 505 sizeof(struct gfs2_rindex));
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
590 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
591 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
592 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
593 u64 rgrp_count = ip->i_di.di_size; 591 u64 rgrp_count = ip->i_disksize;
594 int error; 592 int error;
595 593
596 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { 594 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
634 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 632 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
635 /* Ignore partials */ 633 /* Ignore partials */
636 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 634 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
637 ip->i_di.di_size) 635 ip->i_disksize)
638 break; 636 break;
639 error = read_rindex_entry(ip, &ra_state); 637 error = read_rindex_entry(ip, &ra_state);
640 if (error) { 638 if (error) {
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
692static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) 690static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
693{ 691{
694 const struct gfs2_rgrp *str = buf; 692 const struct gfs2_rgrp *str = buf;
695 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
696 u32 rg_flags; 693 u32 rg_flags;
697 694
698 rg_flags = be32_to_cpu(str->rg_flags); 695 rg_flags = be32_to_cpu(str->rg_flags);
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
700 rgd->rd_flags |= GFS2_RDF_NOALLOC; 697 rgd->rd_flags |= GFS2_RDF_NOALLOC;
701 else 698 else
702 rgd->rd_flags &= ~GFS2_RDF_NOALLOC; 699 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
703 rg->rg_free = be32_to_cpu(str->rg_free); 700 rgd->rd_free = be32_to_cpu(str->rg_free);
704 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); 701 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
705 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); 702 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
706} 703}
707 704
708static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) 705static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
709{ 706{
710 struct gfs2_rgrp *str = buf; 707 struct gfs2_rgrp *str = buf;
711 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
712 u32 rg_flags = 0; 708 u32 rg_flags = 0;
713 709
714 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 710 if (rgd->rd_flags & GFS2_RDF_NOALLOC)
715 rg_flags |= GFS2_RGF_NOALLOC; 711 rg_flags |= GFS2_RGF_NOALLOC;
716 str->rg_flags = cpu_to_be32(rg_flags); 712 str->rg_flags = cpu_to_be32(rg_flags);
717 str->rg_free = cpu_to_be32(rg->rg_free); 713 str->rg_free = cpu_to_be32(rgd->rd_free);
718 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); 714 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
719 str->__pad = cpu_to_be32(0); 715 str->__pad = cpu_to_be32(0);
720 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); 716 str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
721 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); 717 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
722} 718}
723 719
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
776 } 772 }
777 773
778 spin_lock(&sdp->sd_rindex_spin); 774 spin_lock(&sdp->sd_rindex_spin);
779 rgd->rd_free_clone = rgd->rd_rg.rg_free; 775 rgd->rd_free_clone = rgd->rd_free;
780 rgd->rd_bh_count++; 776 rgd->rd_bh_count++;
781 spin_unlock(&sdp->sd_rindex_spin); 777 spin_unlock(&sdp->sd_rindex_spin);
782 778
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
850 } 846 }
851 847
852 spin_lock(&sdp->sd_rindex_spin); 848 spin_lock(&sdp->sd_rindex_spin);
853 rgd->rd_free_clone = rgd->rd_rg.rg_free; 849 rgd->rd_free_clone = rgd->rd_free;
854 spin_unlock(&sdp->sd_rindex_spin); 850 spin_unlock(&sdp->sd_rindex_spin);
855} 851}
856 852
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1403 block = rgd->rd_data0 + blk; 1399 block = rgd->rd_data0 + blk;
1404 ip->i_goal = block; 1400 ip->i_goal = block;
1405 1401
1406 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); 1402 gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
1407 rgd->rd_rg.rg_free -= *n; 1403 rgd->rd_free -= *n;
1408 1404
1409 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1405 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1410 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1406 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1445 1441
1446 block = rgd->rd_data0 + blk; 1442 block = rgd->rd_data0 + blk;
1447 1443
1448 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1444 gfs2_assert_withdraw(sdp, rgd->rd_free);
1449 rgd->rd_rg.rg_free--; 1445 rgd->rd_free--;
1450 rgd->rd_rg.rg_dinodes++; 1446 rgd->rd_dinodes++;
1451 *generation = rgd->rd_rg.rg_igeneration++; 1447 *generation = rgd->rd_igeneration++;
1452 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1448 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1453 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1449 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1454 1450
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1481 if (!rgd) 1477 if (!rgd)
1482 return; 1478 return;
1483 1479
1484 rgd->rd_rg.rg_free += blen; 1480 rgd->rd_free += blen;
1485 1481
1486 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1482 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1487 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1483 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1509 if (!rgd) 1505 if (!rgd)
1510 return; 1506 return;
1511 1507
1512 rgd->rd_rg.rg_free += blen; 1508 rgd->rd_free += blen;
1513 1509
1514 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1510 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1515 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1511 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1546 return; 1542 return;
1547 gfs2_assert_withdraw(sdp, rgd == tmp_rgd); 1543 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1548 1544
1549 if (!rgd->rd_rg.rg_dinodes) 1545 if (!rgd->rd_dinodes)
1550 gfs2_consist_rgrpd(rgd); 1546 gfs2_consist_rgrpd(rgd);
1551 rgd->rd_rg.rg_dinodes--; 1547 rgd->rd_dinodes--;
1552 rgd->rd_rg.rg_free++; 1548 rgd->rd_free++;
1553 1549
1554 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1550 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1555 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1551 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aac..141b781f2fcc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -34,76 +34,6 @@
34#include "util.h" 34#include "util.h"
35 35
36/** 36/**
37 * gfs2_jindex_hold - Grab a lock on the jindex
38 * @sdp: The GFS2 superblock
39 * @ji_gh: the holder for the jindex glock
40 *
41 * This is very similar to the gfs2_rindex_hold() function, except that
42 * in general we hold the jindex lock for longer periods of time and
43 * we grab it far less frequently (in general) then the rgrp lock.
44 *
45 * Returns: errno
46 */
47
48int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
49{
50 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
51 struct qstr name;
52 char buf[20];
53 struct gfs2_jdesc *jd;
54 int error;
55
56 name.name = buf;
57
58 mutex_lock(&sdp->sd_jindex_mutex);
59
60 for (;;) {
61 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
62 if (error)
63 break;
64
65 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
66 name.hash = gfs2_disk_hash(name.name, name.len);
67
68 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
69 if (error == -ENOENT) {
70 error = 0;
71 break;
72 }
73
74 gfs2_glock_dq_uninit(ji_gh);
75
76 if (error)
77 break;
78
79 error = -ENOMEM;
80 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
81 if (!jd)
82 break;
83
84 INIT_LIST_HEAD(&jd->extent_list);
85 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
86 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
87 if (!jd->jd_inode)
88 error = -ENOENT;
89 else
90 error = PTR_ERR(jd->jd_inode);
91 kfree(jd);
92 break;
93 }
94
95 spin_lock(&sdp->sd_jindex_spin);
96 jd->jd_jid = sdp->sd_journals++;
97 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
98 spin_unlock(&sdp->sd_jindex_spin);
99 }
100
101 mutex_unlock(&sdp->sd_jindex_mutex);
102
103 return error;
104}
105
106/**
107 * gfs2_jindex_free - Clear all the journal index information 37 * gfs2_jindex_free - Clear all the journal index information
108 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
109 * 39 *
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
166 return jd; 96 return jd;
167} 97}
168 98
169void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
170{
171 struct gfs2_jdesc *jd;
172
173 spin_lock(&sdp->sd_jindex_spin);
174 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
175 if (jd)
176 jd->jd_dirty = 1;
177 spin_unlock(&sdp->sd_jindex_spin);
178}
179
180struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
181{
182 struct gfs2_jdesc *jd;
183 int found = 0;
184
185 spin_lock(&sdp->sd_jindex_spin);
186
187 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
188 if (jd->jd_dirty) {
189 jd->jd_dirty = 0;
190 found = 1;
191 break;
192 }
193 }
194 spin_unlock(&sdp->sd_jindex_spin);
195
196 if (!found)
197 jd = NULL;
198
199 return jd;
200}
201
202int gfs2_jdesc_check(struct gfs2_jdesc *jd) 99int gfs2_jdesc_check(struct gfs2_jdesc *jd)
203{ 100{
204 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 101 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
206 int ar; 103 int ar;
207 int error; 104 int error;
208 105
209 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || 106 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
210 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { 107 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
211 gfs2_consist_inode(ip); 108 gfs2_consist_inode(ip);
212 return -EIO; 109 return -EIO;
213 } 110 }
214 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 111 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
215 112
216 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); 113 error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
217 if (!error && ar) { 114 if (!error && ar) {
218 gfs2_consist_inode(ip); 115 gfs2_consist_inode(ip);
219 error = -EIO; 116 error = -EIO;
@@ -423,137 +320,6 @@ out:
423 return error; 320 return error;
424} 321}
425 322
426/**
427 * gfs2_statfs_i - Do a statfs
428 * @sdp: the filesystem
429 * @sg: the sg structure
430 *
431 * Returns: errno
432 */
433
434int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
435{
436 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
437 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
438
439 spin_lock(&sdp->sd_statfs_spin);
440
441 *sc = *m_sc;
442 sc->sc_total += l_sc->sc_total;
443 sc->sc_free += l_sc->sc_free;
444 sc->sc_dinodes += l_sc->sc_dinodes;
445
446 spin_unlock(&sdp->sd_statfs_spin);
447
448 if (sc->sc_free < 0)
449 sc->sc_free = 0;
450 if (sc->sc_free > sc->sc_total)
451 sc->sc_free = sc->sc_total;
452 if (sc->sc_dinodes < 0)
453 sc->sc_dinodes = 0;
454
455 return 0;
456}
457
458/**
459 * statfs_fill - fill in the sg for a given RG
460 * @rgd: the RG
461 * @sc: the sc structure
462 *
463 * Returns: 0 on success, -ESTALE if the LVB is invalid
464 */
465
466static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
467 struct gfs2_statfs_change_host *sc)
468{
469 gfs2_rgrp_verify(rgd);
470 sc->sc_total += rgd->rd_data;
471 sc->sc_free += rgd->rd_rg.rg_free;
472 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
473 return 0;
474}
475
476/**
477 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
478 * @sdp: the filesystem
479 * @sc: the sc info that will be returned
480 *
481 * Any error (other than a signal) will cause this routine to fall back
482 * to the synchronous version.
483 *
484 * FIXME: This really shouldn't busy wait like this.
485 *
486 * Returns: errno
487 */
488
489int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
490{
491 struct gfs2_holder ri_gh;
492 struct gfs2_rgrpd *rgd_next;
493 struct gfs2_holder *gha, *gh;
494 unsigned int slots = 64;
495 unsigned int x;
496 int done;
497 int error = 0, err;
498
499 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
500 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
501 if (!gha)
502 return -ENOMEM;
503
504 error = gfs2_rindex_hold(sdp, &ri_gh);
505 if (error)
506 goto out;
507
508 rgd_next = gfs2_rgrpd_get_first(sdp);
509
510 for (;;) {
511 done = 1;
512
513 for (x = 0; x < slots; x++) {
514 gh = gha + x;
515
516 if (gh->gh_gl && gfs2_glock_poll(gh)) {
517 err = gfs2_glock_wait(gh);
518 if (err) {
519 gfs2_holder_uninit(gh);
520 error = err;
521 } else {
522 if (!error)
523 error = statfs_slow_fill(
524 gh->gh_gl->gl_object, sc);
525 gfs2_glock_dq_uninit(gh);
526 }
527 }
528
529 if (gh->gh_gl)
530 done = 0;
531 else if (rgd_next && !error) {
532 error = gfs2_glock_nq_init(rgd_next->rd_gl,
533 LM_ST_SHARED,
534 GL_ASYNC,
535 gh);
536 rgd_next = gfs2_rgrpd_get_next(rgd_next);
537 done = 0;
538 }
539
540 if (signal_pending(current))
541 error = -ERESTARTSYS;
542 }
543
544 if (done)
545 break;
546
547 yield();
548 }
549
550 gfs2_glock_dq_uninit(&ri_gh);
551
552out:
553 kfree(gha);
554 return error;
555}
556
557struct lfcc { 323struct lfcc {
558 struct list_head list; 324 struct list_head list;
559 struct gfs2_holder gh; 325 struct gfs2_holder gh;
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
580 struct gfs2_log_header_host lh; 346 struct gfs2_log_header_host lh;
581 int error; 347 int error;
582 348
583 error = gfs2_jindex_hold(sdp, &ji_gh);
584 if (error)
585 return error;
586
587 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 349 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
588 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); 350 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
589 if (!lfcc) { 351 if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215e..f6b8b00ad881 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
10#ifndef __SUPER_DOT_H__ 10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__ 11#define __SUPER_DOT_H__
12 12
13#include <linux/fs.h>
14#include <linux/dcache.h>
13#include "incore.h" 15#include "incore.h"
14 16
15void gfs2_lm_unmount(struct gfs2_sbd *sdp); 17void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
23 return x; 25 return x;
24} 26}
25 27
26int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
27void gfs2_jindex_free(struct gfs2_sbd *sdp); 28void gfs2_jindex_free(struct gfs2_sbd *sdp);
28 29
29struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
30void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
31struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
32int gfs2_jdesc_check(struct gfs2_jdesc *jd); 31int gfs2_jdesc_check(struct gfs2_jdesc *jd);
33 32
34int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, 33int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
40void gfs2_statfs_change(struct gfs2_sbd *sdp, 39void gfs2_statfs_change(struct gfs2_sbd *sdp,
41 s64 total, s64 free, s64 dinodes); 40 s64 total, s64 free, s64 dinodes);
42int gfs2_statfs_sync(struct gfs2_sbd *sdp); 41int gfs2_statfs_sync(struct gfs2_sbd *sdp);
43int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
44int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
45 42
46int gfs2_freeze_fs(struct gfs2_sbd *sdp); 43int gfs2_freeze_fs(struct gfs2_sbd *sdp);
47void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 44void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
48 45
46extern struct file_system_type gfs2_fs_type;
47extern struct file_system_type gfs2meta_fs_type;
48extern const struct export_operations gfs2_export_ops;
49extern const struct super_operations gfs2_super_ops;
50extern struct dentry_operations gfs2_dops;
51
49#endif /* __SUPER_DOT_H__ */ 52#endif /* __SUPER_DOT_H__ */
50 53
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02c..26c1fa777a95 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
26#include "quota.h" 26#include "quota.h"
27#include "util.h" 27#include "util.h"
28 28
29char *gfs2_sys_margs;
30spinlock_t gfs2_sys_margs_lock;
31
32static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
33{ 30{
34 return snprintf(buf, PAGE_SIZE, "%u:%u\n", 31 return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n");
263ARGS_ATTR(localflocks, "%d\n"); 260ARGS_ATTR(localflocks, "%d\n");
264ARGS_ATTR(debug, "%d\n"); 261ARGS_ATTR(debug, "%d\n");
265ARGS_ATTR(upgrade, "%d\n"); 262ARGS_ATTR(upgrade, "%d\n");
266ARGS_ATTR(num_glockd, "%u\n");
267ARGS_ATTR(posix_acl, "%d\n"); 263ARGS_ATTR(posix_acl, "%d\n");
268ARGS_ATTR(quota, "%u\n"); 264ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 265ARGS_ATTR(suiddir, "%d\n");
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = {
279 &args_attr_localflocks.attr, 275 &args_attr_localflocks.attr,
280 &args_attr_debug.attr, 276 &args_attr_debug.attr,
281 &args_attr_upgrade.attr, 277 &args_attr_upgrade.attr,
282 &args_attr_num_glockd.attr,
283 &args_attr_posix_acl.attr, 278 &args_attr_posix_acl.attr,
284 &args_attr_quota.attr, 279 &args_attr_quota.attr,
285 &args_attr_suiddir.attr, 280 &args_attr_suiddir.attr,
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = {
288}; 283};
289 284
290/* 285/*
291 * display counters from superblock
292 */
293
294struct counters_attr {
295 struct attribute attr;
296 ssize_t (*show)(struct gfs2_sbd *, char *);
297};
298
299#define COUNTERS_ATTR(name, fmt) \
300static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
301{ \
302 return snprintf(buf, PAGE_SIZE, fmt, \
303 (unsigned int)atomic_read(&sdp->sd_##name)); \
304} \
305static struct counters_attr counters_attr_##name = __ATTR_RO(name)
306
307COUNTERS_ATTR(reclaimed, "%u\n");
308
309static struct attribute *counters_attrs[] = {
310 &counters_attr_reclaimed.attr,
311 NULL,
312};
313
314/*
315 * get and set struct gfs2_tune fields 286 * get and set struct gfs2_tune fields
316 */ 287 */
317 288
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
393} \ 364} \
394TUNE_ATTR_2(name, name##_store) 365TUNE_ATTR_2(name, name##_store)
395 366
396TUNE_ATTR(demote_secs, 0);
397TUNE_ATTR(incore_log_blocks, 0); 367TUNE_ATTR(incore_log_blocks, 0);
398TUNE_ATTR(log_flush_secs, 0); 368TUNE_ATTR(log_flush_secs, 0);
399TUNE_ATTR(quota_warn_period, 0); 369TUNE_ATTR(quota_warn_period, 0);
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1);
408TUNE_ATTR(statfs_quantum, 1); 378TUNE_ATTR(statfs_quantum, 1);
409TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 379TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
410TUNE_ATTR_DAEMON(logd_secs, logd_process); 380TUNE_ATTR_DAEMON(logd_secs, logd_process);
411TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
412TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 381TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
413 382
414static struct attribute *tune_attrs[] = { 383static struct attribute *tune_attrs[] = {
415 &tune_attr_demote_secs.attr,
416 &tune_attr_incore_log_blocks.attr, 384 &tune_attr_incore_log_blocks.attr,
417 &tune_attr_log_flush_secs.attr, 385 &tune_attr_log_flush_secs.attr,
418 &tune_attr_quota_warn_period.attr, 386 &tune_attr_quota_warn_period.attr,
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = {
426 &tune_attr_statfs_quantum.attr, 394 &tune_attr_statfs_quantum.attr,
427 &tune_attr_recoverd_secs.attr, 395 &tune_attr_recoverd_secs.attr,
428 &tune_attr_logd_secs.attr, 396 &tune_attr_logd_secs.attr,
429 &tune_attr_quotad_secs.attr,
430 &tune_attr_quota_scale.attr, 397 &tune_attr_quota_scale.attr,
431 &tune_attr_new_files_jdata.attr, 398 &tune_attr_new_files_jdata.attr,
432 NULL, 399 NULL,
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = {
437 .attrs = lockstruct_attrs, 404 .attrs = lockstruct_attrs,
438}; 405};
439 406
440static struct attribute_group counters_group = {
441 .name = "counters",
442 .attrs = counters_attrs,
443};
444
445static struct attribute_group args_group = { 407static struct attribute_group args_group = {
446 .name = "args", 408 .name = "args",
447 .attrs = args_attrs, 409 .attrs = args_attrs,
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
466 if (error) 428 if (error)
467 goto fail_reg; 429 goto fail_reg;
468 430
469 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
470 if (error)
471 goto fail_lockstruct;
472
473 error = sysfs_create_group(&sdp->sd_kobj, &args_group); 431 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
474 if (error) 432 if (error)
475 goto fail_counters; 433 goto fail_lockstruct;
476 434
477 error = sysfs_create_group(&sdp->sd_kobj, &tune_group); 435 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
478 if (error) 436 if (error)
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
483 441
484fail_args: 442fail_args:
485 sysfs_remove_group(&sdp->sd_kobj, &args_group); 443 sysfs_remove_group(&sdp->sd_kobj, &args_group);
486fail_counters:
487 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
488fail_lockstruct: 444fail_lockstruct:
489 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 445 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
490fail_reg: 446fail_reg:
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
498{ 454{
499 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 455 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
500 sysfs_remove_group(&sdp->sd_kobj, &args_group); 456 sysfs_remove_group(&sdp->sd_kobj, &args_group);
501 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
502 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 457 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
503 kobject_put(&sdp->sd_kobj); 458 kobject_put(&sdp->sd_kobj);
504} 459}
505 460
461static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
462 struct kobj_uevent_env *env)
463{
464 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
465 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
466 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
467 return 0;
468}
469
470static struct kset_uevent_ops gfs2_uevent_ops = {
471 .uevent = gfs2_uevent,
472};
473
474
506int gfs2_sys_init(void) 475int gfs2_sys_init(void)
507{ 476{
508 gfs2_sys_margs = NULL; 477 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
509 spin_lock_init(&gfs2_sys_margs_lock);
510 gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
511 if (!gfs2_kset) 478 if (!gfs2_kset)
512 return -ENOMEM; 479 return -ENOMEM;
513 return 0; 480 return 0;
@@ -515,7 +482,6 @@ int gfs2_sys_init(void)
515 482
516void gfs2_sys_uninit(void) 483void gfs2_sys_uninit(void)
517{ 484{
518 kfree(gfs2_sys_margs);
519 kset_unregister(gfs2_kset); 485 kset_unregister(gfs2_kset);
520} 486}
521 487
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac5304..e94560e836d7 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14struct gfs2_sbd; 14struct gfs2_sbd;
15 15
16/* Allow args to be passed to GFS2 when using an initial ram disk */
17extern char *gfs2_sys_margs;
18extern spinlock_t gfs2_sys_margs_lock;
19
20int gfs2_sys_fs_add(struct gfs2_sbd *sdp); 16int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
21void gfs2_sys_fs_del(struct gfs2_sbd *sdp); 17void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
22 18
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61fb..374f50e95496 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
25struct kmem_cache *gfs2_inode_cachep __read_mostly; 25struct kmem_cache *gfs2_inode_cachep __read_mostly;
26struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 26struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
28struct kmem_cache *gfs2_quotad_cachep __read_mostly;
28 29
29void gfs2_assert_i(struct gfs2_sbd *sdp) 30void gfs2_assert_i(struct gfs2_sbd *sdp)
30{ 31{
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c9..33e96b0ce9ab 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 148extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 149extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 150extern struct kmem_cache *gfs2_rgrpd_cachep;
151extern struct kmem_cache *gfs2_quotad_cachep;
151 152
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, 153static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p) 154 unsigned int *p)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3aceb..6903d37af037 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
252 for (;;) { 252 for (;;) {
253 struct page *page; 253 struct page *page;
254 unsigned long nr, ret; 254 unsigned long nr, ret;
255 int ra;
255 256
256 /* nr is the maximum number of bytes to copy from this page */ 257 /* nr is the maximum number of bytes to copy from this page */
257 nr = huge_page_size(h); 258 nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
274 */ 275 */
275 ret = len < nr ? len : nr; 276 ret = len < nr ? len : nr;
276 if (clear_user(buf, ret)) 277 if (clear_user(buf, ret))
277 ret = -EFAULT; 278 ra = -EFAULT;
279 else
280 ra = 0;
278 } else { 281 } else {
279 /* 282 /*
280 * We have the page, copy it to user space buffer. 283 * We have the page, copy it to user space buffer.
281 */ 284 */
282 ret = hugetlbfs_read_actor(page, offset, buf, len, nr); 285 ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
286 ret = ra;
283 } 287 }
284 if (ret < 0) { 288 if (ra < 0) {
285 if (retval == 0) 289 if (retval == 0)
286 retval = ret; 290 retval = ra;
287 if (page) 291 if (page)
288 page_cache_release(page); 292 page_cache_release(page);
289 goto out; 293 goto out;
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
506 inode->i_mode = mode; 510 inode->i_mode = mode;
507 inode->i_uid = uid; 511 inode->i_uid = uid;
508 inode->i_gid = gid; 512 inode->i_gid = gid;
509 inode->i_blocks = 0;
510 inode->i_mapping->a_ops = &hugetlbfs_aops; 513 inode->i_mapping->a_ops = &hugetlbfs_aops;
511 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 514 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
512 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 515 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 7de1cda92489..0013ac1af8e7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/inotify.h> 23#include <linux/inotify.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/async.h>
25 26
26/* 27/*
27 * This is needed for the following functions: 28 * This is needed for the following functions:
@@ -110,8 +111,8 @@ static void wake_up_inode(struct inode *inode)
110 111
111/** 112/**
112 * inode_init_always - perform inode structure intialisation 113 * inode_init_always - perform inode structure intialisation
113 * @sb - superblock inode belongs to. 114 * @sb: superblock inode belongs to
114 * @inode - inode to initialise 115 * @inode: inode to initialise
115 * 116 *
116 * These are initializations that need to be done on every inode 117 * These are initializations that need to be done on every inode
117 * allocation as the fields are not initialised by slab allocation. 118 * allocation as the fields are not initialised by slab allocation.
@@ -131,6 +132,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
131 inode->i_op = &empty_iops; 132 inode->i_op = &empty_iops;
132 inode->i_fop = &empty_fops; 133 inode->i_fop = &empty_fops;
133 inode->i_nlink = 1; 134 inode->i_nlink = 1;
135 inode->i_uid = 0;
136 inode->i_gid = 0;
134 atomic_set(&inode->i_writecount, 0); 137 atomic_set(&inode->i_writecount, 0);
135 inode->i_size = 0; 138 inode->i_size = 0;
136 inode->i_blocks = 0; 139 inode->i_blocks = 0;
@@ -164,7 +167,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
164 mapping->a_ops = &empty_aops; 167 mapping->a_ops = &empty_aops;
165 mapping->host = inode; 168 mapping->host = inode;
166 mapping->flags = 0; 169 mapping->flags = 0;
167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 170 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
168 mapping->assoc_mapping = NULL; 171 mapping->assoc_mapping = NULL;
169 mapping->backing_dev_info = &default_backing_dev_info; 172 mapping->backing_dev_info = &default_backing_dev_info;
170 mapping->writeback_index = 0; 173 mapping->writeback_index = 0;
@@ -574,8 +577,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
574 577
575/** 578/**
576 * inode_add_to_lists - add a new inode to relevant lists 579 * inode_add_to_lists - add a new inode to relevant lists
577 * @sb - superblock inode belongs to. 580 * @sb: superblock inode belongs to
578 * @inode - inode to mark in use 581 * @inode: inode to mark in use
579 * 582 *
580 * When an inode is allocated it needs to be accounted for, added to the in use 583 * When an inode is allocated it needs to be accounted for, added to the in use
581 * list, the owning superblock and the inode hash. This needs to be done under 584 * list, the owning superblock and the inode hash. This needs to be done under
@@ -599,7 +602,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
599 * @sb: superblock 602 * @sb: superblock
600 * 603 *
601 * Allocates a new inode for given superblock. The default gfp_mask 604 * Allocates a new inode for given superblock. The default gfp_mask
602 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. 605 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
603 * If HIGHMEM pages are unsuitable or it is known that pages allocated 606 * If HIGHMEM pages are unsuitable or it is known that pages allocated
604 * for the page cache are not reclaimable or migratable, 607 * for the page cache are not reclaimable or migratable,
605 * mapping_set_gfp_mask() must be called with suitable flags on the 608 * mapping_set_gfp_mask() must be called with suitable flags on the
@@ -1136,16 +1139,11 @@ EXPORT_SYMBOL(remove_inode_hash);
1136 * I_FREEING is set so that no-one will take a new reference to the inode while 1139 * I_FREEING is set so that no-one will take a new reference to the inode while
1137 * it is being deleted. 1140 * it is being deleted.
1138 */ 1141 */
1139void generic_delete_inode(struct inode *inode) 1142static void generic_delete_inode_async(void *data, async_cookie_t cookie)
1140{ 1143{
1144 struct inode *inode = data;
1141 const struct super_operations *op = inode->i_sb->s_op; 1145 const struct super_operations *op = inode->i_sb->s_op;
1142 1146
1143 list_del_init(&inode->i_list);
1144 list_del_init(&inode->i_sb_list);
1145 inode->i_state |= I_FREEING;
1146 inodes_stat.nr_inodes--;
1147 spin_unlock(&inode_lock);
1148
1149 security_inode_delete(inode); 1147 security_inode_delete(inode);
1150 1148
1151 if (op->delete_inode) { 1149 if (op->delete_inode) {
@@ -1169,6 +1167,16 @@ void generic_delete_inode(struct inode *inode)
1169 destroy_inode(inode); 1167 destroy_inode(inode);
1170} 1168}
1171 1169
1170void generic_delete_inode(struct inode *inode)
1171{
1172 list_del_init(&inode->i_list);
1173 list_del_init(&inode->i_sb_list);
1174 inode->i_state |= I_FREEING;
1175 inodes_stat.nr_inodes--;
1176 spin_unlock(&inode_lock);
1177 async_schedule_special(generic_delete_inode_async, inode, &inode->i_sb->s_async_list);
1178}
1179
1172EXPORT_SYMBOL(generic_delete_inode); 1180EXPORT_SYMBOL(generic_delete_inode);
1173 1181
1174static void generic_forget_inode(struct inode *inode) 1182static void generic_forget_inode(struct inode *inode)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664b..cc3f1aa1cf7b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
233 233
234/* 234/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
235 * @inode - the inode to map 236 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to 237 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function 238 * @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
242 * 243 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size, then 244 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
244 * please do not use this function, it will stop at the first unmapped block 245 * please do not use this function, it will stop at the first unmapped block
245 * beyond i_size 246 * beyond i_size.
247 *
248 * If you use this function directly, you need to do your own locking. Use
249 * generic_block_fiemap if you want the locking done for you.
246 */ 250 */
247int generic_block_fiemap(struct inode *inode, 251
248 struct fiemap_extent_info *fieinfo, u64 start, 252int __generic_block_fiemap(struct inode *inode,
249 u64 len, get_block_t *get_block) 253 struct fiemap_extent_info *fieinfo, u64 start,
254 u64 len, get_block_t *get_block)
250{ 255{
251 struct buffer_head tmp; 256 struct buffer_head tmp;
252 unsigned int start_blk; 257 unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
260 265
261 start_blk = logical_to_blk(inode, start); 266 start_blk = logical_to_blk(inode, start);
262 267
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode)); 268 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length; 269 map_len = length;
268 270
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
334 cond_resched(); 336 cond_resched();
335 } while (1); 337 } while (1);
336 338
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */ 339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1) 340 if (ret == 1)
341 ret = 0; 341 ret = 0;
342 342
343 return ret; 343 return ret;
344} 344}
345EXPORT_SYMBOL(__generic_block_fiemap);
346
347/**
348 * generic_block_fiemap - FIEMAP for block based inodes
349 * @inode: The inode to map
350 * @fieinfo: The mapping information
351 * @start: The initial block to map
352 * @len: The length of the extent to attempt to map
353 * @get_block: The block mapping function for the fs
354 *
355 * Calls __generic_block_fiemap to map the inode, after taking
356 * the inode's mutex lock.
357 */
358
359int generic_block_fiemap(struct inode *inode,
360 struct fiemap_extent_info *fieinfo, u64 start,
361 u64 len, get_block_t *get_block)
362{
363 int ret;
364 mutex_lock(&inode->i_mutex);
365 ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
366 mutex_unlock(&inode->i_mutex);
367 return ret;
368}
345EXPORT_SYMBOL(generic_block_fiemap); 369EXPORT_SYMBOL(generic_block_fiemap);
346 370
347#endif /* CONFIG_BLOCK */ 371#endif /* CONFIG_BLOCK */
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505b..6147ec3643a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
855 } 855 }
856 sbi->s_joliet_level = joliet_level; 856 sbi->s_joliet_level = joliet_level;
857 857
858 /* check the root inode */
859 if (!inode->i_op)
860 goto out_bad_root;
861
862 /* Make sure the root inode is a directory */ 858 /* Make sure the root inode is a directory */
863 if (!S_ISDIR(inode->i_mode)) { 859 if (!S_ISDIR(inode->i_mode)) {
864 printk(KERN_WARNING 860 printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
886 /* 882 /*
887 * Display error messages and free resources. 883 * Display error messages and free resources.
888 */ 884 */
889out_bad_root:
890 printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
891out_iput: 885out_iput:
892 iput(inode); 886 iput(inode);
893 goto out_no_inode; 887 goto out_no_inode;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
306 int flags; 306 int flags;
307 int err; 307 int err;
308 unsigned long blocknr; 308 unsigned long blocknr;
309 ktime_t start_time;
310 u64 commit_time;
309 char *tagp = NULL; 311 char *tagp = NULL;
310 journal_header_t *header; 312 journal_header_t *header;
311 journal_block_tag_t *tag = NULL; 313 journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
418 commit_transaction->t_state = T_FLUSH; 420 commit_transaction->t_state = T_FLUSH;
419 journal->j_committing_transaction = commit_transaction; 421 journal->j_committing_transaction = commit_transaction;
420 journal->j_running_transaction = NULL; 422 journal->j_running_transaction = NULL;
423 start_time = ktime_get();
421 commit_transaction->t_log_start = journal->j_head; 424 commit_transaction->t_log_start = journal->j_head;
422 wake_up(&journal->j_wait_transaction_locked); 425 wake_up(&journal->j_wait_transaction_locked);
423 spin_unlock(&journal->j_state_lock); 426 spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
913 J_ASSERT(commit_transaction == journal->j_committing_transaction); 916 J_ASSERT(commit_transaction == journal->j_committing_transaction);
914 journal->j_commit_sequence = commit_transaction->t_tid; 917 journal->j_commit_sequence = commit_transaction->t_tid;
915 journal->j_committing_transaction = NULL; 918 journal->j_committing_transaction = NULL;
919 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
920
921 /*
922 * weight the commit time higher than the average time so we don't
923 * react too strongly to vast changes in commit time
924 */
925 if (likely(journal->j_average_commit_time))
926 journal->j_average_commit_time = (commit_time*3 +
927 journal->j_average_commit_time) / 4;
928 else
929 journal->j_average_commit_time = commit_time;
930
916 spin_unlock(&journal->j_state_lock); 931 spin_unlock(&journal->j_state_lock);
917 932
918 if (commit_transaction->t_checkpoint_list == NULL && 933 if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __journal_temp_unlink_buffer(struct journal_head *jh); 30static void __journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
49{ 50{
50 transaction->t_journal = journal; 51 transaction->t_journal = journal;
51 transaction->t_state = T_RUNNING; 52 transaction->t_state = T_RUNNING;
53 transaction->t_start_time = ktime_get();
52 transaction->t_tid = journal->j_transaction_sequence++; 54 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 55 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
752 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. 754 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
753 * @handle: transaction to add buffer modifications to 755 * @handle: transaction to add buffer modifications to
754 * @bh: bh to be used for metadata writes 756 * @bh: bh to be used for metadata writes
755 * @credits: variable that will receive credits for the buffer
756 * 757 *
757 * Returns an error code or 0 on success. 758 * Returns an error code or 0 on success.
758 * 759 *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
1370{ 1371{
1371 transaction_t *transaction = handle->h_transaction; 1372 transaction_t *transaction = handle->h_transaction;
1372 journal_t *journal = transaction->t_journal; 1373 journal_t *journal = transaction->t_journal;
1373 int old_handle_count, err; 1374 int err;
1374 pid_t pid; 1375 pid_t pid;
1375 1376
1376 J_ASSERT(journal_current_handle() == handle); 1377 J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
1399 * on IO anyway. Speeds up many-threaded, many-dir operations 1400 * on IO anyway. Speeds up many-threaded, many-dir operations
1400 * by 30x or more... 1401 * by 30x or more...
1401 * 1402 *
1403 * We try and optimize the sleep time against what the underlying disk
1404 * can do, instead of having a static sleep time. This is usefull for
1405 * the case where our storage is so fast that it is more optimal to go
1406 * ahead and force a flush and wait for the transaction to be committed
1407 * than it is to wait for an arbitrary amount of time for new writers to
1408 * join the transaction. We acheive this by measuring how long it takes
1409 * to commit a transaction, and compare it with how long this
1410 * transaction has been running, and if run time < commit time then we
1411 * sleep for the delta and commit. This greatly helps super fast disks
1412 * that would see slowdowns as more threads started doing fsyncs.
1413 *
1402 * But don't do this if this process was the most recent one to 1414 * But don't do this if this process was the most recent one to
1403 * perform a synchronous write. We do this to detect the case where a 1415 * perform a synchronous write. We do this to detect the case where a
1404 * single process is doing a stream of sync writes. No point in waiting 1416 * single process is doing a stream of sync writes. No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
1406 */ 1418 */
1407 pid = current->pid; 1419 pid = current->pid;
1408 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1420 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1421 u64 commit_time, trans_time;
1422
1409 journal->j_last_sync_writer = pid; 1423 journal->j_last_sync_writer = pid;
1410 do { 1424
1411 old_handle_count = transaction->t_handle_count; 1425 spin_lock(&journal->j_state_lock);
1412 schedule_timeout_uninterruptible(1); 1426 commit_time = journal->j_average_commit_time;
1413 } while (old_handle_count != transaction->t_handle_count); 1427 spin_unlock(&journal->j_state_lock);
1428
1429 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1430 transaction->t_start_time));
1431
1432 commit_time = min_t(u64, commit_time,
1433 1000*jiffies_to_usecs(1));
1434
1435 if (trans_time < commit_time) {
1436 ktime_t expires = ktime_add_ns(ktime_get(),
1437 commit_time);
1438 set_current_state(TASK_UNINTERRUPTIBLE);
1439 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1440 }
1414 } 1441 }
1415 1442
1416 current->journal_info = NULL; 1443 current->journal_info = NULL;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 073124a29b8c..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -535,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
535 if (is_journal_aborted(journal)) { 535 if (is_journal_aborted(journal)) {
536 clear_buffer_jbddirty(jh2bh(jh)); 536 clear_buffer_jbddirty(jh2bh(jh));
537 JBUFFER_TRACE(jh, "journal is aborting: refile"); 537 JBUFFER_TRACE(jh, "journal is aborting: refile");
538 jbd2_buffer_abort_trigger(jh,
539 jh->b_frozen_data ?
540 jh->b_frozen_triggers :
541 jh->b_triggers);
538 jbd2_journal_refile_buffer(journal, jh); 542 jbd2_journal_refile_buffer(journal, jh);
539 /* If that was the last one, we need to clean up 543 /* If that was the last one, we need to clean up
540 * any descriptor buffers which may have been 544 * any descriptor buffers which may have been
@@ -870,6 +874,9 @@ restart_loop:
870 * data. 874 * data.
871 * 875 *
872 * Otherwise, we can just throw away the frozen data now. 876 * Otherwise, we can just throw away the frozen data now.
877 *
878 * We also know that the frozen data has already fired
879 * its triggers if they exist, so we can clear that too.
873 */ 880 */
874 if (jh->b_committed_data) { 881 if (jh->b_committed_data) {
875 jbd2_free(jh->b_committed_data, bh->b_size); 882 jbd2_free(jh->b_committed_data, bh->b_size);
@@ -877,10 +884,12 @@ restart_loop:
877 if (jh->b_frozen_data) { 884 if (jh->b_frozen_data) {
878 jh->b_committed_data = jh->b_frozen_data; 885 jh->b_committed_data = jh->b_frozen_data;
879 jh->b_frozen_data = NULL; 886 jh->b_frozen_data = NULL;
887 jh->b_frozen_triggers = NULL;
880 } 888 }
881 } else if (jh->b_frozen_data) { 889 } else if (jh->b_frozen_data) {
882 jbd2_free(jh->b_frozen_data, bh->b_size); 890 jbd2_free(jh->b_frozen_data, bh->b_size);
883 jh->b_frozen_data = NULL; 891 jh->b_frozen_data = NULL;
892 jh->b_frozen_triggers = NULL;
884 } 893 }
885 894
886 spin_lock(&journal->j_list_lock); 895 spin_lock(&journal->j_list_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2932c8f55199..56675306ed81 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -51,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
51EXPORT_SYMBOL(jbd2_journal_get_write_access); 51EXPORT_SYMBOL(jbd2_journal_get_write_access);
52EXPORT_SYMBOL(jbd2_journal_get_create_access); 52EXPORT_SYMBOL(jbd2_journal_get_create_access);
53EXPORT_SYMBOL(jbd2_journal_get_undo_access); 53EXPORT_SYMBOL(jbd2_journal_get_undo_access);
54EXPORT_SYMBOL(jbd2_journal_set_triggers);
54EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 55EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
55EXPORT_SYMBOL(jbd2_journal_release_buffer); 56EXPORT_SYMBOL(jbd2_journal_release_buffer);
56EXPORT_SYMBOL(jbd2_journal_forget); 57EXPORT_SYMBOL(jbd2_journal_forget);
@@ -291,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
291 struct page *new_page; 292 struct page *new_page;
292 unsigned int new_offset; 293 unsigned int new_offset;
293 struct buffer_head *bh_in = jh2bh(jh_in); 294 struct buffer_head *bh_in = jh2bh(jh_in);
295 struct jbd2_buffer_trigger_type *triggers;
294 296
295 /* 297 /*
296 * The buffer really shouldn't be locked: only the current committing 298 * The buffer really shouldn't be locked: only the current committing
@@ -315,13 +317,23 @@ repeat:
315 done_copy_out = 1; 317 done_copy_out = 1;
316 new_page = virt_to_page(jh_in->b_frozen_data); 318 new_page = virt_to_page(jh_in->b_frozen_data);
317 new_offset = offset_in_page(jh_in->b_frozen_data); 319 new_offset = offset_in_page(jh_in->b_frozen_data);
320 triggers = jh_in->b_frozen_triggers;
318 } else { 321 } else {
319 new_page = jh2bh(jh_in)->b_page; 322 new_page = jh2bh(jh_in)->b_page;
320 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 323 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
324 triggers = jh_in->b_triggers;
321 } 325 }
322 326
323 mapped_data = kmap_atomic(new_page, KM_USER0); 327 mapped_data = kmap_atomic(new_page, KM_USER0);
324 /* 328 /*
329 * Fire any commit trigger. Do this before checking for escaping,
330 * as the trigger may modify the magic offset. If a copy-out
331 * happens afterwards, it will have the correct data in the buffer.
332 */
333 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
334 triggers);
335
336 /*
325 * Check for escaping 337 * Check for escaping
326 */ 338 */
327 if (*((__be32 *)(mapped_data + new_offset)) == 339 if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -353,6 +365,13 @@ repeat:
353 new_page = virt_to_page(tmp); 365 new_page = virt_to_page(tmp);
354 new_offset = offset_in_page(tmp); 366 new_offset = offset_in_page(tmp);
355 done_copy_out = 1; 367 done_copy_out = 1;
368
369 /*
370 * This isn't strictly necessary, as we're using frozen
371 * data for the escaping, but it keeps consistency with
372 * b_frozen_data usage.
373 */
374 jh_in->b_frozen_triggers = jh_in->b_triggers;
356 } 375 }
357 376
358 /* 377 /*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 48c21bac5a56..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -743,6 +743,12 @@ done:
743 source = kmap_atomic(page, KM_USER0); 743 source = kmap_atomic(page, KM_USER0);
744 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 744 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
745 kunmap_atomic(source, KM_USER0); 745 kunmap_atomic(source, KM_USER0);
746
747 /*
748 * Now that the frozen data is saved off, we need to store
749 * any matching triggers.
750 */
751 jh->b_frozen_triggers = jh->b_triggers;
746 } 752 }
747 jbd_unlock_bh_state(bh); 753 jbd_unlock_bh_state(bh);
748 754
@@ -946,6 +952,47 @@ out:
946} 952}
947 953
948/** 954/**
955 * void jbd2_journal_set_triggers() - Add triggers for commit writeout
956 * @bh: buffer to trigger on
957 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
958 *
959 * Set any triggers on this journal_head. This is always safe, because
960 * triggers for a committing buffer will be saved off, and triggers for
961 * a running transaction will match the buffer in that transaction.
962 *
963 * Call with NULL to clear the triggers.
964 */
965void jbd2_journal_set_triggers(struct buffer_head *bh,
966 struct jbd2_buffer_trigger_type *type)
967{
968 struct journal_head *jh = bh2jh(bh);
969
970 jh->b_triggers = type;
971}
972
973void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
974 struct jbd2_buffer_trigger_type *triggers)
975{
976 struct buffer_head *bh = jh2bh(jh);
977
978 if (!triggers || !triggers->t_commit)
979 return;
980
981 triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
982}
983
984void jbd2_buffer_abort_trigger(struct journal_head *jh,
985 struct jbd2_buffer_trigger_type *triggers)
986{
987 if (!triggers || !triggers->t_abort)
988 return;
989
990 triggers->t_abort(triggers, jh2bh(jh));
991}
992
993
994
995/**
949 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 996 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
950 * @handle: transaction to add buffer to. 997 * @handle: transaction to add buffer to.
951 * @bh: buffer to mark 998 * @bh: buffer to mark
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d0..0f94381ca6d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
58 58
59/* 59/*
60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want 60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want
61 * special inodes in the fileset inode space, we hash them to a dummy head 61 * special inodes in the fileset inode space, we make them appear hashed,
62 * but do not put on any lists.
62 */ 63 */
63static HLIST_HEAD(aggregate_hash);
64 64
65/* 65/*
66 * imap locks 66 * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
496 /* release the page */ 496 /* release the page */
497 release_metapage(mp); 497 release_metapage(mp);
498 498
499 hlist_add_head(&ip->i_hash, &aggregate_hash); 499 /*
500 * that will look hashed, but won't be on any list; hlist_del()
501 * will work fine and require no locking.
502 */
503 ip->i_hash.pprev = &ip->i_hash.next;
500 504
501 return (ip); 505 return (ip);
502} 506}
diff --git a/fs/libfs.c b/fs/libfs.c
index bdaec17fa388..49b44099dabb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
231 */ 231 */
232 root->i_ino = 1; 232 root->i_ino = 1;
233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; 233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
234 root->i_uid = root->i_gid = 0;
235 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; 234 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
236 dentry = d_alloc(NULL, &d_name); 235 dentry = d_alloc(NULL, &d_name);
237 if (!dentry) { 236 if (!dentry) {
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
436 */ 435 */
437 inode->i_ino = 1; 436 inode->i_ino = 1;
438 inode->i_mode = S_IFDIR | 0755; 437 inode->i_mode = S_IFDIR | 0755;
439 inode->i_uid = inode->i_gid = 0;
440 inode->i_blocks = 0;
441 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 438 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
442 inode->i_op = &simple_dir_inode_operations; 439 inode->i_op = &simple_dir_inode_operations;
443 inode->i_fop = &simple_dir_operations; 440 inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
464 if (!inode) 461 if (!inode)
465 goto out; 462 goto out;
466 inode->i_mode = S_IFREG | files->mode; 463 inode->i_mode = S_IFREG | files->mode;
467 inode->i_uid = inode->i_gid = 0;
468 inode->i_blocks = 0;
469 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 464 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
470 inode->i_fop = files->ops; 465 inode->i_fop = files->ops;
471 inode->i_ino = i; 466 inode->i_ino = i;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e03..dd7957064a8c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
22#define NLMCLNT_GRACE_WAIT (5*HZ) 21#define NLMCLNT_GRACE_WAIT (5*HZ)
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
518 unsigned char fl_type; 517 unsigned char fl_type;
519 int status = -ENOLCK; 518 int status = -ENOLCK;
520 519
521 if (nsm_monitor(host) < 0) { 520 if (nsm_monitor(host) < 0)
522 printk(KERN_NOTICE "lockd: failed to monitor %s\n",
523 host->h_name);
524 goto out; 521 goto out;
525 } 522
526 fl->fl_flags |= FL_ACCESS; 523 fl->fl_flags |= FL_ACCESS;
527 status = do_vfs_lock(fl); 524 status = do_vfs_lock(fl);
528 fl->fl_flags = fl_flags; 525 fl->fl_flags = fl_flags;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index abdebf76b820..99d737bd4325 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
15#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/sm_inter.h>
19#include <linux/mutex.h> 18#include <linux/mutex.h>
20 19
21#include <net/ipv6.h> 20#include <net/ipv6.h>
@@ -32,11 +31,6 @@ static int nrhosts;
32static DEFINE_MUTEX(nlm_host_mutex); 31static DEFINE_MUTEX(nlm_host_mutex);
33 32
34static void nlm_gc_hosts(void); 33static void nlm_gc_hosts(void);
35static struct nsm_handle *nsm_find(const struct sockaddr *sap,
36 const size_t salen,
37 const char *hostname,
38 const size_t hostname_len,
39 const int create);
40 34
41struct nlm_lookup_host_info { 35struct nlm_lookup_host_info {
42 const int server; /* search for server|client */ 36 const int server; /* search for server|client */
@@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap)
105 } 99 }
106} 100}
107 101
108static void nlm_display_address(const struct sockaddr *sap,
109 char *buf, const size_t len)
110{
111 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
112 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
113
114 switch (sap->sa_family) {
115 case AF_UNSPEC:
116 snprintf(buf, len, "unspecified");
117 break;
118 case AF_INET:
119 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
120 break;
121 case AF_INET6:
122 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
123 snprintf(buf, len, "%pI4",
124 &sin6->sin6_addr.s6_addr32[3]);
125 else
126 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
127 break;
128 default:
129 snprintf(buf, len, "unsupported address family");
130 break;
131 }
132}
133
134/* 102/*
135 * Common host lookup routine for server & client 103 * Common host lookup routine for server & client
136 */ 104 */
@@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
190 atomic_inc(&nsm->sm_count); 158 atomic_inc(&nsm->sm_count);
191 else { 159 else {
192 host = NULL; 160 host = NULL;
193 nsm = nsm_find(ni->sap, ni->salen, 161 nsm = nsm_get_handle(ni->sap, ni->salen,
194 ni->hostname, ni->hostname_len, 1); 162 ni->hostname, ni->hostname_len);
195 if (!nsm) { 163 if (!nsm) {
196 dprintk("lockd: nlm_lookup_host failed; " 164 dprintk("lockd: nlm_lookup_host failed; "
197 "no nsm handle\n"); 165 "no nsm handle\n");
@@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
206 goto out; 174 goto out;
207 } 175 }
208 host->h_name = nsm->sm_name; 176 host->h_name = nsm->sm_name;
177 host->h_addrbuf = nsm->sm_addrbuf;
209 memcpy(nlm_addr(host), ni->sap, ni->salen); 178 memcpy(nlm_addr(host), ni->sap, ni->salen);
210 host->h_addrlen = ni->salen; 179 host->h_addrlen = ni->salen;
211 nlm_clear_port(nlm_addr(host)); 180 nlm_clear_port(nlm_addr(host));
@@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
232 201
233 nrhosts++; 202 nrhosts++;
234 203
235 nlm_display_address((struct sockaddr *)&host->h_addr,
236 host->h_addrbuf, sizeof(host->h_addrbuf));
237 nlm_display_address((struct sockaddr *)&host->h_srcaddr,
238 host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
239
240 dprintk("lockd: nlm_lookup_host created host %s\n", 204 dprintk("lockd: nlm_lookup_host created host %s\n",
241 host->h_name); 205 host->h_name);
242 206
@@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host)
256 BUG_ON(!list_empty(&host->h_lockowners)); 220 BUG_ON(!list_empty(&host->h_lockowners));
257 BUG_ON(atomic_read(&host->h_count)); 221 BUG_ON(atomic_read(&host->h_count));
258 222
259 /*
260 * Release NSM handle and unmonitor host.
261 */
262 nsm_unmonitor(host); 223 nsm_unmonitor(host);
224 nsm_release(host->h_nsmhandle);
263 225
264 clnt = host->h_rpcclnt; 226 clnt = host->h_rpcclnt;
265 if (clnt != NULL) 227 if (clnt != NULL)
@@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host)
378{ 340{
379 struct rpc_clnt *clnt; 341 struct rpc_clnt *clnt;
380 342
381 dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", 343 dprintk("lockd: nlm_bind_host %s (%s)\n",
382 host->h_name, host->h_addrbuf, host->h_srcaddrbuf); 344 host->h_name, host->h_addrbuf);
383 345
384 /* Lock host handle */ 346 /* Lock host handle */
385 mutex_lock(&host->h_mutex); 347 mutex_lock(&host->h_mutex);
@@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host)
481 } 443 }
482} 444}
483 445
484/* 446/**
485 * We were notified that the host indicated by address &sin 447 * nlm_host_rebooted - Release all resources held by rebooted host
486 * has rebooted. 448 * @info: pointer to decoded results of NLM_SM_NOTIFY call
487 * Release all resources held by that peer. 449 *
450 * We were notified that the specified host has rebooted. Release
451 * all resources held by that peer.
488 */ 452 */
489void nlm_host_rebooted(const struct sockaddr_in *sin, 453void nlm_host_rebooted(const struct nlm_reboot *info)
490 const char *hostname,
491 unsigned int hostname_len,
492 u32 new_state)
493{ 454{
494 struct hlist_head *chain; 455 struct hlist_head *chain;
495 struct hlist_node *pos; 456 struct hlist_node *pos;
496 struct nsm_handle *nsm; 457 struct nsm_handle *nsm;
497 struct nlm_host *host; 458 struct nlm_host *host;
498 459
499 nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), 460 nsm = nsm_reboot_lookup(info);
500 hostname, hostname_len, 0); 461 if (unlikely(nsm == NULL))
501 if (nsm == NULL) {
502 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
503 hostname_len, hostname);
504 return; 462 return;
505 }
506
507 dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
508 hostname_len, hostname, nsm->sm_addrbuf);
509
510 /* When reclaiming locks on this peer, make sure that
511 * we set up a new notification */
512 nsm->sm_monitored = 0;
513 463
514 /* Mark all hosts tied to this NSM state as having rebooted. 464 /* Mark all hosts tied to this NSM state as having rebooted.
515 * We run the loop repeatedly, because we drop the host table 465 * We run the loop repeatedly, because we drop the host table
@@ -520,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex);
520 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 470 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
521 hlist_for_each_entry(host, pos, chain, h_hash) { 471 hlist_for_each_entry(host, pos, chain, h_hash) {
522 if (host->h_nsmhandle == nsm 472 if (host->h_nsmhandle == nsm
523 && host->h_nsmstate != new_state) { 473 && host->h_nsmstate != info->state) {
524 host->h_nsmstate = new_state; 474 host->h_nsmstate = info->state;
525 host->h_state++; 475 host->h_state++;
526 476
527 nlm_get_host(host); 477 nlm_get_host(host);
@@ -629,89 +579,3 @@ nlm_gc_hosts(void)
629 579
630 next_gc = jiffies + NLM_HOST_COLLECT; 580 next_gc = jiffies + NLM_HOST_COLLECT;
631} 581}
632
633
634/*
635 * Manage NSM handles
636 */
637static LIST_HEAD(nsm_handles);
638static DEFINE_SPINLOCK(nsm_lock);
639
640static struct nsm_handle *nsm_find(const struct sockaddr *sap,
641 const size_t salen,
642 const char *hostname,
643 const size_t hostname_len,
644 const int create)
645{
646 struct nsm_handle *nsm = NULL;
647 struct nsm_handle *pos;
648
649 if (!sap)
650 return NULL;
651
652 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
653 if (printk_ratelimit()) {
654 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
655 "in NFS lock request\n",
656 (int)hostname_len, hostname);
657 }
658 return NULL;
659 }
660
661retry:
662 spin_lock(&nsm_lock);
663 list_for_each_entry(pos, &nsm_handles, sm_link) {
664
665 if (hostname && nsm_use_hostnames) {
666 if (strlen(pos->sm_name) != hostname_len
667 || memcmp(pos->sm_name, hostname, hostname_len))
668 continue;
669 } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
670 continue;
671 atomic_inc(&pos->sm_count);
672 kfree(nsm);
673 nsm = pos;
674 goto found;
675 }
676 if (nsm) {
677 list_add(&nsm->sm_link, &nsm_handles);
678 goto found;
679 }
680 spin_unlock(&nsm_lock);
681
682 if (!create)
683 return NULL;
684
685 nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
686 if (nsm == NULL)
687 return NULL;
688
689 memcpy(nsm_addr(nsm), sap, salen);
690 nsm->sm_addrlen = salen;
691 nsm->sm_name = (char *) (nsm + 1);
692 memcpy(nsm->sm_name, hostname, hostname_len);
693 nsm->sm_name[hostname_len] = '\0';
694 nlm_display_address((struct sockaddr *)&nsm->sm_addr,
695 nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
696 atomic_set(&nsm->sm_count, 1);
697 goto retry;
698
699found:
700 spin_unlock(&nsm_lock);
701 return nsm;
702}
703
704/*
705 * Release an NSM handle
706 */
707void
708nsm_release(struct nsm_handle *nsm)
709{
710 if (!nsm)
711 return;
712 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
713 list_del(&nsm->sm_link);
714 spin_unlock(&nsm_lock);
715 kfree(nsm);
716 }
717}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75ef..5e2c4d5ac827 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,35 +9,123 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h>
13
12#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
13#include <linux/sunrpc/xprtsock.h> 15#include <linux/sunrpc/xprtsock.h>
14#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
16#include <linux/lockd/sm_inter.h>
17
18 18
19#define NLMDBG_FACILITY NLMDBG_MONITOR 19#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024
21#define NSM_VERSION 1
22
23enum {
24 NSMPROC_NULL,
25 NSMPROC_STAT,
26 NSMPROC_MON,
27 NSMPROC_UNMON,
28 NSMPROC_UNMON_ALL,
29 NSMPROC_SIMU_CRASH,
30 NSMPROC_NOTIFY,
31};
32
33struct nsm_args {
34 struct nsm_private *priv;
35 u32 prog; /* RPC callback info */
36 u32 vers;
37 u32 proc;
20 38
21#define XDR_ADDRBUF_LEN (20) 39 char *mon_name;
40};
22 41
23static struct rpc_clnt * nsm_create(void); 42struct nsm_res {
43 u32 status;
44 u32 state;
45};
24 46
25static struct rpc_program nsm_program; 47static struct rpc_program nsm_program;
48static LIST_HEAD(nsm_handles);
49static DEFINE_SPINLOCK(nsm_lock);
26 50
27/* 51/*
28 * Local NSM state 52 * Local NSM state
29 */ 53 */
30int nsm_local_state; 54int __read_mostly nsm_local_state;
55int __read_mostly nsm_use_hostnames;
31 56
32/* 57static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
33 * Common procedure for SM_MON/SM_UNMON calls 58{
34 */ 59 return (struct sockaddr *)&nsm->sm_addr;
35static int 60}
36nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) 61
62static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
63 const size_t len)
64{
65 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
66 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
67}
68
69static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
70 const size_t len)
71{
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
73
74 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
75 snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
76 else if (sin6->sin6_scope_id != 0)
77 snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
78 sin6->sin6_scope_id);
79 else
80 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
81}
82
83static void nsm_display_address(const struct sockaddr *sap,
84 char *buf, const size_t len)
85{
86 switch (sap->sa_family) {
87 case AF_INET:
88 nsm_display_ipv4_address(sap, buf, len);
89 break;
90 case AF_INET6:
91 nsm_display_ipv6_address(sap, buf, len);
92 break;
93 default:
94 snprintf(buf, len, "unsupported address family");
95 break;
96 }
97}
98
99static struct rpc_clnt *nsm_create(void)
100{
101 struct sockaddr_in sin = {
102 .sin_family = AF_INET,
103 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
104 };
105 struct rpc_create_args args = {
106 .protocol = XPRT_TRANSPORT_UDP,
107 .address = (struct sockaddr *)&sin,
108 .addrsize = sizeof(sin),
109 .servername = "rpc.statd",
110 .program = &nsm_program,
111 .version = NSM_VERSION,
112 .authflavor = RPC_AUTH_NULL,
113 };
114
115 return rpc_create(&args);
116}
117
118static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
37{ 119{
38 struct rpc_clnt *clnt; 120 struct rpc_clnt *clnt;
39 int status; 121 int status;
40 struct nsm_args args; 122 struct nsm_args args = {
123 .priv = &nsm->sm_priv,
124 .prog = NLM_PROGRAM,
125 .vers = 3,
126 .proc = NLMPROC_NSM_NOTIFY,
127 .mon_name = nsm->sm_mon_name,
128 };
41 struct rpc_message msg = { 129 struct rpc_message msg = {
42 .rpc_argp = &args, 130 .rpc_argp = &args,
43 .rpc_resp = res, 131 .rpc_resp = res,
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
46 clnt = nsm_create(); 134 clnt = nsm_create();
47 if (IS_ERR(clnt)) { 135 if (IS_ERR(clnt)) {
48 status = PTR_ERR(clnt); 136 status = PTR_ERR(clnt);
137 dprintk("lockd: failed to create NSM upcall transport, "
138 "status=%d\n", status);
49 goto out; 139 goto out;
50 } 140 }
51 141
52 memset(&args, 0, sizeof(args));
53 args.mon_name = nsm->sm_name;
54 args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
55 args.prog = NLM_PROGRAM;
56 args.vers = 3;
57 args.proc = NLMPROC_NSM_NOTIFY;
58 memset(res, 0, sizeof(*res)); 142 memset(res, 0, sizeof(*res));
59 143
60 msg.rpc_proc = &clnt->cl_procinfo[proc]; 144 msg.rpc_proc = &clnt->cl_procinfo[proc];
61 status = rpc_call_sync(clnt, &msg, 0); 145 status = rpc_call_sync(clnt, &msg, 0);
62 if (status < 0) 146 if (status < 0)
63 printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", 147 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
64 status); 148 status);
65 else 149 else
66 status = 0; 150 status = 0;
67 rpc_shutdown_client(clnt); 151 rpc_shutdown_client(clnt);
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
69 return status; 153 return status;
70} 154}
71 155
72/* 156/**
73 * Set up monitoring of a remote host 157 * nsm_monitor - Notify a peer in case we reboot
158 * @host: pointer to nlm_host of peer to notify
159 *
160 * If this peer is not already monitored, this function sends an
161 * upcall to the local rpc.statd to record the name/address of
162 * the peer to notify in case we reboot.
163 *
164 * Returns zero if the peer is monitored by the local rpc.statd;
165 * otherwise a negative errno value is returned.
74 */ 166 */
75int 167int nsm_monitor(const struct nlm_host *host)
76nsm_monitor(struct nlm_host *host)
77{ 168{
78 struct nsm_handle *nsm = host->h_nsmhandle; 169 struct nsm_handle *nsm = host->h_nsmhandle;
79 struct nsm_res res; 170 struct nsm_res res;
80 int status; 171 int status;
81 172
82 dprintk("lockd: nsm_monitor(%s)\n", host->h_name); 173 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
83 BUG_ON(nsm == NULL);
84 174
85 if (nsm->sm_monitored) 175 if (nsm->sm_monitored)
86 return 0; 176 return 0;
87 177
88 status = nsm_mon_unmon(nsm, SM_MON, &res); 178 /*
179 * Choose whether to record the caller_name or IP address of
180 * this peer in the local rpc.statd's database.
181 */
182 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
89 183
90 if (status < 0 || res.status != 0) 184 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
91 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); 185 if (res.status != 0)
186 status = -EIO;
187 if (status < 0)
188 printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
92 else 189 else
93 nsm->sm_monitored = 1; 190 nsm->sm_monitored = 1;
94 return status; 191 return status;
95} 192}
96 193
97/* 194/**
98 * Cease to monitor remote host 195 * nsm_unmonitor - Unregister peer notification
196 * @host: pointer to nlm_host of peer to stop monitoring
197 *
198 * If this peer is monitored, this function sends an upcall to
199 * tell the local rpc.statd not to send this peer a notification
200 * when we reboot.
99 */ 201 */
100int 202void nsm_unmonitor(const struct nlm_host *host)
101nsm_unmonitor(struct nlm_host *host)
102{ 203{
103 struct nsm_handle *nsm = host->h_nsmhandle; 204 struct nsm_handle *nsm = host->h_nsmhandle;
104 struct nsm_res res; 205 struct nsm_res res;
105 int status = 0; 206 int status;
106
107 if (nsm == NULL)
108 return 0;
109 host->h_nsmhandle = NULL;
110 207
111 if (atomic_read(&nsm->sm_count) == 1 208 if (atomic_read(&nsm->sm_count) == 1
112 && nsm->sm_monitored && !nsm->sm_sticky) { 209 && nsm->sm_monitored && !nsm->sm_sticky) {
113 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); 210 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
114 211
115 status = nsm_mon_unmon(nsm, SM_UNMON, &res); 212 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
213 if (res.status != 0)
214 status = -EIO;
116 if (status < 0) 215 if (status < 0)
117 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", 216 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
118 host->h_name); 217 nsm->sm_name);
119 else 218 else
120 nsm->sm_monitored = 0; 219 nsm->sm_monitored = 0;
121 } 220 }
122 nsm_release(nsm); 221}
123 return status; 222
223static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
224 const size_t len)
225{
226 struct nsm_handle *nsm;
227
228 list_for_each_entry(nsm, &nsm_handles, sm_link)
229 if (strlen(nsm->sm_name) == len &&
230 memcmp(nsm->sm_name, hostname, len) == 0)
231 return nsm;
232 return NULL;
233}
234
235static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
236{
237 struct nsm_handle *nsm;
238
239 list_for_each_entry(nsm, &nsm_handles, sm_link)
240 if (nlm_cmp_addr(nsm_addr(nsm), sap))
241 return nsm;
242 return NULL;
243}
244
245static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
246{
247 struct nsm_handle *nsm;
248
249 list_for_each_entry(nsm, &nsm_handles, sm_link)
250 if (memcmp(nsm->sm_priv.data, priv->data,
251 sizeof(priv->data)) == 0)
252 return nsm;
253 return NULL;
124} 254}
125 255
126/* 256/*
127 * Create NSM client for the local host 257 * Construct a unique cookie to match this nsm_handle to this monitored
258 * host. It is passed to the local rpc.statd via NSMPROC_MON, and
259 * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
260 * requests.
261 *
262 * The NSM protocol requires that these cookies be unique while the
263 * system is running. We prefer a stronger requirement of making them
264 * unique across reboots. If user space bugs cause a stale cookie to
265 * be sent to the kernel, it could cause the wrong host to lose its
266 * lock state if cookies were not unique across reboots.
267 *
268 * The cookies are exposed only to local user space via loopback. They
269 * do not appear on the physical network. If we want greater security
270 * for some reason, nsm_init_private() could perform a one-way hash to
271 * obscure the contents of the cookie.
128 */ 272 */
129static struct rpc_clnt * 273static void nsm_init_private(struct nsm_handle *nsm)
130nsm_create(void)
131{ 274{
132 struct sockaddr_in sin = { 275 u64 *p = (u64 *)&nsm->sm_priv.data;
133 .sin_family = AF_INET, 276 struct timespec ts;
134 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
135 .sin_port = 0,
136 };
137 struct rpc_create_args args = {
138 .protocol = XPRT_TRANSPORT_UDP,
139 .address = (struct sockaddr *)&sin,
140 .addrsize = sizeof(sin),
141 .servername = "localhost",
142 .program = &nsm_program,
143 .version = SM_VERSION,
144 .authflavor = RPC_AUTH_NULL,
145 };
146 277
147 return rpc_create(&args); 278 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm;
281}
282
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
284 const size_t salen,
285 const char *hostname,
286 const size_t hostname_len)
287{
288 struct nsm_handle *new;
289
290 new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
291 if (unlikely(new == NULL))
292 return NULL;
293
294 atomic_set(&new->sm_count, 1);
295 new->sm_name = (char *)(new + 1);
296 memcpy(nsm_addr(new), sap, salen);
297 new->sm_addrlen = salen;
298 nsm_init_private(new);
299 nsm_display_address((const struct sockaddr *)&new->sm_addr,
300 new->sm_addrbuf, sizeof(new->sm_addrbuf));
301 memcpy(new->sm_name, hostname, hostname_len);
302 new->sm_name[hostname_len] = '\0';
303
304 return new;
305}
306
307/**
308 * nsm_get_handle - Find or create a cached nsm_handle
309 * @sap: pointer to socket address of handle to find
310 * @salen: length of socket address
311 * @hostname: pointer to C string containing hostname to find
312 * @hostname_len: length of C string
313 *
314 * Behavior is modulated by the global nsm_use_hostnames variable.
315 *
316 * Returns a cached nsm_handle after bumping its ref count, or
317 * returns a fresh nsm_handle if a handle that matches @sap and/or
318 * @hostname cannot be found in the handle cache. Returns NULL if
319 * an error occurs.
320 */
321struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
322 const size_t salen, const char *hostname,
323 const size_t hostname_len)
324{
325 struct nsm_handle *cached, *new = NULL;
326
327 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
328 if (printk_ratelimit()) {
329 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
330 "in NFS lock request\n",
331 (int)hostname_len, hostname);
332 }
333 return NULL;
334 }
335
336retry:
337 spin_lock(&nsm_lock);
338
339 if (nsm_use_hostnames && hostname != NULL)
340 cached = nsm_lookup_hostname(hostname, hostname_len);
341 else
342 cached = nsm_lookup_addr(sap);
343
344 if (cached != NULL) {
345 atomic_inc(&cached->sm_count);
346 spin_unlock(&nsm_lock);
347 kfree(new);
348 dprintk("lockd: found nsm_handle for %s (%s), "
349 "cnt %d\n", cached->sm_name,
350 cached->sm_addrbuf,
351 atomic_read(&cached->sm_count));
352 return cached;
353 }
354
355 if (new != NULL) {
356 list_add(&new->sm_link, &nsm_handles);
357 spin_unlock(&nsm_lock);
358 dprintk("lockd: created nsm_handle for %s (%s)\n",
359 new->sm_name, new->sm_addrbuf);
360 return new;
361 }
362
363 spin_unlock(&nsm_lock);
364
365 new = nsm_create_handle(sap, salen, hostname, hostname_len);
366 if (unlikely(new == NULL))
367 return NULL;
368 goto retry;
369}
370
371/**
372 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
373 * @info: pointer to NLMPROC_SM_NOTIFY arguments
374 *
375 * Returns a matching nsm_handle if found in the nsm cache; the returned
376 * nsm_handle's reference count is bumped and sm_monitored is cleared.
377 * Otherwise returns NULL if some error occurred.
378 */
379struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
380{
381 struct nsm_handle *cached;
382
383 spin_lock(&nsm_lock);
384
385 cached = nsm_lookup_priv(&info->priv);
386 if (unlikely(cached == NULL)) {
387 spin_unlock(&nsm_lock);
388 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
389 info->len, info->mon);
390 return cached;
391 }
392
393 atomic_inc(&cached->sm_count);
394 spin_unlock(&nsm_lock);
395
396 /*
397 * During subsequent lock activity, force a fresh
398 * notification to be set up for this host.
399 */
400 cached->sm_monitored = 0;
401
402 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
403 cached->sm_name, cached->sm_addrbuf,
404 atomic_read(&cached->sm_count));
405 return cached;
406}
407
408/**
409 * nsm_release - Release an NSM handle
410 * @nsm: pointer to handle to be released
411 *
412 */
413void nsm_release(struct nsm_handle *nsm)
414{
415 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
416 list_del(&nsm->sm_link);
417 spin_unlock(&nsm_lock);
418 dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
419 nsm->sm_name, nsm->sm_addrbuf);
420 kfree(nsm);
421 }
148} 422}
149 423
150/* 424/*
@@ -154,127 +428,132 @@ nsm_create(void)
154 * Status Monitor wire protocol. 428 * Status Monitor wire protocol.
155 */ 429 */
156 430
157static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) 431static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
158{ 432{
159 size_t len = strlen(string); 433 const u32 len = strlen(string);
160 434 __be32 *p;
161 if (len > SM_MAXSTRLEN) 435
162 len = SM_MAXSTRLEN; 436 if (unlikely(len > SM_MAXSTRLEN))
163 return xdr_encode_opaque(p, string, len); 437 return -EIO;
438 p = xdr_reserve_space(xdr, sizeof(u32) + len);
439 if (unlikely(p == NULL))
440 return -EIO;
441 xdr_encode_opaque(p, string, len);
442 return 0;
164} 443}
165 444
166/* 445/*
167 * "mon_name" specifies the host to be monitored. 446 * "mon_name" specifies the host to be monitored.
168 *
169 * Linux uses a text version of the IP address of the remote
170 * host as the host identifier (the "mon_name" argument).
171 *
172 * Linux statd always looks up the canonical hostname first for
173 * whatever remote hostname it receives, so this works alright.
174 */ 447 */
175static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) 448static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
176{ 449{
177 char buffer[XDR_ADDRBUF_LEN + 1]; 450 return encode_nsm_string(xdr, argp->mon_name);
178 char *name = argp->mon_name;
179
180 if (!nsm_use_hostnames) {
181 snprintf(buffer, XDR_ADDRBUF_LEN,
182 "%pI4", &argp->addr);
183 name = buffer;
184 }
185
186 return xdr_encode_nsm_string(p, name);
187} 451}
188 452
189/* 453/*
190 * The "my_id" argument specifies the hostname and RPC procedure 454 * The "my_id" argument specifies the hostname and RPC procedure
191 * to be called when the status manager receives notification 455 * to be called when the status manager receives notification
192 * (via the SM_NOTIFY call) that the state of host "mon_name" 456 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
193 * has changed. 457 * has changed.
194 */ 458 */
195static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) 459static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
196{ 460{
197 p = xdr_encode_nsm_string(p, utsname()->nodename); 461 int status;
198 if (!p) 462 __be32 *p;
199 return ERR_PTR(-EIO); 463
200 464 status = encode_nsm_string(xdr, utsname()->nodename);
465 if (unlikely(status != 0))
466 return status;
467 p = xdr_reserve_space(xdr, 3 * sizeof(u32));
468 if (unlikely(p == NULL))
469 return -EIO;
201 *p++ = htonl(argp->prog); 470 *p++ = htonl(argp->prog);
202 *p++ = htonl(argp->vers); 471 *p++ = htonl(argp->vers);
203 *p++ = htonl(argp->proc); 472 *p++ = htonl(argp->proc);
204 473 return 0;
205 return p;
206} 474}
207 475
208/* 476/*
209 * The "mon_id" argument specifies the non-private arguments 477 * The "mon_id" argument specifies the non-private arguments
210 * of an SM_MON or SM_UNMON call. 478 * of an NSMPROC_MON or NSMPROC_UNMON call.
211 */ 479 */
212static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) 480static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
213{ 481{
214 p = xdr_encode_mon_name(p, argp); 482 int status;
215 if (!p)
216 return ERR_PTR(-EIO);
217 483
218 return xdr_encode_my_id(p, argp); 484 status = encode_mon_name(xdr, argp);
485 if (unlikely(status != 0))
486 return status;
487 return encode_my_id(xdr, argp);
219} 488}
220 489
221/* 490/*
222 * The "priv" argument may contain private information required 491 * The "priv" argument may contain private information required
223 * by the SM_MON call. This information will be supplied in the 492 * by the NSMPROC_MON call. This information will be supplied in the
224 * SM_NOTIFY call. 493 * NLMPROC_SM_NOTIFY call.
225 *
226 * Linux provides the raw IP address of the monitored host,
227 * left in network byte order.
228 */ 494 */
229static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) 495static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
230{ 496{
231 *p++ = argp->addr; 497 __be32 *p;
232 *p++ = 0;
233 *p++ = 0;
234 *p++ = 0;
235 498
236 return p; 499 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
500 if (unlikely(p == NULL))
501 return -EIO;
502 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
503 return 0;
237} 504}
238 505
239static int 506static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
240xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 507 const struct nsm_args *argp)
241{ 508{
242 p = xdr_encode_mon_id(p, argp); 509 struct xdr_stream xdr;
243 if (IS_ERR(p)) 510 int status;
244 return PTR_ERR(p); 511
245 512 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
246 p = xdr_encode_priv(p, argp); 513 status = encode_mon_id(&xdr, argp);
247 if (IS_ERR(p)) 514 if (unlikely(status))
248 return PTR_ERR(p); 515 return status;
249 516 return encode_priv(&xdr, argp);
250 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
251 return 0;
252} 517}
253 518
254static int 519static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
255xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 520 const struct nsm_args *argp)
256{ 521{
257 p = xdr_encode_mon_id(p, argp); 522 struct xdr_stream xdr;
258 if (IS_ERR(p)) 523
259 return PTR_ERR(p); 524 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
260 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); 525 return encode_mon_id(&xdr, argp);
261 return 0;
262} 526}
263 527
264static int 528static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
265xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 529 struct nsm_res *resp)
266{ 530{
531 struct xdr_stream xdr;
532
533 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
534 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
535 if (unlikely(p == NULL))
536 return -EIO;
267 resp->status = ntohl(*p++); 537 resp->status = ntohl(*p++);
268 resp->state = ntohl(*p++); 538 resp->state = ntohl(*p);
269 dprintk("nsm: xdr_decode_stat_res status %d state %d\n", 539
540 dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
270 resp->status, resp->state); 541 resp->status, resp->state);
271 return 0; 542 return 0;
272} 543}
273 544
274static int 545static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
275xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 546 struct nsm_res *resp)
276{ 547{
277 resp->state = ntohl(*p++); 548 struct xdr_stream xdr;
549
550 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
551 p = xdr_inline_decode(&xdr, sizeof(u32));
552 if (unlikely(p == NULL))
553 return -EIO;
554 resp->state = ntohl(*p);
555
556 dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
278 return 0; 557 return 0;
279} 558}
280 559
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
288#define SM_unmonres_sz 1 567#define SM_unmonres_sz 1
289 568
290static struct rpc_procinfo nsm_procedures[] = { 569static struct rpc_procinfo nsm_procedures[] = {
291[SM_MON] = { 570[NSMPROC_MON] = {
292 .p_proc = SM_MON, 571 .p_proc = NSMPROC_MON,
293 .p_encode = (kxdrproc_t) xdr_encode_mon, 572 .p_encode = (kxdrproc_t)xdr_enc_mon,
294 .p_decode = (kxdrproc_t) xdr_decode_stat_res, 573 .p_decode = (kxdrproc_t)xdr_dec_stat_res,
295 .p_arglen = SM_mon_sz, 574 .p_arglen = SM_mon_sz,
296 .p_replen = SM_monres_sz, 575 .p_replen = SM_monres_sz,
297 .p_statidx = SM_MON, 576 .p_statidx = NSMPROC_MON,
298 .p_name = "MONITOR", 577 .p_name = "MONITOR",
299 }, 578 },
300[SM_UNMON] = { 579[NSMPROC_UNMON] = {
301 .p_proc = SM_UNMON, 580 .p_proc = NSMPROC_UNMON,
302 .p_encode = (kxdrproc_t) xdr_encode_unmon, 581 .p_encode = (kxdrproc_t)xdr_enc_unmon,
303 .p_decode = (kxdrproc_t) xdr_decode_stat, 582 .p_decode = (kxdrproc_t)xdr_dec_stat,
304 .p_arglen = SM_mon_id_sz, 583 .p_arglen = SM_mon_id_sz,
305 .p_replen = SM_unmonres_sz, 584 .p_replen = SM_unmonres_sz,
306 .p_statidx = SM_UNMON, 585 .p_statidx = NSMPROC_UNMON,
307 .p_name = "UNMONITOR", 586 .p_name = "UNMONITOR",
308 }, 587 },
309}; 588};
@@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats;
322 601
323static struct rpc_program nsm_program = { 602static struct rpc_program nsm_program = {
324 .name = "statd", 603 .name = "statd",
325 .number = SM_PROGRAM, 604 .number = NSM_PROGRAM,
326 .nrvers = ARRAY_SIZE(nsm_version), 605 .nrvers = ARRAY_SIZE(nsm_version),
327 .version = nsm_version, 606 .version = nsm_version,
328 .stats = &nsm_stats 607 .stats = &nsm_stats
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 252d80163d02..64f1c31b5853 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
35#include <linux/sunrpc/svcsock.h> 35#include <linux/sunrpc/svcsock.h>
36#include <net/ip.h> 36#include <net/ip.h>
37#include <linux/lockd/lockd.h> 37#include <linux/lockd/lockd.h>
38#include <linux/lockd/sm_inter.h>
39#include <linux/nfs.h> 38#include <linux/nfs.h>
40 39
41#define NLMDBG_FACILITY NLMDBG_SVC 40#define NLMDBG_FACILITY NLMDBG_SVC
@@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst;
54unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
55 54
56/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
65
66/*
57 * These can be set at insmod time (useful for NFS as root filesystem), 67 * These can be set at insmod time (useful for NFS as root filesystem),
58 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
59 */ 69 */
60static unsigned long nlm_grace_period; 70static unsigned long nlm_grace_period;
61static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; 71static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
62static int nlm_udpport, nlm_tcpport; 72static int nlm_udpport, nlm_tcpport;
63int nsm_use_hostnames = 0; 73
74/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
75static unsigned int nlm_max_connections = 1024;
64 76
65/* 77/*
66 * Constants needed for the sysctl interface. 78 * Constants needed for the sysctl interface.
@@ -143,6 +155,9 @@ lockd(void *vrqstp)
143 long timeout = MAX_SCHEDULE_TIMEOUT; 155 long timeout = MAX_SCHEDULE_TIMEOUT;
144 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 156 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
145 157
158 /* update sv_maxconn if it has changed */
159 rqstp->rq_server->sv_maxconn = nlm_max_connections;
160
146 if (signalled()) { 161 if (signalled()) {
147 flush_signals(current); 162 flush_signals(current);
148 if (nlmsvc_ops) { 163 if (nlmsvc_ops) {
@@ -189,6 +204,19 @@ lockd(void *vrqstp)
189 return 0; 204 return 0;
190} 205}
191 206
207static int create_lockd_listener(struct svc_serv *serv, char *name,
208 unsigned short port)
209{
210 struct svc_xprt *xprt;
211
212 xprt = svc_find_xprt(serv, name, 0, 0);
213 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
215
216 svc_xprt_put(xprt);
217 return 0;
218}
219
192/* 220/*
193 * Ensure there are active UDP and TCP listeners for lockd. 221 * Ensure there are active UDP and TCP listeners for lockd.
194 * 222 *
@@ -202,29 +230,23 @@ lockd(void *vrqstp)
202static int make_socks(struct svc_serv *serv) 230static int make_socks(struct svc_serv *serv)
203{ 231{
204 static int warned; 232 static int warned;
205 struct svc_xprt *xprt; 233 int err;
206 int err = 0;
207 234
208 xprt = svc_find_xprt(serv, "udp", 0, 0); 235 err = create_lockd_listener(serv, "udp", nlm_udpport);
209 if (!xprt) 236 if (err < 0)
210 err = svc_create_xprt(serv, "udp", nlm_udpport, 237 goto out_err;
211 SVC_SOCK_DEFAULTS); 238
212 else 239 err = create_lockd_listener(serv, "tcp", nlm_tcpport);
213 svc_xprt_put(xprt); 240 if (err < 0)
214 if (err >= 0) { 241 goto out_err;
215 xprt = svc_find_xprt(serv, "tcp", 0, 0); 242
216 if (!xprt) 243 warned = 0;
217 err = svc_create_xprt(serv, "tcp", nlm_tcpport, 244 return 0;
218 SVC_SOCK_DEFAULTS); 245
219 else 246out_err:
220 svc_xprt_put(xprt); 247 if (warned++ == 0)
221 }
222 if (err >= 0) {
223 warned = 0;
224 err = 0;
225 } else if (warned++ == 0)
226 printk(KERN_WARNING 248 printk(KERN_WARNING
227 "lockd_up: makesock failed, error=%d\n", err); 249 "lockd_up: makesock failed, error=%d\n", err);
228 return err; 250 return err;
229} 251}
230 252
@@ -252,7 +274,7 @@ int lockd_up(void)
252 "lockd_up: no pid, %d users??\n", nlmsvc_users); 274 "lockd_up: no pid, %d users??\n", nlmsvc_users);
253 275
254 error = -ENOMEM; 276 error = -ENOMEM;
255 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); 277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
256 if (!serv) { 278 if (!serv) {
257 printk(KERN_WARNING "lockd_up: create service failed\n"); 279 printk(KERN_WARNING "lockd_up: create service failed\n");
258 goto out; 280 goto out;
@@ -276,6 +298,7 @@ int lockd_up(void)
276 } 298 }
277 299
278 svc_sock_update_bufs(serv); 300 svc_sock_update_bufs(serv);
301 serv->sv_maxconn = nlm_max_connections;
279 302
280 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); 303 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
281 if (IS_ERR(nlmsvc_task)) { 304 if (IS_ERR(nlmsvc_task)) {
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
485module_param_call(nlm_tcpport, param_set_port, param_get_int, 508module_param_call(nlm_tcpport, param_set_port, param_get_int,
486 &nlm_tcpport, 0644); 509 &nlm_tcpport, 0644);
487module_param(nsm_use_hostnames, bool, 0644); 510module_param(nsm_use_hostnames, bool, 0644);
511module_param(nlm_max_connections, uint, 0644);
488 512
489/* 513/*
490 * Initialising and terminating the module. 514 * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf68..1725037374c5 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -419,8 +417,6 @@ static __be32
419nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 417nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
420 void *resp) 418 void *resp)
421{ 419{
422 struct sockaddr_in saddr;
423
424 dprintk("lockd: SM_NOTIFY called\n"); 420 dprintk("lockd: SM_NOTIFY called\n");
425 421
426 if (!nlm_privileged_requester(rqstp)) { 422 if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
430 return rpc_system_err; 426 return rpc_system_err;
431 } 427 }
432 428
433 /* Obtain the host pointer for this NFS server and try to 429 nlm_host_rebooted(argp);
434 * reclaim all locks we hold on this server.
435 */
436 memset(&saddr, 0, sizeof(saddr));
437 saddr.sin_family = AF_INET;
438 saddr.sin_addr.s_addr = argp->addr;
439 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
440
441 return rpc_success; 430 return rpc_success;
442} 431}
443 432
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a9381..3688e55901fc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -451,8 +449,6 @@ static __be32
451nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 449nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
452 void *resp) 450 void *resp)
453{ 451{
454 struct sockaddr_in saddr;
455
456 dprintk("lockd: SM_NOTIFY called\n"); 452 dprintk("lockd: SM_NOTIFY called\n");
457 453
458 if (!nlm_privileged_requester(rqstp)) { 454 if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
462 return rpc_system_err; 458 return rpc_system_err;
463 } 459 }
464 460
465 /* Obtain the host pointer for this NFS server and try to 461 nlm_host_rebooted(argp);
466 * reclaim all locks we hold on this server.
467 */
468 memset(&saddr, 0, sizeof(saddr));
469 saddr.sin_family = AF_INET;
470 saddr.sin_addr.s_addr = argp->addr;
471 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
472
473 return rpc_success; 462 return rpc_success;
474} 463}
475 464
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c7..9e4d6aab611b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
17#include <linux/nfsd/export.h> 17#include <linux/nfsd/export.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 19#include <linux/lockd/share.h>
20#include <linux/lockd/sm_inter.h>
21#include <linux/module.h> 20#include <linux/module.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23 22
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67c..0336f2beacde 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/sunrpc/stats.h> 17#include <linux/sunrpc/stats.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_XDR 20#define NLMDBG_FACILITY NLMDBG_XDR
22 21
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 348 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
350 return 0; 349 return 0;
351 argp->state = ntohl(*p++); 350 argp->state = ntohl(*p++);
352 /* Preserve the address in network byte order */ 351 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
353 argp->addr = *p++; 352 p += XDR_QUADLEN(SM_PRIV_SIZE);
354 return xdr_argsize_check(rqstp, p); 353 return xdr_argsize_check(rqstp, p);
355} 354}
356 355
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8e..e1d528653192 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/sunrpc/stats.h> 18#include <linux/sunrpc/stats.h>
19#include <linux/lockd/lockd.h> 19#include <linux/lockd/lockd.h>
20#include <linux/lockd/sm_inter.h>
21 20
22#define NLMDBG_FACILITY NLMDBG_XDR 21#define NLMDBG_FACILITY NLMDBG_XDR
23 22
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
356 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
357 return 0; 356 return 0;
358 argp->state = ntohl(*p++); 357 argp->state = ntohl(*p++);
359 /* Preserve the address in network byte order */ 358 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
360 argp->addr = *p++; 359 p += XDR_QUADLEN(SM_PRIV_SIZE);
361 return xdr_argsize_check(rqstp, p); 360 return xdr_argsize_check(rqstp, p);
362} 361}
363 362
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a38..d4946c4c90e2 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
280 return -EINVAL; 280 return -EINVAL;
281 281
282got_it: 282got_it:
283 pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); 283 pos = page_offset(page) + p - (char *)page_address(page);
284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, 284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
286 if (err) 286 if (err)
diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3facc..16c3ef37eae3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
241 first_hole = page_block; 241 first_hole = page_block;
242 page_block++; 242 page_block++;
243 block_in_file++; 243 block_in_file++;
244 clear_buffer_mapped(map_bh);
245 continue; 244 continue;
246 } 245 }
247 246
@@ -308,7 +307,10 @@ alloc_new:
308 goto alloc_new; 307 goto alloc_new;
309 } 308 }
310 309
311 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) 310 relative_block = block_in_file - *first_logical_block;
311 nblocks = map_bh->b_size >> blkbits;
312 if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
313 (first_hole != blocks_per_page))
312 bio = mpage_bio_submit(READ, bio); 314 bio = mpage_bio_submit(READ, bio);
313 else 315 else
314 *last_block_in_bio = blocks[blocks_per_page - 1]; 316 *last_block_in_bio = blocks[blocks_per_page - 1];
diff --git a/fs/namei.c b/fs/namei.c
index df2d3df4f049..f05bed242422 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -257,7 +257,7 @@ int inode_permission(struct inode *inode, int mask)
257 return -EACCES; 257 return -EACCES;
258 } 258 }
259 259
260 if (inode->i_op && inode->i_op->permission) 260 if (inode->i_op->permission)
261 retval = inode->i_op->permission(inode, mask); 261 retval = inode->i_op->permission(inode, mask);
262 else 262 else
263 retval = generic_permission(inode, mask, NULL); 263 retval = generic_permission(inode, mask, NULL);
@@ -432,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
432{ 432{
433 umode_t mode = inode->i_mode; 433 umode_t mode = inode->i_mode;
434 434
435 if (inode->i_op && inode->i_op->permission) 435 if (inode->i_op->permission)
436 return -EAGAIN; 436 return -EAGAIN;
437 437
438 if (current_fsuid() == inode->i_uid) 438 if (current_fsuid() == inode->i_uid)
@@ -908,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
908 inode = next.dentry->d_inode; 908 inode = next.dentry->d_inode;
909 if (!inode) 909 if (!inode)
910 goto out_dput; 910 goto out_dput;
911 err = -ENOTDIR;
912 if (!inode->i_op)
913 goto out_dput;
914 911
915 if (inode->i_op->follow_link) { 912 if (inode->i_op->follow_link) {
916 err = do_follow_link(&next, nd); 913 err = do_follow_link(&next, nd);
@@ -920,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
920 inode = nd->path.dentry->d_inode; 917 inode = nd->path.dentry->d_inode;
921 if (!inode) 918 if (!inode)
922 break; 919 break;
923 err = -ENOTDIR;
924 if (!inode->i_op)
925 break;
926 } else 920 } else
927 path_to_nameidata(&next, nd); 921 path_to_nameidata(&next, nd);
928 err = -ENOTDIR; 922 err = -ENOTDIR;
@@ -961,7 +955,7 @@ last_component:
961 break; 955 break;
962 inode = next.dentry->d_inode; 956 inode = next.dentry->d_inode;
963 if ((lookup_flags & LOOKUP_FOLLOW) 957 if ((lookup_flags & LOOKUP_FOLLOW)
964 && inode && inode->i_op && inode->i_op->follow_link) { 958 && inode && inode->i_op->follow_link) {
965 err = do_follow_link(&next, nd); 959 err = do_follow_link(&next, nd);
966 if (err) 960 if (err)
967 goto return_err; 961 goto return_err;
@@ -973,7 +967,7 @@ last_component:
973 break; 967 break;
974 if (lookup_flags & LOOKUP_DIRECTORY) { 968 if (lookup_flags & LOOKUP_DIRECTORY) {
975 err = -ENOTDIR; 969 err = -ENOTDIR;
976 if (!inode->i_op || !inode->i_op->lookup) 970 if (!inode->i_op->lookup)
977 break; 971 break;
978 } 972 }
979 goto return_base; 973 goto return_base;
@@ -1469,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1469 if (error) 1463 if (error)
1470 return error; 1464 return error;
1471 1465
1472 if (!dir->i_op || !dir->i_op->create) 1466 if (!dir->i_op->create)
1473 return -EACCES; /* shouldn't it be ENOSYS? */ 1467 return -EACCES; /* shouldn't it be ENOSYS? */
1474 mode &= S_IALLUGO; 1468 mode &= S_IALLUGO;
1475 mode |= S_IFREG; 1469 mode |= S_IFREG;
@@ -1752,7 +1746,7 @@ do_last:
1752 error = -ENOENT; 1746 error = -ENOENT;
1753 if (!path.dentry->d_inode) 1747 if (!path.dentry->d_inode)
1754 goto exit_dput; 1748 goto exit_dput;
1755 if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) 1749 if (path.dentry->d_inode->i_op->follow_link)
1756 goto do_link; 1750 goto do_link;
1757 1751
1758 path_to_nameidata(&path, &nd); 1752 path_to_nameidata(&path, &nd);
@@ -1933,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1933 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 1927 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1934 return -EPERM; 1928 return -EPERM;
1935 1929
1936 if (!dir->i_op || !dir->i_op->mknod) 1930 if (!dir->i_op->mknod)
1937 return -EPERM; 1931 return -EPERM;
1938 1932
1939 error = devcgroup_inode_mknod(mode, dev); 1933 error = devcgroup_inode_mknod(mode, dev);
@@ -2035,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2035 if (error) 2029 if (error)
2036 return error; 2030 return error;
2037 2031
2038 if (!dir->i_op || !dir->i_op->mkdir) 2032 if (!dir->i_op->mkdir)
2039 return -EPERM; 2033 return -EPERM;
2040 2034
2041 mode &= (S_IRWXUGO|S_ISVTX); 2035 mode &= (S_IRWXUGO|S_ISVTX);
@@ -2126,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2126 if (error) 2120 if (error)
2127 return error; 2121 return error;
2128 2122
2129 if (!dir->i_op || !dir->i_op->rmdir) 2123 if (!dir->i_op->rmdir)
2130 return -EPERM; 2124 return -EPERM;
2131 2125
2132 DQUOT_INIT(dir); 2126 DQUOT_INIT(dir);
@@ -2213,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2213 if (error) 2207 if (error)
2214 return error; 2208 return error;
2215 2209
2216 if (!dir->i_op || !dir->i_op->unlink) 2210 if (!dir->i_op->unlink)
2217 return -EPERM; 2211 return -EPERM;
2218 2212
2219 DQUOT_INIT(dir); 2213 DQUOT_INIT(dir);
@@ -2320,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2320 if (error) 2314 if (error)
2321 return error; 2315 return error;
2322 2316
2323 if (!dir->i_op || !dir->i_op->symlink) 2317 if (!dir->i_op->symlink)
2324 return -EPERM; 2318 return -EPERM;
2325 2319
2326 error = security_inode_symlink(dir, dentry, oldname); 2320 error = security_inode_symlink(dir, dentry, oldname);
@@ -2401,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2401 */ 2395 */
2402 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2396 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2403 return -EPERM; 2397 return -EPERM;
2404 if (!dir->i_op || !dir->i_op->link) 2398 if (!dir->i_op->link)
2405 return -EPERM; 2399 return -EPERM;
2406 if (S_ISDIR(inode->i_mode)) 2400 if (S_ISDIR(inode->i_mode))
2407 return -EPERM; 2401 return -EPERM;
@@ -2608,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2608 if (error) 2602 if (error)
2609 return error; 2603 return error;
2610 2604
2611 if (!old_dir->i_op || !old_dir->i_op->rename) 2605 if (!old_dir->i_op->rename)
2612 return -EPERM; 2606 return -EPERM;
2613 2607
2614 DQUOT_INIT(old_dir); 2608 DQUOT_INIT(old_dir);
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf9..0af3349de851 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
16 * @opts: an array of &struct option entries controlling parser operations 16 * @opts: an array of &struct option entries controlling parser operations
17 * @optopt: output; will contain the current option 17 * @optopt: output; will contain the current option
18 * @optarg: output; will contain the value (if one exists) 18 * @optarg: output; will contain the value (if one exists)
19 * @flag: output; may be NULL; should point to a long for or'ing flags
20 * @value: output; may be NULL; will be overwritten with the integer value 19 * @value: output; may be NULL; will be overwritten with the integer value
21 * of the current argument. 20 * of the current argument.
22 * 21 *
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74e..f54360f50a9c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
98{ 98{
99 s32 auth_type; 99 s32 auth_type;
100 u32 object_name_len; 100 u32 object_name_len;
101 compat_caddr_t object_name; /* an userspace data, in most cases user name */ 101 compat_caddr_t object_name; /* a userspace data, in most cases user name */
102}; 102};
103 103
104struct compat_ncp_fs_info_v2 { 104struct compat_ncp_fs_info_v2 {
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514c..c903e04aa217 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
76 76
77 ret = set_groups(new, gi); 77 ret = set_groups(new, gi);
78 put_group_info(gi); 78 put_group_info(gi);
79 if (!ret) 79 if (ret < 0)
80 goto error; 80 goto error;
81 81
82 if (new->uid) 82 if (new->fsuid)
83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
84 else 84 else
85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6d7d8c02c197..c464181b5994 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
53#define NFSPROC4_CB_NULL 0 53#define NFSPROC4_CB_NULL 0
54#define NFSPROC4_CB_COMPOUND 1 54#define NFSPROC4_CB_COMPOUND 1
55 55
56/* declarations */
57static const struct rpc_call_ops nfs4_cb_null_ops;
58
59/* Index of predefined Linux callback client operations */ 56/* Index of predefined Linux callback client operations */
60 57
61enum { 58enum {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291ae..9fa60a3ad48c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
946 nfsd4_encode_operation(resp, op); 946 nfsd4_encode_operation(resp, op);
947 status = op->status; 947 status = op->status;
948 } 948 }
949
950 dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
951 args->ops, args->opcnt, resp->opcnt, op->opnum,
952 be32_to_cpu(status));
953
949 if (cstate->replay_owner) { 954 if (cstate->replay_owner) {
950 nfs4_put_stateowner(cstate->replay_owner); 955 nfs4_put_stateowner(cstate->replay_owner);
951 cstate->replay_owner = NULL; 956 cstate->replay_owner = NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62b..74f7b67567fd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
116 116
117 md5_to_hex(dname, cksum.data); 117 md5_to_hex(dname, cksum.data);
118 118
119 kfree(cksum.data);
120 status = nfs_ok; 119 status = nfs_ok;
121out: 120out:
121 kfree(cksum.data);
122 crypto_free_hash(desc.tfm); 122 crypto_free_hash(desc.tfm);
123out_no_tfm: 123out_no_tfm:
124 return status; 124 return status;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 13e0e074dbb8..88db7d3ec120 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2416,6 +2416,26 @@ out:
2416#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) 2416#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
2417#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) 2417#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
2418 2418
2419static inline u64
2420end_offset(u64 start, u64 len)
2421{
2422 u64 end;
2423
2424 end = start + len;
2425 return end >= start ? end: NFS4_MAX_UINT64;
2426}
2427
2428/* last octet in a range */
2429static inline u64
2430last_byte_offset(u64 start, u64 len)
2431{
2432 u64 end;
2433
2434 BUG_ON(!len);
2435 end = start + len;
2436 return end > start ? end - 1: NFS4_MAX_UINT64;
2437}
2438
2419#define lockownerid_hashval(id) \ 2439#define lockownerid_hashval(id) \
2420 ((id) & LOCK_HASH_MASK) 2440 ((id) & LOCK_HASH_MASK)
2421 2441
@@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
2435static struct nfs4_stateid * 2455static struct nfs4_stateid *
2436find_stateid(stateid_t *stid, int flags) 2456find_stateid(stateid_t *stid, int flags)
2437{ 2457{
2438 struct nfs4_stateid *local = NULL; 2458 struct nfs4_stateid *local;
2439 u32 st_id = stid->si_stateownerid; 2459 u32 st_id = stid->si_stateownerid;
2440 u32 f_id = stid->si_fileid; 2460 u32 f_id = stid->si_fileid;
2441 unsigned int hashval; 2461 unsigned int hashval;
2442 2462
2443 dprintk("NFSD: find_stateid flags 0x%x\n",flags); 2463 dprintk("NFSD: find_stateid flags 0x%x\n",flags);
2444 if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2464 if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
2445 hashval = stateid_hashval(st_id, f_id); 2465 hashval = stateid_hashval(st_id, f_id);
2446 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { 2466 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
2447 if ((local->st_stateid.si_stateownerid == st_id) && 2467 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags)
2449 return local; 2469 return local;
2450 } 2470 }
2451 } 2471 }
2452 if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2472
2473 if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
2453 hashval = stateid_hashval(st_id, f_id); 2474 hashval = stateid_hashval(st_id, f_id);
2454 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { 2475 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
2455 if ((local->st_stateid.si_stateownerid == st_id) && 2476 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
2518 deny->ld_clientid.cl_id = 0; 2539 deny->ld_clientid.cl_id = 0;
2519 } 2540 }
2520 deny->ld_start = fl->fl_start; 2541 deny->ld_start = fl->fl_start;
2521 deny->ld_length = ~(u64)0; 2542 deny->ld_length = NFS4_MAX_UINT64;
2522 if (fl->fl_end != ~(u64)0) 2543 if (fl->fl_end != NFS4_MAX_UINT64)
2523 deny->ld_length = fl->fl_end - fl->fl_start + 1; 2544 deny->ld_length = fl->fl_end - fl->fl_start + 1;
2524 deny->ld_type = NFS4_READ_LT; 2545 deny->ld_type = NFS4_READ_LT;
2525 if (fl->fl_type != F_RDLCK) 2546 if (fl->fl_type != F_RDLCK)
@@ -2616,7 +2637,7 @@ out:
2616static int 2637static int
2617check_lock_length(u64 offset, u64 length) 2638check_lock_length(u64 offset, u64 length)
2618{ 2639{
2619 return ((length == 0) || ((length != ~(u64)0) && 2640 return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
2620 LOFF_OVERFLOW(offset, length))); 2641 LOFF_OVERFLOW(offset, length)));
2621} 2642}
2622 2643
@@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2736 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2757 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2737 2758
2738 file_lock.fl_start = lock->lk_offset; 2759 file_lock.fl_start = lock->lk_offset;
2739 if ((lock->lk_length == ~(u64)0) || 2760 file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
2740 LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
2741 file_lock.fl_end = ~(u64)0;
2742 else
2743 file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
2744 nfs4_transform_lock_offset(&file_lock); 2761 nfs4_transform_lock_offset(&file_lock);
2745 2762
2746 /* 2763 /*
@@ -2781,6 +2798,25 @@ out:
2781} 2798}
2782 2799
2783/* 2800/*
2801 * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
2802 * so we do a temporary open here just to get an open file to pass to
2803 * vfs_test_lock. (Arguably perhaps test_lock should be done with an
2804 * inode operation.)
2805 */
2806static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
2807{
2808 struct file *file;
2809 int err;
2810
2811 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
2812 if (err)
2813 return err;
2814 err = vfs_test_lock(file, lock);
2815 nfsd_close(file);
2816 return err;
2817}
2818
2819/*
2784 * LOCKT operation 2820 * LOCKT operation
2785 */ 2821 */
2786__be32 2822__be32
@@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 struct nfsd4_lockt *lockt) 2824 struct nfsd4_lockt *lockt)
2789{ 2825{
2790 struct inode *inode; 2826 struct inode *inode;
2791 struct file file;
2792 struct file_lock file_lock; 2827 struct file_lock file_lock;
2793 int error; 2828 int error;
2794 __be32 status; 2829 __be32 status;
@@ -2839,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2839 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2874 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2840 2875
2841 file_lock.fl_start = lockt->lt_offset; 2876 file_lock.fl_start = lockt->lt_offset;
2842 if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) 2877 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
2843 file_lock.fl_end = ~(u64)0;
2844 else
2845 file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
2846 2878
2847 nfs4_transform_lock_offset(&file_lock); 2879 nfs4_transform_lock_offset(&file_lock);
2848 2880
2849 /* vfs_test_lock uses the struct file _only_ to resolve the inode.
2850 * since LOCKT doesn't require an OPEN, and therefore a struct
2851 * file may not exist, pass vfs_test_lock a struct file with
2852 * only the dentry:inode set.
2853 */
2854 memset(&file, 0, sizeof (struct file));
2855 file.f_path.dentry = cstate->current_fh.fh_dentry;
2856
2857 status = nfs_ok; 2881 status = nfs_ok;
2858 error = vfs_test_lock(&file, &file_lock); 2882 error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
2859 if (error) { 2883 if (error) {
2860 status = nfserrno(error); 2884 status = nfserrno(error);
2861 goto out; 2885 goto out;
@@ -2906,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2906 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2930 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2907 file_lock.fl_start = locku->lu_offset; 2931 file_lock.fl_start = locku->lu_offset;
2908 2932
2909 if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) 2933 file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
2910 file_lock.fl_end = ~(u64)0;
2911 else
2912 file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
2913 nfs4_transform_lock_offset(&file_lock); 2934 nfs4_transform_lock_offset(&file_lock);
2914 2935
2915 /* 2936 /*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b76843..f65953be39c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfs/nfs4xdr.c
3 *
4 * Server-side XDR for NFSv4 2 * Server-side XDR for NFSv4
5 * 3 *
6 * Copyright (c) 2002 The Regents of the University of Michigan. 4 * Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a6..3d93b2064ce5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
84static ssize_t write_getfd(struct file *file, char *buf, size_t size); 84static ssize_t write_getfd(struct file *file, char *buf, size_t size);
85static ssize_t write_getfs(struct file *file, char *buf, size_t size); 85static ssize_t write_getfs(struct file *file, char *buf, size_t size);
86static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 86static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
87static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
88static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
87static ssize_t write_threads(struct file *file, char *buf, size_t size); 89static ssize_t write_threads(struct file *file, char *buf, size_t size);
88static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); 90static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
89static ssize_t write_versions(struct file *file, char *buf, size_t size); 91static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
94static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 96static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
95#endif 97#endif
96 98
97static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
98static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
99
100static ssize_t (*write_op[])(struct file *, char *, size_t) = { 99static ssize_t (*write_op[])(struct file *, char *, size_t) = {
101 [NFSD_Svc] = write_svc, 100 [NFSD_Svc] = write_svc,
102 [NFSD_Add] = write_add, 101 [NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
106 [NFSD_Getfd] = write_getfd, 105 [NFSD_Getfd] = write_getfd,
107 [NFSD_Getfs] = write_getfs, 106 [NFSD_Getfs] = write_getfs,
108 [NFSD_Fh] = write_filehandle, 107 [NFSD_Fh] = write_filehandle,
109 [NFSD_FO_UnlockIP] = failover_unlock_ip, 108 [NFSD_FO_UnlockIP] = write_unlock_ip,
110 [NFSD_FO_UnlockFS] = failover_unlock_fs, 109 [NFSD_FO_UnlockFS] = write_unlock_fs,
111 [NFSD_Threads] = write_threads, 110 [NFSD_Threads] = write_threads,
112 [NFSD_Pool_Threads] = write_pool_threads, 111 [NFSD_Pool_Threads] = write_pool_threads,
113 [NFSD_Versions] = write_versions, 112 [NFSD_Versions] = write_versions,
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = {
176/*----------------------------------------------------------------------------*/ 175/*----------------------------------------------------------------------------*/
177/* 176/*
178 * payload - write methods 177 * payload - write methods
179 * If the method has a response, the response should be put in buf,
180 * and the length returned. Otherwise return 0 or and -error.
181 */ 178 */
182 179
180/**
181 * write_svc - Start kernel's NFSD server
182 *
183 * Deprecated. /proc/fs/nfsd/threads is preferred.
184 * Function remains to support old versions of nfs-utils.
185 *
186 * Input:
187 * buf: struct nfsctl_svc
188 * svc_port: port number of this
189 * server's listener
190 * svc_nthreads: number of threads to start
191 * size: size in bytes of passed in nfsctl_svc
192 * Output:
193 * On success: returns zero
194 * On error: return code is negative errno value
195 */
183static ssize_t write_svc(struct file *file, char *buf, size_t size) 196static ssize_t write_svc(struct file *file, char *buf, size_t size)
184{ 197{
185 struct nfsctl_svc *data; 198 struct nfsctl_svc *data;
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
189 return nfsd_svc(data->svc_port, data->svc_nthreads); 202 return nfsd_svc(data->svc_port, data->svc_nthreads);
190} 203}
191 204
205/**
206 * write_add - Add or modify client entry in auth unix cache
207 *
208 * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
209 * Function remains to support old versions of nfs-utils.
210 *
211 * Input:
212 * buf: struct nfsctl_client
213 * cl_ident: '\0'-terminated C string
214 * containing domain name
215 * of client
216 * cl_naddr: no. of items in cl_addrlist
217 * cl_addrlist: array of client addresses
218 * cl_fhkeytype: ignored
219 * cl_fhkeylen: ignored
220 * cl_fhkey: ignored
221 * size: size in bytes of passed in nfsctl_client
222 * Output:
223 * On success: returns zero
224 * On error: return code is negative errno value
225 *
226 * Note: Only AF_INET client addresses are passed in, since
227 * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
228 */
192static ssize_t write_add(struct file *file, char *buf, size_t size) 229static ssize_t write_add(struct file *file, char *buf, size_t size)
193{ 230{
194 struct nfsctl_client *data; 231 struct nfsctl_client *data;
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
198 return exp_addclient(data); 235 return exp_addclient(data);
199} 236}
200 237
238/**
239 * write_del - Remove client from auth unix cache
240 *
241 * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
242 * Function remains to support old versions of nfs-utils.
243 *
244 * Input:
245 * buf: struct nfsctl_client
246 * cl_ident: '\0'-terminated C string
247 * containing domain name
248 * of client
249 * cl_naddr: ignored
250 * cl_addrlist: ignored
251 * cl_fhkeytype: ignored
252 * cl_fhkeylen: ignored
253 * cl_fhkey: ignored
254 * size: size in bytes of passed in nfsctl_client
255 * Output:
256 * On success: returns zero
257 * On error: return code is negative errno value
258 *
259 * Note: Only AF_INET client addresses are passed in, since
260 * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
261 */
201static ssize_t write_del(struct file *file, char *buf, size_t size) 262static ssize_t write_del(struct file *file, char *buf, size_t size)
202{ 263{
203 struct nfsctl_client *data; 264 struct nfsctl_client *data;
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
207 return exp_delclient(data); 268 return exp_delclient(data);
208} 269}
209 270
271/**
272 * write_export - Export part or all of a local file system
273 *
274 * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
275 * Function remains to support old versions of nfs-utils.
276 *
277 * Input:
278 * buf: struct nfsctl_export
279 * ex_client: '\0'-terminated C string
280 * containing domain name
281 * of client allowed to access
282 * this export
283 * ex_path: '\0'-terminated C string
284 * containing pathname of
285 * directory in local file system
286 * ex_dev: fsid to use for this export
287 * ex_ino: ignored
288 * ex_flags: export flags for this export
289 * ex_anon_uid: UID to use for anonymous
290 * requests
291 * ex_anon_gid: GID to use for anonymous
292 * requests
293 * size: size in bytes of passed in nfsctl_export
294 * Output:
295 * On success: returns zero
296 * On error: return code is negative errno value
297 */
210static ssize_t write_export(struct file *file, char *buf, size_t size) 298static ssize_t write_export(struct file *file, char *buf, size_t size)
211{ 299{
212 struct nfsctl_export *data; 300 struct nfsctl_export *data;
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
216 return exp_export(data); 304 return exp_export(data);
217} 305}
218 306
307/**
308 * write_unexport - Unexport a previously exported file system
309 *
310 * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
311 * Function remains to support old versions of nfs-utils.
312 *
313 * Input:
314 * buf: struct nfsctl_export
315 * ex_client: '\0'-terminated C string
316 * containing domain name
317 * of client no longer allowed
318 * to access this export
319 * ex_path: '\0'-terminated C string
320 * containing pathname of
321 * directory in local file system
322 * ex_dev: ignored
323 * ex_ino: ignored
324 * ex_flags: ignored
325 * ex_anon_uid: ignored
326 * ex_anon_gid: ignored
327 * size: size in bytes of passed in nfsctl_export
328 * Output:
329 * On success: returns zero
330 * On error: return code is negative errno value
331 */
219static ssize_t write_unexport(struct file *file, char *buf, size_t size) 332static ssize_t write_unexport(struct file *file, char *buf, size_t size)
220{ 333{
221 struct nfsctl_export *data; 334 struct nfsctl_export *data;
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
226 return exp_unexport(data); 339 return exp_unexport(data);
227} 340}
228 341
342/**
343 * write_getfs - Get a variable-length NFS file handle by path
344 *
345 * Deprecated. /proc/fs/nfsd/filehandle is preferred.
346 * Function remains to support old versions of nfs-utils.
347 *
348 * Input:
349 * buf: struct nfsctl_fsparm
350 * gd_addr: socket address of client
351 * gd_path: '\0'-terminated C string
352 * containing pathname of
353 * directory in local file system
354 * gd_maxlen: maximum size of returned file
355 * handle
356 * size: size in bytes of passed in nfsctl_fsparm
357 * Output:
358 * On success: passed-in buffer filled with a knfsd_fh structure
359 * (a variable-length raw NFS file handle);
360 * return code is the size in bytes of the file handle
361 * On error: return code is negative errno value
362 *
363 * Note: Only AF_INET client addresses are passed in, since gd_addr
364 * is the same size as a struct sockaddr_in.
365 */
229static ssize_t write_getfs(struct file *file, char *buf, size_t size) 366static ssize_t write_getfs(struct file *file, char *buf, size_t size)
230{ 367{
231 struct nfsctl_fsparm *data; 368 struct nfsctl_fsparm *data;
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
265 return err; 402 return err;
266} 403}
267 404
405/**
406 * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
407 *
408 * Deprecated. /proc/fs/nfsd/filehandle is preferred.
409 * Function remains to support old versions of nfs-utils.
410 *
411 * Input:
412 * buf: struct nfsctl_fdparm
413 * gd_addr: socket address of client
414 * gd_path: '\0'-terminated C string
415 * containing pathname of
416 * directory in local file system
417 * gd_version: fdparm structure version
418 * size: size in bytes of passed in nfsctl_fdparm
419 * Output:
420 * On success: passed-in buffer filled with nfsctl_res
421 * (a fixed-length raw NFS file handle);
422 * return code is the size in bytes of the file handle
423 * On error: return code is negative errno value
424 *
425 * Note: Only AF_INET client addresses are passed in, since gd_addr
426 * is the same size as a struct sockaddr_in.
427 */
268static ssize_t write_getfd(struct file *file, char *buf, size_t size) 428static ssize_t write_getfd(struct file *file, char *buf, size_t size)
269{ 429{
270 struct nfsctl_fdparm *data; 430 struct nfsctl_fdparm *data;
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
309 return err; 469 return err;
310} 470}
311 471
312static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) 472/**
473 * write_unlock_ip - Release all locks used by a client
474 *
475 * Experimental.
476 *
477 * Input:
478 * buf: '\n'-terminated C string containing a
479 * presentation format IPv4 address
480 * size: length of C string in @buf
481 * Output:
482 * On success: returns zero if all specified locks were released;
483 * returns one if one or more locks were not released
484 * On error: return code is negative errno value
485 *
486 * Note: Only AF_INET client addresses are passed in
487 */
488static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
313{ 489{
314 struct sockaddr_in sin = { 490 struct sockaddr_in sin = {
315 .sin_family = AF_INET, 491 .sin_family = AF_INET,
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
339 return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); 515 return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
340} 516}
341 517
342static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) 518/**
519 * write_unlock_fs - Release all locks on a local file system
520 *
521 * Experimental.
522 *
523 * Input:
524 * buf: '\n'-terminated C string containing the
525 * absolute pathname of a local file system
526 * size: length of C string in @buf
527 * Output:
528 * On success: returns zero if all specified locks were released;
529 * returns one if one or more locks were not released
530 * On error: return code is negative errno value
531 */
532static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
343{ 533{
344 struct path path; 534 struct path path;
345 char *fo_path; 535 char *fo_path;
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
360 if (error) 550 if (error)
361 return error; 551 return error;
362 552
553 /*
554 * XXX: Needs better sanity checking. Otherwise we could end up
555 * releasing locks on the wrong file system.
556 *
557 * For example:
558 * 1. Does the path refer to a directory?
559 * 2. Is that directory a mount point, or
560 * 3. Is that directory the root of an exported file system?
561 */
363 error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); 562 error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
364 563
365 path_put(&path); 564 path_put(&path);
366 return error; 565 return error;
367} 566}
368 567
568/**
569 * write_filehandle - Get a variable-length NFS file handle by path
570 *
571 * On input, the buffer contains a '\n'-terminated C string comprised of
572 * three alphanumeric words separated by whitespace. The string may
573 * contain escape sequences.
574 *
575 * Input:
576 * buf:
577 * domain: client domain name
578 * path: export pathname
579 * maxsize: numeric maximum size of
580 * @buf
581 * size: length of C string in @buf
582 * Output:
583 * On success: passed-in buffer filled with '\n'-terminated C
584 * string containing a ASCII hex text version
585 * of the NFS file handle;
586 * return code is the size in bytes of the string
587 * On error: return code is negative errno value
588 */
369static ssize_t write_filehandle(struct file *file, char *buf, size_t size) 589static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
370{ 590{
371 /* request is:
372 * domain path maxsize
373 * response is
374 * filehandle
375 *
376 * qword quoting is used, so filehandle will be \x....
377 */
378 char *dname, *path; 591 char *dname, *path;
379 int uninitialized_var(maxsize); 592 int uninitialized_var(maxsize);
380 char *mesg = buf; 593 char *mesg = buf;
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
391 604
392 dname = mesg; 605 dname = mesg;
393 len = qword_get(&mesg, dname, size); 606 len = qword_get(&mesg, dname, size);
394 if (len <= 0) return -EINVAL; 607 if (len <= 0)
608 return -EINVAL;
395 609
396 path = dname+len+1; 610 path = dname+len+1;
397 len = qword_get(&mesg, path, size); 611 len = qword_get(&mesg, path, size);
398 if (len <= 0) return -EINVAL; 612 if (len <= 0)
613 return -EINVAL;
399 614
400 len = get_int(&mesg, &maxsize); 615 len = get_int(&mesg, &maxsize);
401 if (len) 616 if (len)
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
419 if (len) 634 if (len)
420 return len; 635 return len;
421 636
422 mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; 637 mesg = buf;
638 len = SIMPLE_TRANSACTION_LIMIT;
423 qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); 639 qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
424 mesg[-1] = '\n'; 640 mesg[-1] = '\n';
425 return mesg - buf; 641 return mesg - buf;
426} 642}
427 643
644/**
645 * write_threads - Start NFSD, or report the current number of running threads
646 *
647 * Input:
648 * buf: ignored
649 * size: zero
650 * Output:
651 * On success: passed-in buffer filled with '\n'-terminated C
652 * string numeric value representing the number of
653 * running NFSD threads;
654 * return code is the size in bytes of the string
655 * On error: return code is zero
656 *
657 * OR
658 *
659 * Input:
660 * buf: C string containing an unsigned
661 * integer value representing the
662 * number of NFSD threads to start
663 * size: non-zero length of C string in @buf
664 * Output:
665 * On success: NFS service is started;
666 * passed-in buffer filled with '\n'-terminated C
667 * string numeric value representing the number of
668 * running NFSD threads;
669 * return code is the size in bytes of the string
670 * On error: return code is zero or a negative errno value
671 */
428static ssize_t write_threads(struct file *file, char *buf, size_t size) 672static ssize_t write_threads(struct file *file, char *buf, size_t size)
429{ 673{
430 /* if size > 0, look for a number of threads and call nfsd_svc
431 * then write out number of threads as reply
432 */
433 char *mesg = buf; 674 char *mesg = buf;
434 int rv; 675 int rv;
435 if (size > 0) { 676 if (size > 0) {
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
437 rv = get_int(&mesg, &newthreads); 678 rv = get_int(&mesg, &newthreads);
438 if (rv) 679 if (rv)
439 return rv; 680 return rv;
440 if (newthreads <0) 681 if (newthreads < 0)
441 return -EINVAL; 682 return -EINVAL;
442 rv = nfsd_svc(2049, newthreads); 683 rv = nfsd_svc(NFS_PORT, newthreads);
443 if (rv) 684 if (rv)
444 return rv; 685 return rv;
445 } 686 }
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
447 return strlen(buf); 688 return strlen(buf);
448} 689}
449 690
691/**
692 * write_pool_threads - Set or report the current number of threads per pool
693 *
694 * Input:
695 * buf: ignored
696 * size: zero
697 *
698 * OR
699 *
700 * Input:
701 * buf: C string containing whitespace-
702 * separated unsigned integer values
703 * representing the number of NFSD
704 * threads to start in each pool
705 * size: non-zero length of C string in @buf
706 * Output:
707 * On success: passed-in buffer filled with '\n'-terminated C
708 * string containing integer values representing the
709 * number of NFSD threads in each pool;
710 * return code is the size in bytes of the string
711 * On error: return code is zero or a negative errno value
712 */
450static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) 713static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
451{ 714{
452 /* if size > 0, look for an array of number of threads per node 715 /* if size > 0, look for an array of number of threads per node
@@ -517,10 +780,6 @@ out_free:
517 780
518static ssize_t __write_versions(struct file *file, char *buf, size_t size) 781static ssize_t __write_versions(struct file *file, char *buf, size_t size)
519{ 782{
520 /*
521 * Format:
522 * [-/+]vers [-/+]vers ...
523 */
524 char *mesg = buf; 783 char *mesg = buf;
525 char *vers, sign; 784 char *vers, sign;
526 int len, num; 785 int len, num;
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
578 return len; 837 return len;
579} 838}
580 839
840/**
841 * write_versions - Set or report the available NFS protocol versions
842 *
843 * Input:
844 * buf: ignored
845 * size: zero
846 * Output:
847 * On success: passed-in buffer filled with '\n'-terminated C
848 * string containing positive or negative integer
849 * values representing the current status of each
850 * protocol version;
851 * return code is the size in bytes of the string
852 * On error: return code is zero or a negative errno value
853 *
854 * OR
855 *
856 * Input:
857 * buf: C string containing whitespace-
858 * separated positive or negative
859 * integer values representing NFS
860 * protocol versions to enable ("+n")
861 * or disable ("-n")
862 * size: non-zero length of C string in @buf
863 * Output:
864 * On success: status of zero or more protocol versions has
865 * been updated; passed-in buffer filled with
866 * '\n'-terminated C string containing positive
867 * or negative integer values representing the
868 * current status of each protocol version;
869 * return code is the size in bytes of the string
870 * On error: return code is zero or a negative errno value
871 */
581static ssize_t write_versions(struct file *file, char *buf, size_t size) 872static ssize_t write_versions(struct file *file, char *buf, size_t size)
582{ 873{
583 ssize_t rv; 874 ssize_t rv;
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
687 return -EINVAL; 978 return -EINVAL;
688} 979}
689 980
981/**
982 * write_ports - Pass a socket file descriptor or transport name to listen on
983 *
984 * Input:
985 * buf: ignored
986 * size: zero
987 * Output:
988 * On success: passed-in buffer filled with a '\n'-terminated C
989 * string containing a whitespace-separated list of
990 * named NFSD listeners;
991 * return code is the size in bytes of the string
992 * On error: return code is zero or a negative errno value
993 *
994 * OR
995 *
996 * Input:
997 * buf: C string containing an unsigned
998 * integer value representing a bound
999 * but unconnected socket that is to be
1000 * used as an NFSD listener
1001 * size: non-zero length of C string in @buf
1002 * Output:
1003 * On success: NFS service is started;
1004 * passed-in buffer filled with a '\n'-terminated C
1005 * string containing a unique alphanumeric name of
1006 * the listener;
1007 * return code is the size in bytes of the string
1008 * On error: return code is a negative errno value
1009 *
1010 * OR
1011 *
1012 * Input:
1013 * buf: C string containing a "-" followed
1014 * by an integer value representing a
1015 * previously passed in socket file
1016 * descriptor
1017 * size: non-zero length of C string in @buf
1018 * Output:
1019 * On success: NFS service no longer listens on that socket;
1020 * passed-in buffer filled with a '\n'-terminated C
1021 * string containing a unique name of the listener;
1022 * return code is the size in bytes of the string
1023 * On error: return code is a negative errno value
1024 *
1025 * OR
1026 *
1027 * Input:
1028 * buf: C string containing a transport
1029 * name and an unsigned integer value
1030 * representing the port to listen on,
1031 * separated by whitespace
1032 * size: non-zero length of C string in @buf
1033 * Output:
1034 * On success: returns zero; NFS service is started
1035 * On error: return code is a negative errno value
1036 *
1037 * OR
1038 *
1039 * Input:
1040 * buf: C string containing a "-" followed
1041 * by a transport name and an unsigned
1042 * integer value representing the port
1043 * to listen on, separated by whitespace
1044 * size: non-zero length of C string in @buf
1045 * Output:
1046 * On success: returns zero; NFS service no longer listens
1047 * on that transport
1048 * On error: return code is a negative errno value
1049 */
690static ssize_t write_ports(struct file *file, char *buf, size_t size) 1050static ssize_t write_ports(struct file *file, char *buf, size_t size)
691{ 1051{
692 ssize_t rv; 1052 ssize_t rv;
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
700 1060
701int nfsd_max_blksize; 1061int nfsd_max_blksize;
702 1062
1063/**
1064 * write_maxblksize - Set or report the current NFS blksize
1065 *
1066 * Input:
1067 * buf: ignored
1068 * size: zero
1069 *
1070 * OR
1071 *
1072 * Input:
1073 * buf: C string containing an unsigned
1074 * integer value representing the new
1075 * NFS blksize
1076 * size: non-zero length of C string in @buf
1077 * Output:
1078 * On success: passed-in buffer filled with '\n'-terminated C string
1079 * containing numeric value of the current NFS blksize
1080 * setting;
1081 * return code is the size in bytes of the string
1082 * On error: return code is zero or a negative errno value
1083 */
703static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) 1084static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
704{ 1085{
705 char *mesg = buf; 1086 char *mesg = buf;
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
752 return strlen(buf); 1133 return strlen(buf);
753} 1134}
754 1135
1136/**
1137 * write_leasetime - Set or report the current NFSv4 lease time
1138 *
1139 * Input:
1140 * buf: ignored
1141 * size: zero
1142 *
1143 * OR
1144 *
1145 * Input:
1146 * buf: C string containing an unsigned
1147 * integer value representing the new
1148 * NFSv4 lease expiry time
1149 * size: non-zero length of C string in @buf
1150 * Output:
1151 * On success: passed-in buffer filled with '\n'-terminated C
1152 * string containing unsigned integer value of the
1153 * current lease expiry time;
1154 * return code is the size in bytes of the string
1155 * On error: return code is zero or a negative errno value
1156 */
755static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1157static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
756{ 1158{
757 ssize_t rv; 1159 ssize_t rv;
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
788 return strlen(buf); 1190 return strlen(buf);
789} 1191}
790 1192
1193/**
1194 * write_recoverydir - Set or report the pathname of the recovery directory
1195 *
1196 * Input:
1197 * buf: ignored
1198 * size: zero
1199 *
1200 * OR
1201 *
1202 * Input:
1203 * buf: C string containing the pathname
1204 * of the directory on a local file
1205 * system containing permanent NFSv4
1206 * recovery data
1207 * size: non-zero length of C string in @buf
1208 * Output:
1209 * On success: passed-in buffer filled with '\n'-terminated C string
1210 * containing the current recovery pathname setting;
1211 * return code is the size in bytes of the string
1212 * On error: return code is zero or a negative errno value
1213 */
791static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) 1214static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
792{ 1215{
793 ssize_t rv; 1216 ssize_t rv;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a92..9f1ca17293d3 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
258 return error; 258 return error;
259} 259}
260 260
261/* 261/**
262 * Perform sanity checks on the dentry in a client's file handle. 262 * fh_verify - filehandle lookup and access checking
263 * @rqstp: pointer to current rpc request
264 * @fhp: filehandle to be verified
265 * @type: expected type of object pointed to by filehandle
266 * @access: type of access needed to object
267 *
268 * Look up a dentry from the on-the-wire filehandle, check the client's
269 * access to the export, and set the current task's credentials.
270 *
271 * Regardless of success or failure of fh_verify(), fh_put() should be
272 * called on @fhp when the caller is finished with the filehandle.
263 * 273 *
264 * Note that the file handle dentry may need to be freed even after 274 * fh_verify() may be called multiple times on a given filehandle, for
265 * an error return. 275 * example, when processing an NFSv4 compound. The first call will look
276 * up a dentry using the on-the-wire filehandle. Subsequent calls will
277 * skip the lookup and just perform the other checks and possibly change
278 * the current task's credentials.
266 * 279 *
267 * This is only called at the start of an nfsproc call, so fhp points to 280 * @type specifies the type of object expected using one of the S_IF*
268 * a svc_fh which is all 0 except for the over-the-wire file handle. 281 * constants defined in include/linux/stat.h. The caller may use zero
282 * to indicate that it doesn't care, or a negative integer to indicate
283 * that it expects something not of the given type.
284 *
285 * @access is formed from the NFSD_MAY_* constants defined in
286 * include/linux/nfsd/nfsd.h.
269 */ 287 */
270__be32 288__be32
271fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) 289fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
466 goto retry; 484 goto retry;
467 break; 485 break;
468 } 486 }
487 } else if (exp->ex_flags & NFSEXP_FSID) {
488 fsid_type = FSID_NUM;
469 } else if (exp->ex_uuid) { 489 } else if (exp->ex_uuid) {
470 if (fhp->fh_maxsize >= 64) { 490 if (fhp->fh_maxsize >= 64) {
471 if (root_export) 491 if (root_export)
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
478 else 498 else
479 fsid_type = FSID_UUID4_INUM; 499 fsid_type = FSID_UUID4_INUM;
480 } 500 }
481 } else if (exp->ex_flags & NFSEXP_FSID) 501 } else if (!old_valid_dev(ex_dev))
482 fsid_type = FSID_NUM;
483 else if (!old_valid_dev(ex_dev))
484 /* for newer device numbers, we must use a newer fsid format */ 502 /* for newer device numbers, we must use a newer fsid format */
485 fsid_type = FSID_ENCODE_DEV; 503 fsid_type = FSID_ENCODE_DEV;
486 else 504 else
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7acef..6f7f26351227 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
622 { nfserr_badname, -ESRCH }, 622 { nfserr_badname, -ESRCH },
623 { nfserr_io, -ETXTBSY }, 623 { nfserr_io, -ETXTBSY },
624 { nfserr_notsupp, -EOPNOTSUPP }, 624 { nfserr_notsupp, -EOPNOTSUPP },
625 { nfserr_toosmall, -ETOOSMALL },
625 }; 626 };
626 int i; 627 int i;
627 628
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b365..6e50aaa56ca2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
764 764
765 return err; 765 return err;
766} 766}
767
768 767
769static int 768static int
770nfsd_sync(struct file *filp) 769nfsd_sync(struct file *filp)
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1211 dirp = dentry->d_inode; 1210 dirp = dentry->d_inode;
1212 1211
1213 err = nfserr_notdir; 1212 err = nfserr_notdir;
1214 if(!dirp->i_op || !dirp->i_op->lookup) 1213 if (!dirp->i_op->lookup)
1215 goto out; 1214 goto out;
1216 /* 1215 /*
1217 * Check whether the response file handle has been verified yet. 1216 * Check whether the response file handle has been verified yet.
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1347 /* Get all the sanity checks out of the way before 1346 /* Get all the sanity checks out of the way before
1348 * we lock the parent. */ 1347 * we lock the parent. */
1349 err = nfserr_notdir; 1348 err = nfserr_notdir;
1350 if(!dirp->i_op || !dirp->i_op->lookup) 1349 if (!dirp->i_op->lookup)
1351 goto out; 1350 goto out;
1352 fh_lock_nested(fhp, I_MUTEX_PARENT); 1351 fh_lock_nested(fhp, I_MUTEX_PARENT);
1353 1352
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1482 inode = dentry->d_inode; 1481 inode = dentry->d_inode;
1483 1482
1484 err = nfserr_inval; 1483 err = nfserr_inval;
1485 if (!inode->i_op || !inode->i_op->readlink) 1484 if (!inode->i_op->readlink)
1486 goto out; 1485 goto out;
1487 1486
1488 touch_atime(fhp->fh_export->ex_path.mnt, dentry); 1487 touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
2162 size_t size; 2161 size_t size;
2163 int error; 2162 int error;
2164 2163
2165 if (!IS_POSIXACL(inode) || !inode->i_op || 2164 if (!IS_POSIXACL(inode) ||
2166 !inode->i_op->setxattr || !inode->i_op->removexattr) 2165 !inode->i_op->setxattr || !inode->i_op->removexattr)
2167 return -EOPNOTSUPP; 2166 return -EOPNOTSUPP;
2168 switch(type) { 2167 switch(type) {
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 400f8064a548..81b8644b0136 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -704,7 +704,7 @@ fput_and_out:
704 return ret; 704 return ret;
705} 705}
706 706
707asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) 707asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
708{ 708{
709 struct file *filp; 709 struct file *filp;
710 struct inotify_device *dev; 710 struct inotify_device *dev;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e2772..86bef156cf0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
1406 ni->allocated_size = sle64_to_cpu( 1406 ni->allocated_size = sle64_to_cpu(
1407 a->data.non_resident.allocated_size); 1407 a->data.non_resident.allocated_size);
1408 } 1408 }
1409 /* Setup the operations for this attribute inode. */
1410 vi->i_op = NULL;
1411 vi->i_fop = NULL;
1412 if (NInoMstProtected(ni)) 1409 if (NInoMstProtected(ni))
1413 vi->i_mapping->a_ops = &ntfs_mst_aops; 1410 vi->i_mapping->a_ops = &ntfs_mst_aops;
1414 else 1411 else
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
12ocfs2-objs := \ 12ocfs2-objs := \
13 alloc.o \ 13 alloc.o \
14 aops.o \ 14 aops.o \
15 blockcheck.o \
15 buffer_head_io.o \ 16 buffer_head_io.o \
16 dcache.o \ 17 dcache.o \
17 dir.o \ 18 dir.o \
@@ -35,8 +36,14 @@ ocfs2-objs := \
35 sysfile.o \ 36 sysfile.o \
36 uptodate.o \ 37 uptodate.o \
37 ver.o \ 38 ver.o \
39 quota_local.o \
40 quota_global.o \
38 xattr.o 41 xattr.o
39 42
43ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
44ocfs2-objs += acl.o
45endif
46
40ocfs2_stackglue-objs := stackglue.o 47ocfs2_stackglue-objs := stackglue.o
41ocfs2_stack_o2cb-objs := stack_o2cb.o 48ocfs2_stack_o2cb-objs := stack_o2cb.o
42ocfs2_stack_user-objs := stack_user.o 49ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * acl.c
5 *
6 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
7 *
8 * CREDITS:
9 * Lots of code in this file is copy from linux/fs/ext3/acl.c.
10 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public
14 * License version 2 as published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 */
21
22#include <linux/init.h>
23#include <linux/module.h>
24#include <linux/string.h>
25
26#define MLOG_MASK_PREFIX ML_INODE
27#include <cluster/masklog.h>
28
29#include "ocfs2.h"
30#include "alloc.h"
31#include "dlmglue.h"
32#include "file.h"
33#include "ocfs2_fs.h"
34
35#include "xattr.h"
36#include "acl.h"
37
38/*
39 * Convert from xattr value to acl struct.
40 */
41static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
42{
43 int n, count;
44 struct posix_acl *acl;
45
46 if (!value)
47 return NULL;
48 if (size < sizeof(struct posix_acl_entry))
49 return ERR_PTR(-EINVAL);
50
51 count = size / sizeof(struct posix_acl_entry);
52 if (count < 0)
53 return ERR_PTR(-EINVAL);
54 if (count == 0)
55 return NULL;
56
57 acl = posix_acl_alloc(count, GFP_NOFS);
58 if (!acl)
59 return ERR_PTR(-ENOMEM);
60 for (n = 0; n < count; n++) {
61 struct ocfs2_acl_entry *entry =
62 (struct ocfs2_acl_entry *)value;
63
64 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
65 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
66 acl->a_entries[n].e_id = le32_to_cpu(entry->e_id);
67 value += sizeof(struct posix_acl_entry);
68
69 }
70 return acl;
71}
72
73/*
74 * Convert acl struct to xattr value.
75 */
76static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
77{
78 struct ocfs2_acl_entry *entry = NULL;
79 char *ocfs2_acl;
80 size_t n;
81
82 *size = acl->a_count * sizeof(struct posix_acl_entry);
83
84 ocfs2_acl = kmalloc(*size, GFP_NOFS);
85 if (!ocfs2_acl)
86 return ERR_PTR(-ENOMEM);
87
88 entry = (struct ocfs2_acl_entry *)ocfs2_acl;
89 for (n = 0; n < acl->a_count; n++, entry++) {
90 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
91 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
92 entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
93 }
94 return ocfs2_acl;
95}
96
97static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
98 int type,
99 struct buffer_head *di_bh)
100{
101 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
102 int name_index;
103 char *value = NULL;
104 struct posix_acl *acl;
105 int retval;
106
107 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
108 return NULL;
109
110 switch (type) {
111 case ACL_TYPE_ACCESS:
112 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
113 break;
114 case ACL_TYPE_DEFAULT:
115 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
116 break;
117 default:
118 return ERR_PTR(-EINVAL);
119 }
120
121 retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
122 if (retval > 0) {
123 value = kmalloc(retval, GFP_NOFS);
124 if (!value)
125 return ERR_PTR(-ENOMEM);
126 retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
127 "", value, retval);
128 }
129
130 if (retval > 0)
131 acl = ocfs2_acl_from_xattr(value, retval);
132 else if (retval == -ENODATA || retval == 0)
133 acl = NULL;
134 else
135 acl = ERR_PTR(retval);
136
137 kfree(value);
138
139 return acl;
140}
141
142
143/*
144 * Get posix acl.
145 */
146static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
147{
148 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
149 struct buffer_head *di_bh = NULL;
150 struct posix_acl *acl;
151 int ret;
152
153 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
154 return NULL;
155
156 ret = ocfs2_inode_lock(inode, &di_bh, 0);
157 if (ret < 0) {
158 mlog_errno(ret);
159 acl = ERR_PTR(ret);
160 return acl;
161 }
162
163 acl = ocfs2_get_acl_nolock(inode, type, di_bh);
164
165 ocfs2_inode_unlock(inode, 0);
166
167 brelse(di_bh);
168
169 return acl;
170}
171
172/*
173 * Set the access or default ACL of an inode.
174 */
175static int ocfs2_set_acl(handle_t *handle,
176 struct inode *inode,
177 struct buffer_head *di_bh,
178 int type,
179 struct posix_acl *acl,
180 struct ocfs2_alloc_context *meta_ac,
181 struct ocfs2_alloc_context *data_ac)
182{
183 int name_index;
184 void *value = NULL;
185 size_t size = 0;
186 int ret;
187
188 if (S_ISLNK(inode->i_mode))
189 return -EOPNOTSUPP;
190
191 switch (type) {
192 case ACL_TYPE_ACCESS:
193 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
194 if (acl) {
195 mode_t mode = inode->i_mode;
196 ret = posix_acl_equiv_mode(acl, &mode);
197 if (ret < 0)
198 return ret;
199 else {
200 inode->i_mode = mode;
201 if (ret == 0)
202 acl = NULL;
203 }
204 }
205 break;
206 case ACL_TYPE_DEFAULT:
207 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
208 if (!S_ISDIR(inode->i_mode))
209 return acl ? -EACCES : 0;
210 break;
211 default:
212 return -EINVAL;
213 }
214
215 if (acl) {
216 value = ocfs2_acl_to_xattr(acl, &size);
217 if (IS_ERR(value))
218 return (int)PTR_ERR(value);
219 }
220
221 if (handle)
222 ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
223 "", value, size, 0,
224 meta_ac, data_ac);
225 else
226 ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
227
228 kfree(value);
229
230 return ret;
231}
232
233int ocfs2_check_acl(struct inode *inode, int mask)
234{
235 struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
236
237 if (IS_ERR(acl))
238 return PTR_ERR(acl);
239 if (acl) {
240 int ret = posix_acl_permission(inode, acl, mask);
241 posix_acl_release(acl);
242 return ret;
243 }
244
245 return -EAGAIN;
246}
247
248int ocfs2_acl_chmod(struct inode *inode)
249{
250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
251 struct posix_acl *acl, *clone;
252 int ret;
253
254 if (S_ISLNK(inode->i_mode))
255 return -EOPNOTSUPP;
256
257 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
258 return 0;
259
260 acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
261 if (IS_ERR(acl) || !acl)
262 return PTR_ERR(acl);
263 clone = posix_acl_clone(acl, GFP_KERNEL);
264 posix_acl_release(acl);
265 if (!clone)
266 return -ENOMEM;
267 ret = posix_acl_chmod_masq(clone, inode->i_mode);
268 if (!ret)
269 ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
270 clone, NULL, NULL);
271 posix_acl_release(clone);
272 return ret;
273}
274
275/*
276 * Initialize the ACLs of a new inode. If parent directory has default ACL,
277 * then clone to new inode. Called from ocfs2_mknod.
278 */
279int ocfs2_init_acl(handle_t *handle,
280 struct inode *inode,
281 struct inode *dir,
282 struct buffer_head *di_bh,
283 struct buffer_head *dir_bh,
284 struct ocfs2_alloc_context *meta_ac,
285 struct ocfs2_alloc_context *data_ac)
286{
287 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
288 struct posix_acl *acl = NULL;
289 int ret = 0;
290
291 if (!S_ISLNK(inode->i_mode)) {
292 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
293 acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
294 dir_bh);
295 if (IS_ERR(acl))
296 return PTR_ERR(acl);
297 }
298 if (!acl)
299 inode->i_mode &= ~current->fs->umask;
300 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone;
303 mode_t mode;
304
305 if (S_ISDIR(inode->i_mode)) {
306 ret = ocfs2_set_acl(handle, inode, di_bh,
307 ACL_TYPE_DEFAULT, acl,
308 meta_ac, data_ac);
309 if (ret)
310 goto cleanup;
311 }
312 clone = posix_acl_clone(acl, GFP_NOFS);
313 ret = -ENOMEM;
314 if (!clone)
315 goto cleanup;
316
317 mode = inode->i_mode;
318 ret = posix_acl_create_masq(clone, &mode);
319 if (ret >= 0) {
320 inode->i_mode = mode;
321 if (ret > 0) {
322 ret = ocfs2_set_acl(handle, inode,
323 di_bh, ACL_TYPE_ACCESS,
324 clone, meta_ac, data_ac);
325 }
326 }
327 posix_acl_release(clone);
328 }
329cleanup:
330 posix_acl_release(acl);
331 return ret;
332}
333
334static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
335 char *list,
336 size_t list_len,
337 const char *name,
338 size_t name_len)
339{
340 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
341 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
342
343 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
344 return 0;
345
346 if (list && size <= list_len)
347 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
348 return size;
349}
350
351static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
352 char *list,
353 size_t list_len,
354 const char *name,
355 size_t name_len)
356{
357 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
359
360 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
361 return 0;
362
363 if (list && size <= list_len)
364 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
365 return size;
366}
367
368static int ocfs2_xattr_get_acl(struct inode *inode,
369 int type,
370 void *buffer,
371 size_t size)
372{
373 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
374 struct posix_acl *acl;
375 int ret;
376
377 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
378 return -EOPNOTSUPP;
379
380 acl = ocfs2_get_acl(inode, type);
381 if (IS_ERR(acl))
382 return PTR_ERR(acl);
383 if (acl == NULL)
384 return -ENODATA;
385 ret = posix_acl_to_xattr(acl, buffer, size);
386 posix_acl_release(acl);
387
388 return ret;
389}
390
391static int ocfs2_xattr_get_acl_access(struct inode *inode,
392 const char *name,
393 void *buffer,
394 size_t size)
395{
396 if (strcmp(name, "") != 0)
397 return -EINVAL;
398 return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
399}
400
401static int ocfs2_xattr_get_acl_default(struct inode *inode,
402 const char *name,
403 void *buffer,
404 size_t size)
405{
406 if (strcmp(name, "") != 0)
407 return -EINVAL;
408 return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
409}
410
411static int ocfs2_xattr_set_acl(struct inode *inode,
412 int type,
413 const void *value,
414 size_t size)
415{
416 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
417 struct posix_acl *acl;
418 int ret = 0;
419
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
421 return -EOPNOTSUPP;
422
423 if (!is_owner_or_cap(inode))
424 return -EPERM;
425
426 if (value) {
427 acl = posix_acl_from_xattr(value, size);
428 if (IS_ERR(acl))
429 return PTR_ERR(acl);
430 else if (acl) {
431 ret = posix_acl_valid(acl);
432 if (ret)
433 goto cleanup;
434 }
435 } else
436 acl = NULL;
437
438 ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
439
440cleanup:
441 posix_acl_release(acl);
442 return ret;
443}
444
445static int ocfs2_xattr_set_acl_access(struct inode *inode,
446 const char *name,
447 const void *value,
448 size_t size,
449 int flags)
450{
451 if (strcmp(name, "") != 0)
452 return -EINVAL;
453 return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
454}
455
456static int ocfs2_xattr_set_acl_default(struct inode *inode,
457 const char *name,
458 const void *value,
459 size_t size,
460 int flags)
461{
462 if (strcmp(name, "") != 0)
463 return -EINVAL;
464 return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
465}
466
467struct xattr_handler ocfs2_xattr_acl_access_handler = {
468 .prefix = POSIX_ACL_XATTR_ACCESS,
469 .list = ocfs2_xattr_list_acl_access,
470 .get = ocfs2_xattr_get_acl_access,
471 .set = ocfs2_xattr_set_acl_access,
472};
473
474struct xattr_handler ocfs2_xattr_acl_default_handler = {
475 .prefix = POSIX_ACL_XATTR_DEFAULT,
476 .list = ocfs2_xattr_list_acl_default,
477 .get = ocfs2_xattr_get_acl_default,
478 .set = ocfs2_xattr_set_acl_default,
479};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * acl.h
5 *
6 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17
18#ifndef OCFS2_ACL_H
19#define OCFS2_ACL_H
20
21#include <linux/posix_acl_xattr.h>
22
/*
 * On-disk representation of a single POSIX ACL entry in an ocfs2
 * extended attribute.  All fields are stored little-endian.
 */
struct ocfs2_acl_entry {
	__le16 e_tag;	/* entry tag — presumably ACL_USER/ACL_GROUP/etc.; confirm against acl.c */
	__le16 e_perm;	/* permission bits for this entry */
	__le32 e_id;	/* qualifier (uid/gid) — assumed; verify against encode/decode code */
};
28
#ifdef CONFIG_OCFS2_FS_POSIX_ACL

/* Real implementations live in acl.c when ACL support is built in. */
extern int ocfs2_check_acl(struct inode *, int);
extern int ocfs2_acl_chmod(struct inode *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
			  struct buffer_head *, struct buffer_head *,
			  struct ocfs2_alloc_context *,
			  struct ocfs2_alloc_context *);

#else /* CONFIG_OCFS2_FS_POSIX_ACL*/

/* With ACLs compiled out, a NULL check_acl callback means "no ACLs". */
#define ocfs2_check_acl NULL
/* chmod has no ACL to rewrite; succeed as a no-op. */
static inline int ocfs2_acl_chmod(struct inode *inode)
{
	return 0;
}
/* New inodes inherit no ACL; succeed as a no-op. */
static inline int ocfs2_init_acl(handle_t *handle,
				 struct inode *inode,
				 struct inode *dir,
				 struct buffer_head *di_bh,
				 struct buffer_head *dir_bh,
				 struct ocfs2_alloc_context *meta_ac,
				 struct ocfs2_alloc_context *data_ac)
{
	return 0;
}

#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
57
58#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..d861096c9d81 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h>
31 32
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC 33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
@@ -36,6 +37,7 @@
36 37
37#include "alloc.h" 38#include "alloc.h"
38#include "aops.h" 39#include "aops.h"
40#include "blockcheck.h"
39#include "dlmglue.h" 41#include "dlmglue.h"
40#include "extent_map.h" 42#include "extent_map.h"
41#include "inode.h" 43#include "inode.h"
@@ -46,6 +48,7 @@
46#include "file.h" 48#include "file.h"
47#include "super.h" 49#include "super.h"
48#include "uptodate.h" 50#include "uptodate.h"
51#include "xattr.h"
49 52
50#include "buffer_head_io.h" 53#include "buffer_head_io.h"
51 54
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
187static int ocfs2_dinode_sanity_check(struct inode *inode, 190static int ocfs2_dinode_sanity_check(struct inode *inode,
188 struct ocfs2_extent_tree *et) 191 struct ocfs2_extent_tree *et)
189{ 192{
190 int ret = 0; 193 struct ocfs2_dinode *di = et->et_object;
191 struct ocfs2_dinode *di;
192 194
193 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 195 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
196 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
194 197
195 di = et->et_object; 198 return 0;
196 if (!OCFS2_IS_VALID_DINODE(di)) {
197 ret = -EIO;
198 ocfs2_error(inode->i_sb,
199 "Inode %llu has invalid path root",
200 (unsigned long long)OCFS2_I(inode)->ip_blkno);
201 }
202
203 return ret;
204} 199}
205 200
206static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) 201static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
213 208
214static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) 209static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
215{ 210{
216 struct ocfs2_xattr_value_root *xv = et->et_object; 211 struct ocfs2_xattr_value_buf *vb = et->et_object;
217 212
218 et->et_root_el = &xv->xr_list; 213 et->et_root_el = &vb->vb_xv->xr_list;
219} 214}
220 215
221static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, 216static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
222 u64 blkno) 217 u64 blkno)
223{ 218{
224 struct ocfs2_xattr_value_root *xv = 219 struct ocfs2_xattr_value_buf *vb = et->et_object;
225 (struct ocfs2_xattr_value_root *)et->et_object;
226 220
227 xv->xr_last_eb_blk = cpu_to_le64(blkno); 221 vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
228} 222}
229 223
230static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) 224static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
231{ 225{
232 struct ocfs2_xattr_value_root *xv = 226 struct ocfs2_xattr_value_buf *vb = et->et_object;
233 (struct ocfs2_xattr_value_root *) et->et_object;
234 227
235 return le64_to_cpu(xv->xr_last_eb_blk); 228 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
236} 229}
237 230
238static void ocfs2_xattr_value_update_clusters(struct inode *inode, 231static void ocfs2_xattr_value_update_clusters(struct inode *inode,
239 struct ocfs2_extent_tree *et, 232 struct ocfs2_extent_tree *et,
240 u32 clusters) 233 u32 clusters)
241{ 234{
242 struct ocfs2_xattr_value_root *xv = 235 struct ocfs2_xattr_value_buf *vb = et->et_object;
243 (struct ocfs2_xattr_value_root *)et->et_object;
244 236
245 le32_add_cpu(&xv->xr_clusters, clusters); 237 le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
246} 238}
247 239
248static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { 240static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
304static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 296static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
305 struct inode *inode, 297 struct inode *inode,
306 struct buffer_head *bh, 298 struct buffer_head *bh,
299 ocfs2_journal_access_func access,
307 void *obj, 300 void *obj,
308 struct ocfs2_extent_tree_operations *ops) 301 struct ocfs2_extent_tree_operations *ops)
309{ 302{
310 et->et_ops = ops; 303 et->et_ops = ops;
311 et->et_root_bh = bh; 304 et->et_root_bh = bh;
305 et->et_root_journal_access = access;
312 if (!obj) 306 if (!obj)
313 obj = (void *)bh->b_data; 307 obj = (void *)bh->b_data;
314 et->et_object = obj; 308 et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
324 struct inode *inode, 318 struct inode *inode,
325 struct buffer_head *bh) 319 struct buffer_head *bh)
326{ 320{
327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); 321 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
322 NULL, &ocfs2_dinode_et_ops);
328} 323}
329 324
330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 325void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
331 struct inode *inode, 326 struct inode *inode,
332 struct buffer_head *bh) 327 struct buffer_head *bh)
333{ 328{
334 __ocfs2_init_extent_tree(et, inode, bh, NULL, 329 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
335 &ocfs2_xattr_tree_et_ops); 330 NULL, &ocfs2_xattr_tree_et_ops);
336} 331}
337 332
338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 333void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 struct inode *inode, 334 struct inode *inode,
340 struct buffer_head *bh, 335 struct ocfs2_xattr_value_buf *vb)
341 struct ocfs2_xattr_value_root *xv)
342{ 336{
343 __ocfs2_init_extent_tree(et, inode, bh, xv, 337 __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
344 &ocfs2_xattr_value_et_ops); 338 &ocfs2_xattr_value_et_ops);
345} 339}
346 340
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
362 et->et_ops->eo_update_clusters(inode, et, clusters); 356 et->et_ops->eo_update_clusters(inode, et, clusters);
363} 357}
364 358
/*
 * Get journal access (of the given @type) to the extent tree's root
 * buffer via the journal_access function registered when the tree
 * was initialized.
 */
static inline int ocfs2_et_root_journal_access(handle_t *handle,
					       struct inode *inode,
					       struct ocfs2_extent_tree *et,
					       int type)
{
	return et->et_root_journal_access(handle, inode, et->et_root_bh,
					  type);
}
367
365static inline int ocfs2_et_insert_check(struct inode *inode, 368static inline int ocfs2_et_insert_check(struct inode *inode,
366 struct ocfs2_extent_tree *et, 369 struct ocfs2_extent_tree *et,
367 struct ocfs2_extent_rec *rec) 370 struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
402#define OCFS2_MAX_PATH_DEPTH 5 405#define OCFS2_MAX_PATH_DEPTH 5
403 406
404struct ocfs2_path { 407struct ocfs2_path {
405 int p_tree_depth; 408 int p_tree_depth;
406 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 409 ocfs2_journal_access_func p_root_access;
410 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
407}; 411};
408 412
409#define path_root_bh(_path) ((_path)->p_node[0].bh) 413#define path_root_bh(_path) ((_path)->p_node[0].bh)
410#define path_root_el(_path) ((_path)->p_node[0].el) 414#define path_root_el(_path) ((_path)->p_node[0].el)
415#define path_root_access(_path)((_path)->p_root_access)
411#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 416#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
412#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 417#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
413#define path_num_items(_path) ((_path)->p_tree_depth + 1) 418#define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
440 */ 445 */
441 if (keep_root) 446 if (keep_root)
442 depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 447 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
448 else
449 path_root_access(path) = NULL;
443 450
444 path->p_tree_depth = depth; 451 path->p_tree_depth = depth;
445} 452}
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
465 472
466 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 473 BUG_ON(path_root_bh(dest) != path_root_bh(src));
467 BUG_ON(path_root_el(dest) != path_root_el(src)); 474 BUG_ON(path_root_el(dest) != path_root_el(src));
475 BUG_ON(path_root_access(dest) != path_root_access(src));
468 476
469 ocfs2_reinit_path(dest, 1); 477 ocfs2_reinit_path(dest, 1);
470 478
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
486 int i; 494 int i;
487 495
488 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 496 BUG_ON(path_root_bh(dest) != path_root_bh(src));
497 BUG_ON(path_root_access(dest) != path_root_access(src));
489 498
490 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 499 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
491 brelse(dest->p_node[i].bh); 500 brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
521} 530}
522 531
523static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 532static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
524 struct ocfs2_extent_list *root_el) 533 struct ocfs2_extent_list *root_el,
534 ocfs2_journal_access_func access)
525{ 535{
526 struct ocfs2_path *path; 536 struct ocfs2_path *path;
527 537
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
533 get_bh(root_bh); 543 get_bh(root_bh);
534 path_root_bh(path) = root_bh; 544 path_root_bh(path) = root_bh;
535 path_root_el(path) = root_el; 545 path_root_el(path) = root_el;
546 path_root_access(path) = access;
536 } 547 }
537 548
538 return path; 549 return path;
539} 550}
540 551
/*
 * Allocate a new path sharing @path's root buffer, root extent list,
 * and root journal-access function.
 */
static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
			      path_root_access(path));
}
557
/*
 * Allocate a new path rooted at the extent tree's root buffer, root
 * extent list, and root journal-access function.
 */
static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
{
	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
			      et->et_root_journal_access);
}
563
564/*
565 * Journal the buffer at depth idx. All idx>0 are extent_blocks,
566 * otherwise it's the root_access function.
567 *
568 * I don't like the way this function's name looks next to
569 * ocfs2_journal_access_path(), but I don't have a better one.
570 */
571static int ocfs2_path_bh_journal_access(handle_t *handle,
572 struct inode *inode,
573 struct ocfs2_path *path,
574 int idx)
575{
576 ocfs2_journal_access_func access = path_root_access(path);
577
578 if (!access)
579 access = ocfs2_journal_access;
580
581 if (idx)
582 access = ocfs2_journal_access_eb;
583
584 return access(handle, inode, path->p_node[idx].bh,
585 OCFS2_JOURNAL_ACCESS_WRITE);
586}
587
541/* 588/*
542 * Convenience function to journal all components in a path. 589 * Convenience function to journal all components in a path.
543 */ 590 */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
550 goto out; 597 goto out;
551 598
552 for(i = 0; i < path_num_items(path); i++) { 599 for(i = 0; i < path_num_items(path); i++) {
553 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 600 ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
554 OCFS2_JOURNAL_ACCESS_WRITE);
555 if (ret < 0) { 601 if (ret < 0) {
556 mlog_errno(ret); 602 mlog_errno(ret);
557 goto out; 603 goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
686 int c_split_covers_rec; 732 int c_split_covers_rec;
687}; 733};
688 734
/*
 * Validate a freshly-read extent block buffer; intended as the
 * validation callback for ocfs2_read_block()/ocfs2_read_extent_block().
 *
 * Returns 0 when the block looks sane.  An ecc failure returns its
 * error without marking the filesystem; signature, h_blkno, or
 * fs_generation mismatches call ocfs2_error() and return -EINVAL.
 */
static int ocfs2_validate_extent_block(struct super_block *sb,
				       struct buffer_head *bh)
{
	int rc;
	struct ocfs2_extent_block *eb =
		(struct ocfs2_extent_block *)bh->b_data;

	mlog(0, "Validating extent block %llu\n",
	     (unsigned long long)bh->b_blocknr);

	/* Callers only hand us buffers that completed their read. */
	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
	if (rc) {
		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
		     (unsigned long long)bh->b_blocknr);
		return rc;
	}

	/*
	 * Errors after here are fatal.
	 */

	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		ocfs2_error(sb,
			    "Extent block #%llu has bad signature %.*s",
			    (unsigned long long)bh->b_blocknr, 7,
			    eb->h_signature);
		return -EINVAL;
	}

	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
		ocfs2_error(sb,
			    "Extent block #%llu has an invalid h_blkno "
			    "of %llu",
			    (unsigned long long)bh->b_blocknr,
			    (unsigned long long)le64_to_cpu(eb->h_blkno));
		return -EINVAL;
	}

	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
		ocfs2_error(sb,
			    "Extent block #%llu has an invalid "
			    "h_fs_generation of #%u",
			    (unsigned long long)bh->b_blocknr,
			    le32_to_cpu(eb->h_fs_generation));
		return -EINVAL;
	}

	return 0;
}
791
792int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
793 struct buffer_head **bh)
794{
795 int rc;
796 struct buffer_head *tmp = *bh;
797
798 rc = ocfs2_read_block(inode, eb_blkno, &tmp,
799 ocfs2_validate_extent_block);
800
801 /* If ocfs2_read_block() got us a new bh, pass it up. */
802 if (!rc && !*bh)
803 *bh = tmp;
804
805 return rc;
806}
807
808
689/* 809/*
690 * How many free extents have we got before we need more meta data? 810 * How many free extents have we got before we need more meta data?
691 */ 811 */
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
705 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 825 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
706 826
707 if (last_eb_blk) { 827 if (last_eb_blk) {
708 retval = ocfs2_read_block(inode, last_eb_blk, 828 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
709 &eb_bh);
710 if (retval < 0) { 829 if (retval < 0) {
711 mlog_errno(retval); 830 mlog_errno(retval);
712 goto bail; 831 goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
768 } 887 }
769 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 888 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
770 889
771 status = ocfs2_journal_access(handle, inode, bhs[i], 890 status = ocfs2_journal_access_eb(handle, inode, bhs[i],
772 OCFS2_JOURNAL_ACCESS_CREATE); 891 OCFS2_JOURNAL_ACCESS_CREATE);
773 if (status < 0) { 892 if (status < 0) {
774 mlog_errno(status); 893 mlog_errno(status);
775 goto bail; 894 goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
908 for(i = 0; i < new_blocks; i++) { 1027 for(i = 0; i < new_blocks; i++) {
909 bh = new_eb_bhs[i]; 1028 bh = new_eb_bhs[i];
910 eb = (struct ocfs2_extent_block *) bh->b_data; 1029 eb = (struct ocfs2_extent_block *) bh->b_data;
911 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1030 /* ocfs2_create_new_meta_bhs() should create it right! */
912 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1031 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
913 status = -EIO;
914 goto bail;
915 }
916 eb_el = &eb->h_list; 1032 eb_el = &eb->h_list;
917 1033
918 status = ocfs2_journal_access(handle, inode, bh, 1034 status = ocfs2_journal_access_eb(handle, inode, bh,
919 OCFS2_JOURNAL_ACCESS_CREATE); 1035 OCFS2_JOURNAL_ACCESS_CREATE);
920 if (status < 0) { 1036 if (status < 0) {
921 mlog_errno(status); 1037 mlog_errno(status);
922 goto bail; 1038 goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
955 * journal_dirty erroring as it won't unless we've aborted the 1071 * journal_dirty erroring as it won't unless we've aborted the
956 * handle (in which case we would never be here) so reserving 1072 * handle (in which case we would never be here) so reserving
957 * the write with journal_access is all we need to do. */ 1073 * the write with journal_access is all we need to do. */
958 status = ocfs2_journal_access(handle, inode, *last_eb_bh, 1074 status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
959 OCFS2_JOURNAL_ACCESS_WRITE); 1075 OCFS2_JOURNAL_ACCESS_WRITE);
960 if (status < 0) { 1076 if (status < 0) {
961 mlog_errno(status); 1077 mlog_errno(status);
962 goto bail; 1078 goto bail;
963 } 1079 }
964 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1080 status = ocfs2_et_root_journal_access(handle, inode, et,
965 OCFS2_JOURNAL_ACCESS_WRITE); 1081 OCFS2_JOURNAL_ACCESS_WRITE);
966 if (status < 0) { 1082 if (status < 0) {
967 mlog_errno(status); 1083 mlog_errno(status);
968 goto bail; 1084 goto bail;
969 } 1085 }
970 if (eb_bh) { 1086 if (eb_bh) {
971 status = ocfs2_journal_access(handle, inode, eb_bh, 1087 status = ocfs2_journal_access_eb(handle, inode, eb_bh,
972 OCFS2_JOURNAL_ACCESS_WRITE); 1088 OCFS2_JOURNAL_ACCESS_WRITE);
973 if (status < 0) { 1089 if (status < 0) {
974 mlog_errno(status); 1090 mlog_errno(status);
975 goto bail; 1091 goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1052 } 1168 }
1053 1169
1054 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 1170 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1055 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1171 /* ocfs2_create_new_meta_bhs() should create it right! */
1056 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1172 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1057 status = -EIO;
1058 goto bail;
1059 }
1060 1173
1061 eb_el = &eb->h_list; 1174 eb_el = &eb->h_list;
1062 root_el = et->et_root_el; 1175 root_el = et->et_root_el;
1063 1176
1064 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1177 status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
1065 OCFS2_JOURNAL_ACCESS_CREATE); 1178 OCFS2_JOURNAL_ACCESS_CREATE);
1066 if (status < 0) { 1179 if (status < 0) {
1067 mlog_errno(status); 1180 mlog_errno(status);
1068 goto bail; 1181 goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1080 goto bail; 1193 goto bail;
1081 } 1194 }
1082 1195
1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1196 status = ocfs2_et_root_journal_access(handle, inode, et,
1084 OCFS2_JOURNAL_ACCESS_WRITE); 1197 OCFS2_JOURNAL_ACCESS_WRITE);
1085 if (status < 0) { 1198 if (status < 0) {
1086 mlog_errno(status); 1199 mlog_errno(status);
1087 goto bail; 1200 goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1176 brelse(bh); 1289 brelse(bh);
1177 bh = NULL; 1290 bh = NULL;
1178 1291
1179 status = ocfs2_read_block(inode, blkno, &bh); 1292 status = ocfs2_read_extent_block(inode, blkno, &bh);
1180 if (status < 0) { 1293 if (status < 0) {
1181 mlog_errno(status); 1294 mlog_errno(status);
1182 goto bail; 1295 goto bail;
1183 } 1296 }
1184 1297
1185 eb = (struct ocfs2_extent_block *) bh->b_data; 1298 eb = (struct ocfs2_extent_block *) bh->b_data;
1186 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1187 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1188 status = -EIO;
1189 goto bail;
1190 }
1191 el = &eb->h_list; 1299 el = &eb->h_list;
1192 1300
1193 if (le16_to_cpu(el->l_next_free_rec) < 1301 if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
1540 1648
1541 brelse(bh); 1649 brelse(bh);
1542 bh = NULL; 1650 bh = NULL;
1543 ret = ocfs2_read_block(inode, blkno, &bh); 1651 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1544 if (ret) { 1652 if (ret) {
1545 mlog_errno(ret); 1653 mlog_errno(ret);
1546 goto out; 1654 goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
1548 1656
1549 eb = (struct ocfs2_extent_block *) bh->b_data; 1657 eb = (struct ocfs2_extent_block *) bh->b_data;
1550 el = &eb->h_list; 1658 el = &eb->h_list;
1551 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1552 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1553 ret = -EIO;
1554 goto out;
1555 }
1556 1659
1557 if (le16_to_cpu(el->l_next_free_rec) > 1660 if (le16_to_cpu(el->l_next_free_rec) >
1558 le16_to_cpu(el->l_count)) { 1661 le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
1860 root_bh = left_path->p_node[subtree_index].bh; 1963 root_bh = left_path->p_node[subtree_index].bh;
1861 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1964 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1862 1965
1863 ret = ocfs2_journal_access(handle, inode, root_bh, 1966 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
1864 OCFS2_JOURNAL_ACCESS_WRITE); 1967 subtree_index);
1865 if (ret) { 1968 if (ret) {
1866 mlog_errno(ret); 1969 mlog_errno(ret);
1867 goto out; 1970 goto out;
1868 } 1971 }
1869 1972
1870 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1973 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1871 ret = ocfs2_journal_access(handle, inode, 1974 ret = ocfs2_path_bh_journal_access(handle, inode,
1872 right_path->p_node[i].bh, 1975 right_path, i);
1873 OCFS2_JOURNAL_ACCESS_WRITE);
1874 if (ret) { 1976 if (ret) {
1875 mlog_errno(ret); 1977 mlog_errno(ret);
1876 goto out; 1978 goto out;
1877 } 1979 }
1878 1980
1879 ret = ocfs2_journal_access(handle, inode, 1981 ret = ocfs2_path_bh_journal_access(handle, inode,
1880 left_path->p_node[i].bh, 1982 left_path, i);
1881 OCFS2_JOURNAL_ACCESS_WRITE);
1882 if (ret) { 1983 if (ret) {
1883 mlog_errno(ret); 1984 mlog_errno(ret);
1884 goto out; 1985 goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2102 2203
2103 *ret_left_path = NULL; 2204 *ret_left_path = NULL;
2104 2205
2105 left_path = ocfs2_new_path(path_root_bh(right_path), 2206 left_path = ocfs2_new_path_from_path(right_path);
2106 path_root_el(right_path));
2107 if (!left_path) { 2207 if (!left_path) {
2108 ret = -ENOMEM; 2208 ret = -ENOMEM;
2109 mlog_errno(ret); 2209 mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2398 return -EAGAIN; 2498 return -EAGAIN;
2399 2499
2400 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2500 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2401 ret = ocfs2_journal_access(handle, inode, 2501 ret = ocfs2_journal_access_eb(handle, inode,
2402 path_leaf_bh(right_path), 2502 path_leaf_bh(right_path),
2403 OCFS2_JOURNAL_ACCESS_WRITE); 2503 OCFS2_JOURNAL_ACCESS_WRITE);
2404 if (ret) { 2504 if (ret) {
2405 mlog_errno(ret); 2505 mlog_errno(ret);
2406 goto out; 2506 goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2417 * We have to update i_last_eb_blk during the meta 2517 * We have to update i_last_eb_blk during the meta
2418 * data delete. 2518 * data delete.
2419 */ 2519 */
2420 ret = ocfs2_journal_access(handle, inode, et_root_bh, 2520 ret = ocfs2_et_root_journal_access(handle, inode, et,
2421 OCFS2_JOURNAL_ACCESS_WRITE); 2521 OCFS2_JOURNAL_ACCESS_WRITE);
2422 if (ret) { 2522 if (ret) {
2423 mlog_errno(ret); 2523 mlog_errno(ret);
2424 goto out; 2524 goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2433 */ 2533 */
2434 BUG_ON(right_has_empty && !del_right_subtree); 2534 BUG_ON(right_has_empty && !del_right_subtree);
2435 2535
2436 ret = ocfs2_journal_access(handle, inode, root_bh, 2536 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2437 OCFS2_JOURNAL_ACCESS_WRITE); 2537 subtree_index);
2438 if (ret) { 2538 if (ret) {
2439 mlog_errno(ret); 2539 mlog_errno(ret);
2440 goto out; 2540 goto out;
2441 } 2541 }
2442 2542
2443 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2543 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2444 ret = ocfs2_journal_access(handle, inode, 2544 ret = ocfs2_path_bh_journal_access(handle, inode,
2445 right_path->p_node[i].bh, 2545 right_path, i);
2446 OCFS2_JOURNAL_ACCESS_WRITE);
2447 if (ret) { 2546 if (ret) {
2448 mlog_errno(ret); 2547 mlog_errno(ret);
2449 goto out; 2548 goto out;
2450 } 2549 }
2451 2550
2452 ret = ocfs2_journal_access(handle, inode, 2551 ret = ocfs2_path_bh_journal_access(handle, inode,
2453 left_path->p_node[i].bh, 2552 left_path, i);
2454 OCFS2_JOURNAL_ACCESS_WRITE);
2455 if (ret) { 2553 if (ret) {
2456 mlog_errno(ret); 2554 mlog_errno(ret);
2457 goto out; 2555 goto out;
@@ -2596,16 +2694,17 @@ out:
2596 2694
2597static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2695static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2598 handle_t *handle, 2696 handle_t *handle,
2599 struct buffer_head *bh, 2697 struct ocfs2_path *path)
2600 struct ocfs2_extent_list *el)
2601{ 2698{
2602 int ret; 2699 int ret;
2700 struct buffer_head *bh = path_leaf_bh(path);
2701 struct ocfs2_extent_list *el = path_leaf_el(path);
2603 2702
2604 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2703 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2605 return 0; 2704 return 0;
2606 2705
2607 ret = ocfs2_journal_access(handle, inode, bh, 2706 ret = ocfs2_path_bh_journal_access(handle, inode, path,
2608 OCFS2_JOURNAL_ACCESS_WRITE); 2707 path_num_items(path) - 1);
2609 if (ret) { 2708 if (ret) {
2610 mlog_errno(ret); 2709 mlog_errno(ret);
2611 goto out; 2710 goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2644 goto out; 2743 goto out;
2645 } 2744 }
2646 2745
2647 left_path = ocfs2_new_path(path_root_bh(path), 2746 left_path = ocfs2_new_path_from_path(path);
2648 path_root_el(path));
2649 if (!left_path) { 2747 if (!left_path) {
2650 ret = -ENOMEM; 2748 ret = -ENOMEM;
2651 mlog_errno(ret); 2749 mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2654 2752
2655 ocfs2_cp_path(left_path, path); 2753 ocfs2_cp_path(left_path, path);
2656 2754
2657 right_path = ocfs2_new_path(path_root_bh(path), 2755 right_path = ocfs2_new_path_from_path(path);
2658 path_root_el(path));
2659 if (!right_path) { 2756 if (!right_path) {
2660 ret = -ENOMEM; 2757 ret = -ENOMEM;
2661 mlog_errno(ret); 2758 mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2689 * Caller might still want to make changes to the 2786 * Caller might still want to make changes to the
2690 * tree root, so re-add it to the journal here. 2787 * tree root, so re-add it to the journal here.
2691 */ 2788 */
2692 ret = ocfs2_journal_access(handle, inode, 2789 ret = ocfs2_path_bh_journal_access(handle, inode,
2693 path_root_bh(left_path), 2790 left_path, 0);
2694 OCFS2_JOURNAL_ACCESS_WRITE);
2695 if (ret) { 2791 if (ret) {
2696 mlog_errno(ret); 2792 mlog_errno(ret);
2697 goto out; 2793 goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2785 * We have a path to the left of this one - it needs 2881 * We have a path to the left of this one - it needs
2786 * an update too. 2882 * an update too.
2787 */ 2883 */
2788 left_path = ocfs2_new_path(path_root_bh(path), 2884 left_path = ocfs2_new_path_from_path(path);
2789 path_root_el(path));
2790 if (!left_path) { 2885 if (!left_path) {
2791 ret = -ENOMEM; 2886 ret = -ENOMEM;
2792 mlog_errno(ret); 2887 mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
2875 * it up front. 2970 * it up front.
2876 */ 2971 */
2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2972 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2878 path_leaf_bh(path), 2973 path);
2879 path_leaf_el(path));
2880 if (ret) 2974 if (ret)
2881 mlog_errno(ret); 2975 mlog_errno(ret);
2882 goto out; 2976 goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
3027 /* This function shouldn't be called for the rightmost leaf. */ 3121 /* This function shouldn't be called for the rightmost leaf. */
3028 BUG_ON(right_cpos == 0); 3122 BUG_ON(right_cpos == 0);
3029 3123
3030 right_path = ocfs2_new_path(path_root_bh(left_path), 3124 right_path = ocfs2_new_path_from_path(left_path);
3031 path_root_el(left_path));
3032 if (!right_path) { 3125 if (!right_path) {
3033 ret = -ENOMEM; 3126 ret = -ENOMEM;
3034 mlog_errno(ret); 3127 mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3111 root_bh = left_path->p_node[subtree_index].bh; 3204 root_bh = left_path->p_node[subtree_index].bh;
3112 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3205 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3113 3206
3114 ret = ocfs2_journal_access(handle, inode, root_bh, 3207 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3115 OCFS2_JOURNAL_ACCESS_WRITE); 3208 subtree_index);
3116 if (ret) { 3209 if (ret) {
3117 mlog_errno(ret); 3210 mlog_errno(ret);
3118 goto out; 3211 goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3120 3213
3121 for (i = subtree_index + 1; 3214 for (i = subtree_index + 1;
3122 i < path_num_items(right_path); i++) { 3215 i < path_num_items(right_path); i++) {
3123 ret = ocfs2_journal_access(handle, inode, 3216 ret = ocfs2_path_bh_journal_access(handle, inode,
3124 right_path->p_node[i].bh, 3217 right_path, i);
3125 OCFS2_JOURNAL_ACCESS_WRITE);
3126 if (ret) { 3218 if (ret) {
3127 mlog_errno(ret); 3219 mlog_errno(ret);
3128 goto out; 3220 goto out;
3129 } 3221 }
3130 3222
3131 ret = ocfs2_journal_access(handle, inode, 3223 ret = ocfs2_path_bh_journal_access(handle, inode,
3132 left_path->p_node[i].bh, 3224 left_path, i);
3133 OCFS2_JOURNAL_ACCESS_WRITE);
3134 if (ret) { 3225 if (ret) {
3135 mlog_errno(ret); 3226 mlog_errno(ret);
3136 goto out; 3227 goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3142 right_rec = &el->l_recs[index + 1]; 3233 right_rec = &el->l_recs[index + 1];
3143 } 3234 }
3144 3235
3145 ret = ocfs2_journal_access(handle, inode, bh, 3236 ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
3146 OCFS2_JOURNAL_ACCESS_WRITE); 3237 path_num_items(left_path) - 1);
3147 if (ret) { 3238 if (ret) {
3148 mlog_errno(ret); 3239 mlog_errno(ret);
3149 goto out; 3240 goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3199 /* This function shouldn't be called for the leftmost leaf. */ 3290 /* This function shouldn't be called for the leftmost leaf. */
3200 BUG_ON(left_cpos == 0); 3291 BUG_ON(left_cpos == 0);
3201 3292
3202 left_path = ocfs2_new_path(path_root_bh(right_path), 3293 left_path = ocfs2_new_path_from_path(right_path);
3203 path_root_el(right_path));
3204 if (!left_path) { 3294 if (!left_path) {
3205 ret = -ENOMEM; 3295 ret = -ENOMEM;
3206 mlog_errno(ret); 3296 mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3283 root_bh = left_path->p_node[subtree_index].bh; 3373 root_bh = left_path->p_node[subtree_index].bh;
3284 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3374 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3285 3375
3286 ret = ocfs2_journal_access(handle, inode, root_bh, 3376 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3287 OCFS2_JOURNAL_ACCESS_WRITE); 3377 subtree_index);
3288 if (ret) { 3378 if (ret) {
3289 mlog_errno(ret); 3379 mlog_errno(ret);
3290 goto out; 3380 goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3292 3382
3293 for (i = subtree_index + 1; 3383 for (i = subtree_index + 1;
3294 i < path_num_items(right_path); i++) { 3384 i < path_num_items(right_path); i++) {
3295 ret = ocfs2_journal_access(handle, inode, 3385 ret = ocfs2_path_bh_journal_access(handle, inode,
3296 right_path->p_node[i].bh, 3386 right_path, i);
3297 OCFS2_JOURNAL_ACCESS_WRITE);
3298 if (ret) { 3387 if (ret) {
3299 mlog_errno(ret); 3388 mlog_errno(ret);
3300 goto out; 3389 goto out;
3301 } 3390 }
3302 3391
3303 ret = ocfs2_journal_access(handle, inode, 3392 ret = ocfs2_path_bh_journal_access(handle, inode,
3304 left_path->p_node[i].bh, 3393 left_path, i);
3305 OCFS2_JOURNAL_ACCESS_WRITE);
3306 if (ret) { 3394 if (ret) {
3307 mlog_errno(ret); 3395 mlog_errno(ret);
3308 goto out; 3396 goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3314 has_empty_extent = 1; 3402 has_empty_extent = 1;
3315 } 3403 }
3316 3404
3317 ret = ocfs2_journal_access(handle, inode, bh, 3405 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3318 OCFS2_JOURNAL_ACCESS_WRITE); 3406 path_num_items(right_path) - 1);
3319 if (ret) { 3407 if (ret) {
3320 mlog_errno(ret); 3408 mlog_errno(ret);
3321 goto out; 3409 goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3732 * leftmost leaf. 3820 * leftmost leaf.
3733 */ 3821 */
3734 if (left_cpos) { 3822 if (left_cpos) {
3735 left_path = ocfs2_new_path(path_root_bh(right_path), 3823 left_path = ocfs2_new_path_from_path(right_path);
3736 path_root_el(right_path));
3737 if (!left_path) { 3824 if (!left_path) {
3738 ret = -ENOMEM; 3825 ret = -ENOMEM;
3739 mlog_errno(ret); 3826 mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
3781 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3868 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3782 struct ocfs2_extent_rec *rec, *tmprec; 3869 struct ocfs2_extent_rec *rec, *tmprec;
3783 3870
3784 right_el = path_leaf_el(right_path);; 3871 right_el = path_leaf_el(right_path);
3785 if (left_path) 3872 if (left_path)
3786 left_el = path_leaf_el(left_path); 3873 left_el = path_leaf_el(left_path);
3787 3874
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3958 4045
3959 el = et->et_root_el; 4046 el = et->et_root_el;
3960 4047
3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4048 ret = ocfs2_et_root_journal_access(handle, inode, et,
3962 OCFS2_JOURNAL_ACCESS_WRITE); 4049 OCFS2_JOURNAL_ACCESS_WRITE);
3963 if (ret) { 4050 if (ret) {
3964 mlog_errno(ret); 4051 mlog_errno(ret);
3965 goto out; 4052 goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3970 goto out_update_clusters; 4057 goto out_update_clusters;
3971 } 4058 }
3972 4059
3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4060 right_path = ocfs2_new_path_from_et(et);
3974 if (!right_path) { 4061 if (!right_path) {
3975 ret = -ENOMEM; 4062 ret = -ENOMEM;
3976 mlog_errno(ret); 4063 mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4020 * ocfs2_rotate_tree_right() might have extended the 4107 * ocfs2_rotate_tree_right() might have extended the
4021 * transaction without re-journaling our tree root. 4108 * transaction without re-journaling our tree root.
4022 */ 4109 */
4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4110 ret = ocfs2_et_root_journal_access(handle, inode, et,
4024 OCFS2_JOURNAL_ACCESS_WRITE); 4111 OCFS2_JOURNAL_ACCESS_WRITE);
4025 if (ret) { 4112 if (ret) {
4026 mlog_errno(ret); 4113 mlog_errno(ret);
4027 goto out; 4114 goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4082 goto out; 4169 goto out;
4083 4170
4084 if (left_cpos != 0) { 4171 if (left_cpos != 0) {
4085 left_path = ocfs2_new_path(path_root_bh(path), 4172 left_path = ocfs2_new_path_from_path(path);
4086 path_root_el(path));
4087 if (!left_path) 4173 if (!left_path)
4088 goto out; 4174 goto out;
4089 4175
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4097 le16_to_cpu(new_el->l_count)) { 4183 le16_to_cpu(new_el->l_count)) {
4098 bh = path_leaf_bh(left_path); 4184 bh = path_leaf_bh(left_path);
4099 eb = (struct ocfs2_extent_block *)bh->b_data; 4185 eb = (struct ocfs2_extent_block *)bh->b_data;
4100 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4186 ocfs2_error(inode->i_sb,
4101 eb); 4187 "Extent block #%llu has an "
4188 "invalid l_next_free_rec of "
4189 "%d. It should have "
4190 "matched the l_count of %d",
4191 (unsigned long long)le64_to_cpu(eb->h_blkno),
4192 le16_to_cpu(new_el->l_next_free_rec),
4193 le16_to_cpu(new_el->l_count));
4194 status = -EINVAL;
4102 goto out; 4195 goto out;
4103 } 4196 }
4104 rec = &new_el->l_recs[ 4197 rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4132 if (right_cpos == 0) 4225 if (right_cpos == 0)
4133 goto out; 4226 goto out;
4134 4227
4135 right_path = ocfs2_new_path(path_root_bh(path), 4228 right_path = ocfs2_new_path_from_path(path);
4136 path_root_el(path));
4137 if (!right_path) 4229 if (!right_path)
4138 goto out; 4230 goto out;
4139 4231
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4147 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4239 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4148 bh = path_leaf_bh(right_path); 4240 bh = path_leaf_bh(right_path);
4149 eb = (struct ocfs2_extent_block *)bh->b_data; 4241 eb = (struct ocfs2_extent_block *)bh->b_data;
4150 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4242 ocfs2_error(inode->i_sb,
4151 eb); 4243 "Extent block #%llu has an "
4244 "invalid l_next_free_rec of %d",
4245 (unsigned long long)le64_to_cpu(eb->h_blkno),
4246 le16_to_cpu(new_el->l_next_free_rec));
4247 status = -EINVAL;
4152 goto out; 4248 goto out;
4153 } 4249 }
4154 rec = &new_el->l_recs[1]; 4250 rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4294 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4390 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4295 * may want it later. 4391 * may want it later.
4296 */ 4392 */
4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); 4393 ret = ocfs2_read_extent_block(inode,
4394 ocfs2_et_get_last_eb_blk(et),
4395 &bh);
4298 if (ret) { 4396 if (ret) {
4299 mlog_exit(ret); 4397 mlog_exit(ret);
4300 goto out; 4398 goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4320 return 0; 4418 return 0;
4321 } 4419 }
4322 4420
4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4421 path = ocfs2_new_path_from_et(et);
4324 if (!path) { 4422 if (!path) {
4325 ret = -ENOMEM; 4423 ret = -ENOMEM;
4326 mlog_errno(ret); 4424 mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4531 4629
4532 BUG_ON(num_bits > clusters_to_add); 4630 BUG_ON(num_bits > clusters_to_add);
4533 4631
4534 /* reserve our write early -- insert_extent may update the inode */ 4632 /* reserve our write early -- insert_extent may update the tree root */
4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 4633 status = ocfs2_et_root_journal_access(handle, inode, et,
4536 OCFS2_JOURNAL_ACCESS_WRITE); 4634 OCFS2_JOURNAL_ACCESS_WRITE);
4537 if (status < 0) { 4635 if (status < 0) {
4538 mlog_errno(status); 4636 mlog_errno(status);
4539 goto leave; 4637 goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4760 if (path->p_tree_depth) { 4858 if (path->p_tree_depth) {
4761 struct ocfs2_extent_block *eb; 4859 struct ocfs2_extent_block *eb;
4762 4860
4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 4861 ret = ocfs2_read_extent_block(inode,
4764 &last_eb_bh); 4862 ocfs2_et_get_last_eb_blk(et),
4863 &last_eb_bh);
4765 if (ret) { 4864 if (ret) {
4766 mlog_exit(ret); 4865 mlog_exit(ret);
4767 goto out; 4866 goto out;
4768 } 4867 }
4769 4868
4770 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4869 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4771 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4772 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4773 ret = -EROFS;
4774 goto out;
4775 }
4776
4777 rightmost_el = &eb->h_list; 4870 rightmost_el = &eb->h_list;
4778 } else 4871 } else
4779 rightmost_el = path_root_el(path); 4872 rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
4854 if (et->et_ops == &ocfs2_dinode_et_ops) 4947 if (et->et_ops == &ocfs2_dinode_et_ops)
4855 ocfs2_extent_map_trunc(inode, 0); 4948 ocfs2_extent_map_trunc(inode, 0);
4856 4949
4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4950 left_path = ocfs2_new_path_from_et(et);
4858 if (!left_path) { 4951 if (!left_path) {
4859 ret = -ENOMEM; 4952 ret = -ENOMEM;
4860 mlog_errno(ret); 4953 mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4918 5011
4919 depth = path->p_tree_depth; 5012 depth = path->p_tree_depth;
4920 if (depth > 0) { 5013 if (depth > 0) {
4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 5014 ret = ocfs2_read_extent_block(inode,
4922 &last_eb_bh); 5015 ocfs2_et_get_last_eb_blk(et),
5016 &last_eb_bh);
4923 if (ret < 0) { 5017 if (ret < 0) {
4924 mlog_errno(ret); 5018 mlog_errno(ret);
4925 goto out; 5019 goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5025 } 5119 }
5026 5120
5027 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 5121 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5028 left_path = ocfs2_new_path(path_root_bh(path), 5122 left_path = ocfs2_new_path_from_path(path);
5029 path_root_el(path));
5030 if (!left_path) { 5123 if (!left_path) {
5031 ret = -ENOMEM; 5124 ret = -ENOMEM;
5032 mlog_errno(ret); 5125 mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
5135 5228
5136 ocfs2_extent_map_trunc(inode, 0); 5229 ocfs2_extent_map_trunc(inode, 0);
5137 5230
5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 5231 path = ocfs2_new_path_from_et(et);
5139 if (!path) { 5232 if (!path) {
5140 ret = -ENOMEM; 5233 ret = -ENOMEM;
5141 mlog_errno(ret); 5234 mlog_errno(ret);
@@ -5255,6 +5348,78 @@ out:
5255 return ret; 5348 return ret;
5256} 5349}
5257 5350
5351int ocfs2_remove_btree_range(struct inode *inode,
5352 struct ocfs2_extent_tree *et,
5353 u32 cpos, u32 phys_cpos, u32 len,
5354 struct ocfs2_cached_dealloc_ctxt *dealloc)
5355{
5356 int ret;
5357 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5358 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5359 struct inode *tl_inode = osb->osb_tl_inode;
5360 handle_t *handle;
5361 struct ocfs2_alloc_context *meta_ac = NULL;
5362
5363 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5364 if (ret) {
5365 mlog_errno(ret);
5366 return ret;
5367 }
5368
5369 mutex_lock(&tl_inode->i_mutex);
5370
5371 if (ocfs2_truncate_log_needs_flush(osb)) {
5372 ret = __ocfs2_flush_truncate_log(osb);
5373 if (ret < 0) {
5374 mlog_errno(ret);
5375 goto out;
5376 }
5377 }
5378
5379 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5380 if (IS_ERR(handle)) {
5381 ret = PTR_ERR(handle);
5382 mlog_errno(ret);
5383 goto out;
5384 }
5385
5386 ret = ocfs2_et_root_journal_access(handle, inode, et,
5387 OCFS2_JOURNAL_ACCESS_WRITE);
5388 if (ret) {
5389 mlog_errno(ret);
5390 goto out;
5391 }
5392
5393 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5394 dealloc);
5395 if (ret) {
5396 mlog_errno(ret);
5397 goto out_commit;
5398 }
5399
5400 ocfs2_et_update_clusters(inode, et, -len);
5401
5402 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5403 if (ret) {
5404 mlog_errno(ret);
5405 goto out_commit;
5406 }
5407
5408 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5409 if (ret)
5410 mlog_errno(ret);
5411
5412out_commit:
5413 ocfs2_commit_trans(osb, handle);
5414out:
5415 mutex_unlock(&tl_inode->i_mutex);
5416
5417 if (meta_ac)
5418 ocfs2_free_alloc_context(meta_ac);
5419
5420 return ret;
5421}
5422
5258int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5423int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5259{ 5424{
5260 struct buffer_head *tl_bh = osb->osb_tl_bh; 5425 struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5308 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5473 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5309 5474
5310 di = (struct ocfs2_dinode *) tl_bh->b_data; 5475 di = (struct ocfs2_dinode *) tl_bh->b_data;
5311 tl = &di->id2.i_dealloc;
5312 if (!OCFS2_IS_VALID_DINODE(di)) {
5313 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5314 status = -EIO;
5315 goto bail;
5316 }
5317 5476
5477 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5478 * by the underlying call to ocfs2_read_inode_block(), so any
5479 * corruption is a code bug */
5480 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5481
5482 tl = &di->id2.i_dealloc;
5318 tl_count = le16_to_cpu(tl->tl_count); 5483 tl_count = le16_to_cpu(tl->tl_count);
5319 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5484 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5320 tl_count == 0, 5485 tl_count == 0,
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5332 goto bail; 5497 goto bail;
5333 } 5498 }
5334 5499
5335 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5500 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5336 OCFS2_JOURNAL_ACCESS_WRITE); 5501 OCFS2_JOURNAL_ACCESS_WRITE);
5337 if (status < 0) { 5502 if (status < 0) {
5338 mlog_errno(status); 5503 mlog_errno(status);
5339 goto bail; 5504 goto bail;
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5394 while (i >= 0) { 5559 while (i >= 0) {
5395 /* Caller has given us at least enough credits to 5560 /* Caller has given us at least enough credits to
5396 * update the truncate log dinode */ 5561 * update the truncate log dinode */
5397 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5562 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5398 OCFS2_JOURNAL_ACCESS_WRITE); 5563 OCFS2_JOURNAL_ACCESS_WRITE);
5399 if (status < 0) { 5564 if (status < 0) {
5400 mlog_errno(status); 5565 mlog_errno(status);
5401 goto bail; 5566 goto bail;
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5464 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5629 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5465 5630
5466 di = (struct ocfs2_dinode *) tl_bh->b_data; 5631 di = (struct ocfs2_dinode *) tl_bh->b_data;
5467 tl = &di->id2.i_dealloc;
5468 if (!OCFS2_IS_VALID_DINODE(di)) {
5469 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5470 status = -EIO;
5471 goto out;
5472 }
5473 5632
5633 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5634 * by the underlying call to ocfs2_read_inode_block(), so any
5635 * corruption is a code bug */
5636 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5637
5638 tl = &di->id2.i_dealloc;
5474 num_to_flush = le16_to_cpu(tl->tl_used); 5639 num_to_flush = le16_to_cpu(tl->tl_used);
5475 mlog(0, "Flush %u records from truncate log #%llu\n", 5640 mlog(0, "Flush %u records from truncate log #%llu\n",
5476 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5641 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5586 goto bail; 5751 goto bail;
5587 } 5752 }
5588 5753
5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 5754 status = ocfs2_read_inode_block(inode, &bh);
5590 if (status < 0) { 5755 if (status < 0) {
5591 iput(inode); 5756 iput(inode);
5592 mlog_errno(status); 5757 mlog_errno(status);
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5625 } 5790 }
5626 5791
5627 di = (struct ocfs2_dinode *) tl_bh->b_data; 5792 di = (struct ocfs2_dinode *) tl_bh->b_data;
5628 tl = &di->id2.i_dealloc;
5629 if (!OCFS2_IS_VALID_DINODE(di)) {
5630 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5631 status = -EIO;
5632 goto bail;
5633 }
5634 5793
5794 /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
5795 * validated by the underlying call to ocfs2_read_inode_block(),
5796 * so any corruption is a code bug */
5797 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5798
5799 tl = &di->id2.i_dealloc;
5635 if (le16_to_cpu(tl->tl_used)) { 5800 if (le16_to_cpu(tl->tl_used)) {
5636 mlog(0, "We'll have %u logs to recover\n", 5801 mlog(0, "We'll have %u logs to recover\n",
5637 le16_to_cpu(tl->tl_used)); 5802 le16_to_cpu(tl->tl_used));
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5651 * tl_used. */ 5816 * tl_used. */
5652 tl->tl_used = 0; 5817 tl->tl_used = 0;
5653 5818
5819 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
5654 status = ocfs2_write_block(osb, tl_bh, tl_inode); 5820 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5655 if (status < 0) { 5821 if (status < 0) {
5656 mlog_errno(status); 5822 mlog_errno(status);
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5800 */ 5966 */
5801 5967
5802/* 5968/*
5803 * Describes a single block free from a suballocator 5969 * Describe a single bit freed from a suballocator. For the block
5970 * suballocators, it represents one block. For the global cluster
5971 * allocator, it represents some clusters and free_bit indicates
5972 * clusters number.
5804 */ 5973 */
5805struct ocfs2_cached_block_free { 5974struct ocfs2_cached_block_free {
5806 struct ocfs2_cached_block_free *free_next; 5975 struct ocfs2_cached_block_free *free_next;
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list {
5815 struct ocfs2_cached_block_free *f_first; 5984 struct ocfs2_cached_block_free *f_first;
5816}; 5985};
5817 5986
5818static int ocfs2_free_cached_items(struct ocfs2_super *osb, 5987static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5819 int sysfile_type, 5988 int sysfile_type,
5820 int slot, 5989 int slot,
5821 struct ocfs2_cached_block_free *head) 5990 struct ocfs2_cached_block_free *head)
5822{ 5991{
5823 int ret; 5992 int ret;
5824 u64 bg_blkno; 5993 u64 bg_blkno;
@@ -5893,6 +6062,82 @@ out:
5893 return ret; 6062 return ret;
5894} 6063}
5895 6064
6065int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6066 u64 blkno, unsigned int bit)
6067{
6068 int ret = 0;
6069 struct ocfs2_cached_block_free *item;
6070
6071 item = kmalloc(sizeof(*item), GFP_NOFS);
6072 if (item == NULL) {
6073 ret = -ENOMEM;
6074 mlog_errno(ret);
6075 return ret;
6076 }
6077
6078 mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6079 bit, (unsigned long long)blkno);
6080
6081 item->free_blk = blkno;
6082 item->free_bit = bit;
6083 item->free_next = ctxt->c_global_allocator;
6084
6085 ctxt->c_global_allocator = item;
6086 return ret;
6087}
6088
6089static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6090 struct ocfs2_cached_block_free *head)
6091{
6092 struct ocfs2_cached_block_free *tmp;
6093 struct inode *tl_inode = osb->osb_tl_inode;
6094 handle_t *handle;
6095 int ret = 0;
6096
6097 mutex_lock(&tl_inode->i_mutex);
6098
6099 while (head) {
6100 if (ocfs2_truncate_log_needs_flush(osb)) {
6101 ret = __ocfs2_flush_truncate_log(osb);
6102 if (ret < 0) {
6103 mlog_errno(ret);
6104 break;
6105 }
6106 }
6107
6108 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6109 if (IS_ERR(handle)) {
6110 ret = PTR_ERR(handle);
6111 mlog_errno(ret);
6112 break;
6113 }
6114
6115 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6116 head->free_bit);
6117
6118 ocfs2_commit_trans(osb, handle);
6119 tmp = head;
6120 head = head->free_next;
6121 kfree(tmp);
6122
6123 if (ret < 0) {
6124 mlog_errno(ret);
6125 break;
6126 }
6127 }
6128
6129 mutex_unlock(&tl_inode->i_mutex);
6130
6131 while (head) {
6132 /* Premature exit may have left some dangling items. */
6133 tmp = head;
6134 head = head->free_next;
6135 kfree(tmp);
6136 }
6137
6138 return ret;
6139}
6140
5896int ocfs2_run_deallocs(struct ocfs2_super *osb, 6141int ocfs2_run_deallocs(struct ocfs2_super *osb,
5897 struct ocfs2_cached_dealloc_ctxt *ctxt) 6142 struct ocfs2_cached_dealloc_ctxt *ctxt)
5898{ 6143{
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5908 if (fl->f_first) { 6153 if (fl->f_first) {
5909 mlog(0, "Free items: (type %u, slot %d)\n", 6154 mlog(0, "Free items: (type %u, slot %d)\n",
5910 fl->f_inode_type, fl->f_slot); 6155 fl->f_inode_type, fl->f_slot);
5911 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 6156 ret2 = ocfs2_free_cached_blocks(osb,
5912 fl->f_slot, fl->f_first); 6157 fl->f_inode_type,
6158 fl->f_slot,
6159 fl->f_first);
5913 if (ret2) 6160 if (ret2)
5914 mlog_errno(ret2); 6161 mlog_errno(ret2);
5915 if (!ret) 6162 if (!ret)
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5920 kfree(fl); 6167 kfree(fl);
5921 } 6168 }
5922 6169
6170 if (ctxt->c_global_allocator) {
6171 ret2 = ocfs2_free_cached_clusters(osb,
6172 ctxt->c_global_allocator);
6173 if (ret2)
6174 mlog_errno(ret2);
6175 if (!ret)
6176 ret = ret2;
6177
6178 ctxt->c_global_allocator = NULL;
6179 }
6180
5923 return ret; 6181 return ret;
5924} 6182}
5925 6183
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6075 6333
6076 eb = (struct ocfs2_extent_block *) bh->b_data; 6334 eb = (struct ocfs2_extent_block *) bh->b_data;
6077 el = &eb->h_list; 6335 el = &eb->h_list;
6078 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6336
6079 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6337 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6080 ret = -EROFS; 6338 * Any corruption is a code bug. */
6081 goto out; 6339 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6082 }
6083 6340
6084 *new_last_eb = bh; 6341 *new_last_eb = bh;
6085 get_bh(*new_last_eb); 6342 get_bh(*new_last_eb);
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6326 } 6583 }
6327 6584
6328 if (last_eb_bh) { 6585 if (last_eb_bh) {
6329 status = ocfs2_journal_access(handle, inode, last_eb_bh, 6586 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
6330 OCFS2_JOURNAL_ACCESS_WRITE); 6587 OCFS2_JOURNAL_ACCESS_WRITE);
6331 if (status < 0) { 6588 if (status < 0) {
6332 mlog_errno(status); 6589 mlog_errno(status);
6333 goto bail; 6590 goto bail;
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6350 goto bail; 6607 goto bail;
6351 } 6608 }
6352 6609
6610 vfs_dq_free_space_nodirty(inode,
6611 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6353 spin_lock(&OCFS2_I(inode)->ip_lock); 6612 spin_lock(&OCFS2_I(inode)->ip_lock);
6354 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6613 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6355 clusters_to_del; 6614 clusters_to_del;
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6436 mlog_errno(ret); 6695 mlog_errno(ret);
6437 else if (ocfs2_should_order_data(inode)) { 6696 else if (ocfs2_should_order_data(inode)) {
6438 ret = ocfs2_jbd2_file_inode(handle, inode); 6697 ret = ocfs2_jbd2_file_inode(handle, inode);
6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6440 ret = walk_page_buffers(handle, page_buffers(page),
6441 from, to, &partial,
6442 ocfs2_journal_dirty_data);
6443#endif
6444 if (ret < 0) 6698 if (ret < 0)
6445 mlog_errno(ret); 6699 mlog_errno(ret);
6446 } 6700 }
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6663 struct page **pages = NULL; 6917 struct page **pages = NULL;
6664 loff_t end = osb->s_clustersize; 6918 loff_t end = osb->s_clustersize;
6665 struct ocfs2_extent_tree et; 6919 struct ocfs2_extent_tree et;
6920 int did_quota = 0;
6666 6921
6667 has_data = i_size_read(inode) ? 1 : 0; 6922 has_data = i_size_read(inode) ? 1 : 0;
6668 6923
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6682 } 6937 }
6683 } 6938 }
6684 6939
6685 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); 6940 handle = ocfs2_start_trans(osb,
6941 ocfs2_inline_to_extents_credits(osb->sb));
6686 if (IS_ERR(handle)) { 6942 if (IS_ERR(handle)) {
6687 ret = PTR_ERR(handle); 6943 ret = PTR_ERR(handle);
6688 mlog_errno(ret); 6944 mlog_errno(ret);
6689 goto out_unlock; 6945 goto out_unlock;
6690 } 6946 }
6691 6947
6692 ret = ocfs2_journal_access(handle, inode, di_bh, 6948 ret = ocfs2_journal_access_di(handle, inode, di_bh,
6693 OCFS2_JOURNAL_ACCESS_WRITE); 6949 OCFS2_JOURNAL_ACCESS_WRITE);
6694 if (ret) { 6950 if (ret) {
6695 mlog_errno(ret); 6951 mlog_errno(ret);
6696 goto out_commit; 6952 goto out_commit;
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6701 unsigned int page_end; 6957 unsigned int page_end;
6702 u64 phys; 6958 u64 phys;
6703 6959
6960 if (vfs_dq_alloc_space_nodirty(inode,
6961 ocfs2_clusters_to_bytes(osb->sb, 1))) {
6962 ret = -EDQUOT;
6963 goto out_commit;
6964 }
6965 did_quota = 1;
6966
6704 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6967 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6705 &num); 6968 &num);
6706 if (ret) { 6969 if (ret) {
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6774 } 7037 }
6775 7038
6776out_commit: 7039out_commit:
7040 if (ret < 0 && did_quota)
7041 vfs_dq_free_space_nodirty(inode,
7042 ocfs2_clusters_to_bytes(osb->sb, 1));
7043
6777 ocfs2_commit_trans(osb, handle); 7044 ocfs2_commit_trans(osb, handle);
6778 7045
6779out_unlock: 7046out_unlock:
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7080 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6814 i_size_read(inode)); 7081 i_size_read(inode));
6815 7082
6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list); 7083 path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7084 ocfs2_journal_access_di);
6817 if (!path) { 7085 if (!path) {
6818 status = -ENOMEM; 7086 status = -ENOMEM;
6819 mlog_errno(status); 7087 mlog_errno(status);
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7252 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6985 7253
6986 if (fe->id2.i_list.l_tree_depth) { 7254 if (fe->id2.i_list.l_tree_depth) {
6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), 7255 status = ocfs2_read_extent_block(inode,
6988 &last_eb_bh); 7256 le64_to_cpu(fe->i_last_eb_blk),
7257 &last_eb_bh);
6989 if (status < 0) { 7258 if (status < 0) {
6990 mlog_errno(status); 7259 mlog_errno(status);
6991 goto bail; 7260 goto bail;
6992 } 7261 }
6993 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 7262 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6994 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6995 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6996
6997 brelse(last_eb_bh);
6998 status = -EIO;
6999 goto bail;
7000 }
7001 } 7263 }
7002 7264
7003 (*tc)->tc_last_eb_bh = last_eb_bh; 7265 (*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7052 goto out; 7314 goto out;
7053 } 7315 }
7054 7316
7055 ret = ocfs2_journal_access(handle, inode, di_bh, 7317 ret = ocfs2_journal_access_di(handle, inode, di_bh,
7056 OCFS2_JOURNAL_ACCESS_WRITE); 7318 OCFS2_JOURNAL_ACCESS_WRITE);
7057 if (ret) { 7319 if (ret) {
7058 mlog_errno(ret); 7320 mlog_errno(ret);
7059 goto out_commit; 7321 goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
45 * 45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions. 48 * functions. With metadata ecc, we now call different journal_access
49 * functions for each type of metadata, so it must have the
50 * root_journal_access function.
49 * ocfs2_extent_tree_operations abstract the normal operations we do for 51 * ocfs2_extent_tree_operations abstract the normal operations we do for
50 * the root of extent b-tree. 52 * the root of extent b-tree.
51 */ 53 */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
54 struct ocfs2_extent_tree_operations *et_ops; 56 struct ocfs2_extent_tree_operations *et_ops;
55 struct buffer_head *et_root_bh; 57 struct buffer_head *et_root_bh;
56 struct ocfs2_extent_list *et_root_el; 58 struct ocfs2_extent_list *et_root_el;
59 ocfs2_journal_access_func et_root_journal_access;
57 void *et_object; 60 void *et_object;
58 unsigned int et_max_leaf_clusters; 61 unsigned int et_max_leaf_clusters;
59}; 62};
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
68void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 71void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode, 72 struct inode *inode,
70 struct buffer_head *bh); 73 struct buffer_head *bh);
74struct ocfs2_xattr_value_buf;
71void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode, 76 struct inode *inode,
73 struct buffer_head *bh, 77 struct ocfs2_xattr_value_buf *vb);
74 struct ocfs2_xattr_value_root *xv); 78
79/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be
81 * allocated. This is a cached read. The extent block will be validated
82 * with ocfs2_validate_extent_block().
83 */
84int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
85 struct buffer_head **bh);
75 86
76struct ocfs2_alloc_context; 87struct ocfs2_alloc_context;
77int ocfs2_insert_extent(struct ocfs2_super *osb, 88int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
110 u32 cpos, u32 len, handle_t *handle, 121 u32 cpos, u32 len, handle_t *handle,
111 struct ocfs2_alloc_context *meta_ac, 122 struct ocfs2_alloc_context *meta_ac,
112 struct ocfs2_cached_dealloc_ctxt *dealloc); 123 struct ocfs2_cached_dealloc_ctxt *dealloc);
124int ocfs2_remove_btree_range(struct inode *inode,
125 struct ocfs2_extent_tree *et,
126 u32 cpos, u32 phys_cpos, u32 len,
127 struct ocfs2_cached_dealloc_ctxt *dealloc);
128
113int ocfs2_num_free_extents(struct ocfs2_super *osb, 129int ocfs2_num_free_extents(struct ocfs2_super *osb,
114 struct inode *inode, 130 struct inode *inode,
115 struct ocfs2_extent_tree *et); 131 struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
167 */ 183 */
168struct ocfs2_cached_dealloc_ctxt { 184struct ocfs2_cached_dealloc_ctxt {
169 struct ocfs2_per_slot_free_list *c_first_suballocator; 185 struct ocfs2_per_slot_free_list *c_first_suballocator;
186 struct ocfs2_cached_block_free *c_global_allocator;
170}; 187};
171static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) 188static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
172{ 189{
173 c->c_first_suballocator = NULL; 190 c->c_first_suballocator = NULL;
191 c->c_global_allocator = NULL;
192}
193int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
194 u64 blkno, unsigned int bit);
195static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
196{
197 return c->c_global_allocator != NULL;
174} 198}
175int ocfs2_run_deallocs(struct ocfs2_super *osb, 199int ocfs2_run_deallocs(struct ocfs2_super *osb,
176 struct ocfs2_cached_dealloc_ctxt *ctxt); 200 struct ocfs2_cached_dealloc_ctxt *ctxt);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
27#include <linux/swap.h> 27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h> 29#include <linux/mpage.h>
30#include <linux/quotaops.h>
30 31
31#define MLOG_MASK_PREFIX ML_FILE_IO 32#define MLOG_MASK_PREFIX ML_FILE_IO
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
68 goto bail; 69 goto bail;
69 } 70 }
70 71
71 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 72 status = ocfs2_read_inode_block(inode, &bh);
72 if (status < 0) { 73 if (status < 0) {
73 mlog_errno(status); 74 mlog_errno(status);
74 goto bail; 75 goto bail;
75 } 76 }
76 fe = (struct ocfs2_dinode *) bh->b_data; 77 fe = (struct ocfs2_dinode *) bh->b_data;
77 78
78 if (!OCFS2_IS_VALID_DINODE(fe)) {
79 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
80 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
81 fe->i_signature);
82 goto bail;
83 }
84
85 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, 79 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
86 le32_to_cpu(fe->i_clusters))) { 80 le32_to_cpu(fe->i_clusters))) {
87 mlog(ML_ERROR, "block offset is outside the allocated size: " 81 mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
262 BUG_ON(!PageLocked(page)); 256 BUG_ON(!PageLocked(page));
263 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 257 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
264 258
265 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 259 ret = ocfs2_read_inode_block(inode, &di_bh);
266 if (ret) { 260 if (ret) {
267 mlog_errno(ret); 261 mlog_errno(ret);
268 goto out; 262 goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
481 475
482 if (ocfs2_should_order_data(inode)) { 476 if (ocfs2_should_order_data(inode)) {
483 ret = ocfs2_jbd2_file_inode(handle, inode); 477 ret = ocfs2_jbd2_file_inode(handle, inode);
484#ifdef CONFIG_OCFS2_COMPAT_JBD
485 ret = walk_page_buffers(handle,
486 page_buffers(page),
487 from, to, NULL,
488 ocfs2_journal_dirty_data);
489#endif
490 if (ret < 0) 478 if (ret < 0)
491 mlog_errno(ret); 479 mlog_errno(ret);
492 } 480 }
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
1072 tmppage = wc->w_pages[i]; 1060 tmppage = wc->w_pages[i];
1073 1061
1074 if (page_has_buffers(tmppage)) { 1062 if (page_has_buffers(tmppage)) {
1075 if (ocfs2_should_order_data(inode)) { 1063 if (ocfs2_should_order_data(inode))
1076 ocfs2_jbd2_file_inode(wc->w_handle, inode); 1064 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1077#ifdef CONFIG_OCFS2_COMPAT_JBD
1078 walk_page_buffers(wc->w_handle,
1079 page_buffers(tmppage),
1080 from, to, NULL,
1081 ocfs2_journal_dirty_data);
1082#endif
1083 }
1084 1065
1085 block_commit_write(tmppage, from, to); 1066 block_commit_write(tmppage, from, to);
1086 } 1067 }
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1531 goto out; 1512 goto out;
1532 } 1513 }
1533 1514
1534 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, 1515 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
1535 OCFS2_JOURNAL_ACCESS_WRITE); 1516 OCFS2_JOURNAL_ACCESS_WRITE);
1536 if (ret) { 1517 if (ret) {
1537 ocfs2_commit_trans(osb, handle); 1518 ocfs2_commit_trans(osb, handle);
1538 1519
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1750 1731
1751 wc->w_handle = handle; 1732 wc->w_handle = handle;
1752 1733
1734 if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
1735 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
1736 ret = -EDQUOT;
1737 goto out_commit;
1738 }
1753 /* 1739 /*
1754 * We don't want this to fail in ocfs2_write_end(), so do it 1740 * We don't want this to fail in ocfs2_write_end(), so do it
1755 * here. 1741 * here.
1756 */ 1742 */
1757 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, 1743 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
1758 OCFS2_JOURNAL_ACCESS_WRITE); 1744 OCFS2_JOURNAL_ACCESS_WRITE);
1759 if (ret) { 1745 if (ret) {
1760 mlog_errno(ret); 1746 mlog_errno(ret);
1761 goto out_commit; 1747 goto out_quota;
1762 } 1748 }
1763 1749
1764 /* 1750 /*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1771 mmap_page); 1757 mmap_page);
1772 if (ret) { 1758 if (ret) {
1773 mlog_errno(ret); 1759 mlog_errno(ret);
1774 goto out_commit; 1760 goto out_quota;
1775 } 1761 }
1776 1762
1777 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, 1763 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1778 len); 1764 len);
1779 if (ret) { 1765 if (ret) {
1780 mlog_errno(ret); 1766 mlog_errno(ret);
1781 goto out_commit; 1767 goto out_quota;
1782 } 1768 }
1783 1769
1784 if (data_ac) 1770 if (data_ac)
@@ -1790,6 +1776,10 @@ success:
1790 *pagep = wc->w_target_page; 1776 *pagep = wc->w_target_page;
1791 *fsdata = wc; 1777 *fsdata = wc;
1792 return 0; 1778 return 0;
1779out_quota:
1780 if (clusters_to_alloc)
1781 vfs_dq_free_space(inode,
1782 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1793out_commit: 1783out_commit:
1794 ocfs2_commit_trans(osb, handle); 1784 ocfs2_commit_trans(osb, handle);
1795 1785
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1919 } 1909 }
1920 1910
1921 if (page_has_buffers(tmppage)) { 1911 if (page_has_buffers(tmppage)) {
1922 if (ocfs2_should_order_data(inode)) { 1912 if (ocfs2_should_order_data(inode))
1923 ocfs2_jbd2_file_inode(wc->w_handle, inode); 1913 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1924#ifdef CONFIG_OCFS2_COMPAT_JBD
1925 walk_page_buffers(wc->w_handle,
1926 page_buffers(tmppage),
1927 from, to, NULL,
1928 ocfs2_journal_dirty_data);
1929#endif
1930 }
1931 block_commit_write(tmppage, from, to); 1914 block_commit_write(tmppage, from, to);
1932 } 1915 }
1933 } 1916 }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.c
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2006, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/kernel.h>
21#include <linux/types.h>
22#include <linux/crc32.h>
23#include <linux/buffer_head.h>
24#include <linux/bitops.h>
25#include <asm/byteorder.h>
26
27#include <cluster/masklog.h>
28
29#include "ocfs2.h"
30
31#include "blockcheck.h"
32
33
34/*
35 * We use the following conventions:
36 *
37 * d = # data bits
38 * p = # parity bits
39 * c = # total code bits (d + p)
40 */
41
42
43/*
44 * Calculate the bit offset in the hamming code buffer based on the bit's
45 * offset in the data buffer. Since the hamming code reserves all
46 * power-of-two bits for parity, the data bit number and the code bit
47 * number are offest by all the parity bits beforehand.
48 *
49 * Recall that bit numbers in hamming code are 1-based. This function
50 * takes the 0-based data bit from the caller.
51 *
52 * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
53 * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
54 * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
55 * in the code buffer.
56 *
57 * The caller can pass in *p if it wants to keep track of the most recent
58 * number of parity bits added. This allows the function to start the
59 * calculation at the last place.
60 */
61static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
62{
63 unsigned int b, p = 0;
64
65 /*
66 * Data bits are 0-based, but we're talking code bits, which
67 * are 1-based.
68 */
69 b = i + 1;
70
71 /* Use the cache if it is there */
72 if (p_cache)
73 p = *p_cache;
74 b += p;
75
76 /*
77 * For every power of two below our bit number, bump our bit.
78 *
79 * We compare with (b + 1) because we have to compare with what b
80 * would be _if_ it were bumped up by the parity bit. Capice?
81 *
82 * p is set above.
83 */
84 for (; (1 << p) < (b + 1); p++)
85 b++;
86
87 if (p_cache)
88 *p_cache = p;
89
90 return b;
91}
92
93/*
94 * This is the low level encoder function. It can be called across
95 * multiple hunks just like the crc32 code. 'd' is the number of bits
96 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
97 * two 512B buffers, you would do it like so:
98 *
99 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
100 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
101 *
102 * If you just have one buffer, use ocfs2_hamming_encode_block().
103 */
104u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
105{
106 unsigned int i, b, p = 0;
107
108 BUG_ON(!d);
109
110 /*
111 * b is the hamming code bit number. Hamming code specifies a
112 * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
113 * for the algorithm.
114 *
115 * The i++ in the for loop is so that the start offset passed
116 * to ocfs2_find_next_bit_set() is one greater than the previously
117 * found bit.
118 */
119 for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
120 {
121 /*
122 * i is the offset in this hunk, nr + i is the total bit
123 * offset.
124 */
125 b = calc_code_bit(nr + i, &p);
126
127 /*
128 * Data bits in the resultant code are checked by
129 * parity bits that are part of the bit number
130 * representation. Huh?
131 *
132 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
133 * In other words, the parity bit at position 2^k
134 * checks bits in positions having bit k set in
135 * their binary representation. Conversely, for
136 * instance, bit 13, i.e. 1101(2), is checked by
137 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
138 * </wikipedia>
139 *
140 * Note that 'k' is the _code_ bit number. 'b' in
141 * our loop.
142 */
143 parity ^= b;
144 }
145
146 /* While the data buffer was treated as little endian, the
147 * return value is in host endian. */
148 return parity;
149}
150
151u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
152{
153 return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
154}
155
156/*
157 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
158 * offset of the current hunk. If bit to be fixed is not part of the
159 * current hunk, this does nothing.
160 *
161 * If you only have one hunk, use ocfs2_hamming_fix_block().
162 */
163void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
164 unsigned int fix)
165{
166 unsigned int i, b;
167
168 BUG_ON(!d);
169
170 /*
171 * If the bit to fix has an hweight of 1, it's a parity bit. One
172 * busted parity bit is its own error. Nothing to do here.
173 */
174 if (hweight32(fix) == 1)
175 return;
176
177 /*
178 * nr + d is the bit right past the data hunk we're looking at.
179 * If fix after that, nothing to do
180 */
181 if (fix >= calc_code_bit(nr + d, NULL))
182 return;
183
184 /*
185 * nr is the offset in the data hunk we're starting at. Let's
186 * start b at the offset in the code buffer. See hamming_encode()
187 * for a more detailed description of 'b'.
188 */
189 b = calc_code_bit(nr, NULL);
190 /* If the fix is before this hunk, nothing to do */
191 if (fix < b)
192 return;
193
194 for (i = 0; i < d; i++, b++)
195 {
196 /* Skip past parity bits */
197 while (hweight32(b) == 1)
198 b++;
199
200 /*
201 * i is the offset in this data hunk.
202 * nr + i is the offset in the total data buffer.
203 * b is the offset in the total code buffer.
204 *
205 * Thus, when b == fix, bit i in the current hunk needs
206 * fixing.
207 */
208 if (b == fix)
209 {
210 if (ocfs2_test_bit(i, data))
211 ocfs2_clear_bit(i, data);
212 else
213 ocfs2_set_bit(i, data);
214 break;
215 }
216 }
217}
218
219void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
220 unsigned int fix)
221{
222 ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
223}
224
225/*
226 * This function generates check information for a block.
227 * data is the block to be checked. bc is a pointer to the
228 * ocfs2_block_check structure describing the crc32 and the ecc.
229 *
230 * bc should be a pointer inside data, as the function will
231 * take care of zeroing it before calculating the check information. If
232 * bc does not point inside data, the caller must make sure any inline
233 * ocfs2_block_check structures are zeroed.
234 *
235 * The data buffer must be in on-disk endian (little endian for ocfs2).
236 * bc will be filled with little-endian values and will be ready to go to
237 * disk.
238 */
239void ocfs2_block_check_compute(void *data, size_t blocksize,
240 struct ocfs2_block_check *bc)
241{
242 u32 crc;
243 u32 ecc;
244
245 memset(bc, 0, sizeof(struct ocfs2_block_check));
246
247 crc = crc32_le(~0, data, blocksize);
248 ecc = ocfs2_hamming_encode_block(data, blocksize);
249
250 /*
251 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
252 * larger than 16 bits.
253 */
254 BUG_ON(ecc > USHORT_MAX);
255
256 bc->bc_crc32e = cpu_to_le32(crc);
257 bc->bc_ecc = cpu_to_le16((u16)ecc);
258}
259
260/*
261 * This function validates existing check information. Like _compute,
262 * the function will take care of zeroing bc before calculating check codes.
263 * If bc is not a pointer inside data, the caller must have zeroed any
264 * inline ocfs2_block_check structures.
265 *
266 * Again, the data passed in should be the on-disk endian.
267 */
268int ocfs2_block_check_validate(void *data, size_t blocksize,
269 struct ocfs2_block_check *bc)
270{
271 int rc = 0;
272 struct ocfs2_block_check check;
273 u32 crc, ecc;
274
275 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
276 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
277
278 memset(bc, 0, sizeof(struct ocfs2_block_check));
279
280 /* Fast path - if the crc32 validates, we're good to go */
281 crc = crc32_le(~0, data, blocksize);
282 if (crc == check.bc_crc32e)
283 goto out;
284
285 mlog(ML_ERROR,
286 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
287 (unsigned int)check.bc_crc32e, (unsigned int)crc);
288
289 /* Ok, try ECC fixups */
290 ecc = ocfs2_hamming_encode_block(data, blocksize);
291 ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
292
293 /* And check the crc32 again */
294 crc = crc32_le(~0, data, blocksize);
295 if (crc == check.bc_crc32e)
296 goto out;
297
298 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
299 (unsigned int)check.bc_crc32e, (unsigned int)crc);
300
301 rc = -EIO;
302
303out:
304 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
305 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
306
307 return rc;
308}
309
310/*
311 * This function generates check information for a list of buffer_heads.
312 * bhs is the blocks to be checked. bc is a pointer to the
313 * ocfs2_block_check structure describing the crc32 and the ecc.
314 *
315 * bc should be a pointer inside data, as the function will
316 * take care of zeroing it before calculating the check information. If
317 * bc does not point inside data, the caller must make sure any inline
318 * ocfs2_block_check structures are zeroed.
319 *
320 * The data buffer must be in on-disk endian (little endian for ocfs2).
321 * bc will be filled with little-endian values and will be ready to go to
322 * disk.
323 */
324void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
325 struct ocfs2_block_check *bc)
326{
327 int i;
328 u32 crc, ecc;
329
330 BUG_ON(nr < 0);
331
332 if (!nr)
333 return;
334
335 memset(bc, 0, sizeof(struct ocfs2_block_check));
336
337 for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
338 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
339 /*
340 * The number of bits in a buffer is obviously b_size*8.
341 * The offset of this buffer is b_size*i, so the bit offset
342 * of this buffer is b_size*8*i.
343 */
344 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
345 bhs[i]->b_size * 8,
346 bhs[i]->b_size * 8 * i);
347 }
348
349 /*
350 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
351 * larger than 16 bits.
352 */
353 BUG_ON(ecc > USHORT_MAX);
354
355 bc->bc_crc32e = cpu_to_le32(crc);
356 bc->bc_ecc = cpu_to_le16((u16)ecc);
357}
358
359/*
360 * This function validates existing check information on a list of
361 * buffer_heads. Like _compute_bhs, the function will take care of
362 * zeroing bc before calculating check codes. If bc is not a pointer
363 * inside data, the caller must have zeroed any inline
364 * ocfs2_block_check structures.
365 *
366 * Again, the data passed in should be the on-disk endian.
367 */
368int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
369 struct ocfs2_block_check *bc)
370{
371 int i, rc = 0;
372 struct ocfs2_block_check check;
373 u32 crc, ecc, fix;
374
375 BUG_ON(nr < 0);
376
377 if (!nr)
378 return 0;
379
380 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
381 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
382
383 memset(bc, 0, sizeof(struct ocfs2_block_check));
384
385 /* Fast path - if the crc32 validates, we're good to go */
386 for (i = 0, crc = ~0; i < nr; i++)
387 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
388 if (crc == check.bc_crc32e)
389 goto out;
390
391 mlog(ML_ERROR,
392 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
393 (unsigned int)check.bc_crc32e, (unsigned int)crc);
394
395 /* Ok, try ECC fixups */
396 for (i = 0, ecc = 0; i < nr; i++) {
397 /*
398 * The number of bits in a buffer is obviously b_size*8.
399 * The offset of this buffer is b_size*i, so the bit offset
400 * of this buffer is b_size*8*i.
401 */
402 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
403 bhs[i]->b_size * 8,
404 bhs[i]->b_size * 8 * i);
405 }
406 fix = ecc ^ check.bc_ecc;
407 for (i = 0; i < nr; i++) {
408 /*
409 * Try the fix against each buffer. It will only affect
410 * one of them.
411 */
412 ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
413 bhs[i]->b_size * 8 * i, fix);
414 }
415
416 /* And check the crc32 again */
417 for (i = 0, crc = ~0; i < nr; i++)
418 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
419 if (crc == check.bc_crc32e)
420 goto out;
421
422 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
423 (unsigned int)check.bc_crc32e, (unsigned int)crc);
424
425 rc = -EIO;
426
427out:
428 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
429 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
430
431 return rc;
432}
433
434/*
435 * These are the main API. They check the superblock flag before
436 * calling the underlying operations.
437 *
438 * They expect the buffer(s) to be in disk format.
439 */
440void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
441 struct ocfs2_block_check *bc)
442{
443 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
444 ocfs2_block_check_compute(data, sb->s_blocksize, bc);
445}
446
447int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
448 struct ocfs2_block_check *bc)
449{
450 int rc = 0;
451
452 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
453 rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
454
455 return rc;
456}
457
458void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
459 struct buffer_head **bhs, int nr,
460 struct ocfs2_block_check *bc)
461{
462 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
463 ocfs2_block_check_compute_bhs(bhs, nr, bc);
464}
465
466int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
467 struct buffer_head **bhs, int nr,
468 struct ocfs2_block_check *bc)
469{
470 int rc = 0;
471
472 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
473 rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
474
475 return rc;
476}
477
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.h
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_BLOCKCHECK_H
21#define OCFS2_BLOCKCHECK_H
22
23
24/* High level block API */
25void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
26 struct ocfs2_block_check *bc);
27int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
28 struct ocfs2_block_check *bc);
29void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
30 struct buffer_head **bhs, int nr,
31 struct ocfs2_block_check *bc);
32int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
33 struct buffer_head **bhs, int nr,
34 struct ocfs2_block_check *bc);
35
36/* Lower level API */
37void ocfs2_block_check_compute(void *data, size_t blocksize,
38 struct ocfs2_block_check *bc);
39int ocfs2_block_check_validate(void *data, size_t blocksize,
40 struct ocfs2_block_check *bc);
41void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
42 struct ocfs2_block_check *bc);
43int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
44 struct ocfs2_block_check *bc);
45
46/*
47 * Hamming code functions
48 */
49
50/*
51 * Encoding hamming code parity bits for a buffer.
52 *
53 * This is the low level encoder function. It can be called across
54 * multiple hunks just like the crc32 code. 'd' is the number of bits
55 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
56 * two 512B buffers, you would do it like so:
57 *
58 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
59 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
60 *
61 * If you just have one buffer, use ocfs2_hamming_encode_block().
62 */
63u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
64 unsigned int nr);
65/*
66 * Fix a buffer with a bit error. The 'fix' is the original parity
67 * xor'd with the parity calculated now.
68 *
69 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
70 * offset of the current hunk. If bit to be fixed is not part of the
71 * current hunk, this does nothing.
72 *
73 * If you only have one buffer, use ocfs2_hamming_fix_block().
74 */
75void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
76 unsigned int fix);
77
78/* Convenience wrappers for a single buffer of data */
79extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
80extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
81 unsigned int fix);
82#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
39 39
40#include "buffer_head_io.h" 40#include "buffer_head_io.h"
41 41
42/*
43 * Bits on bh->b_state used by ocfs2.
44 *
45 * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart.
46 */
47enum ocfs2_state_bits {
48 BH_NeedsValidate = BH_JBDPrivateStart,
49};
50
51/* Expand the magic b_state functions */
52BUFFER_FNS(NeedsValidate, needs_validate);
53
42int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, 54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
43 struct inode *inode) 55 struct inode *inode)
44{ 56{
@@ -166,7 +178,9 @@ bail:
166} 178}
167 179
168int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 180int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
169 struct buffer_head *bhs[], int flags) 181 struct buffer_head *bhs[], int flags,
182 int (*validate)(struct super_block *sb,
183 struct buffer_head *bh))
170{ 184{
171 int status = 0; 185 int status = 0;
172 int i, ignore_cache = 0; 186 int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
298 312
299 clear_buffer_uptodate(bh); 313 clear_buffer_uptodate(bh);
300 get_bh(bh); /* for end_buffer_read_sync() */ 314 get_bh(bh); /* for end_buffer_read_sync() */
315 if (validate)
316 set_buffer_needs_validate(bh);
301 bh->b_end_io = end_buffer_read_sync; 317 bh->b_end_io = end_buffer_read_sync;
302 submit_bh(READ, bh); 318 submit_bh(READ, bh);
303 continue; 319 continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
328 bhs[i] = NULL; 344 bhs[i] = NULL;
329 continue; 345 continue;
330 } 346 }
347
348 if (buffer_needs_validate(bh)) {
349 /* We never set NeedsValidate if the
350 * buffer was held by the journal, so
351 * that better not have changed */
352 BUG_ON(buffer_jbd(bh));
353 clear_buffer_needs_validate(bh);
354 status = validate(inode->i_sb, bh);
355 if (status) {
356 put_bh(bh);
357 bhs[i] = NULL;
358 continue;
359 }
360 }
331 } 361 }
332 362
333 /* Always set the buffer in the cache, even if it was 363 /* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh, 31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate); 32 int uptodate);
33 33
34static inline int ocfs2_read_block(struct inode *inode,
35 u64 off,
36 struct buffer_head **bh);
37
38int ocfs2_write_block(struct ocfs2_super *osb, 34int ocfs2_write_block(struct ocfs2_super *osb,
39 struct buffer_head *bh, 35 struct buffer_head *bh,
40 struct inode *inode); 36 struct inode *inode);
41int ocfs2_read_blocks(struct inode *inode,
42 u64 block,
43 int nr,
44 struct buffer_head *bhs[],
45 int flags);
46int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, 37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
47 unsigned int nr, struct buffer_head *bhs[]); 38 unsigned int nr, struct buffer_head *bhs[]);
48 39
40/*
41 * If not NULL, validate() will be called on a buffer that is freshly
42 * read from disk. It will not be called if the buffer was in cache.
43 * Note that if validate() is being used for this buffer, it needs to
44 * be set even for a READAHEAD call, as it marks the buffer for later
45 * validation.
46 */
47int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
48 struct buffer_head *bhs[], int flags,
49 int (*validate)(struct super_block *sb,
50 struct buffer_head *bh));
51
49int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 52int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
50 struct buffer_head *bh); 53 struct buffer_head *bh);
51 54
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
53#define OCFS2_BH_READAHEAD 8 56#define OCFS2_BH_READAHEAD 8
54 57
55static inline int ocfs2_read_block(struct inode *inode, u64 off, 58static inline int ocfs2_read_block(struct inode *inode, u64 off,
56 struct buffer_head **bh) 59 struct buffer_head **bh,
60 int (*validate)(struct super_block *sb,
61 struct buffer_head *bh))
57{ 62{
58 int status = 0; 63 int status = 0;
59 64
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
63 goto bail; 68 goto bail;
64 } 69 }
65 70
66 status = ocfs2_read_blocks(inode, off, 1, bh, 0); 71 status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
67 72
68bail: 73bail:
69 return status; 74 return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c03..04697ba7f73e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
854 854
855 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 855 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
856 /* We track the time spent inside 856 /* We track the time spent inside
857 * o2hb_do_disk_heartbeat so that we avoid more then 857 * o2hb_do_disk_heartbeat so that we avoid more than
858 * hr_timeout_ms between disk writes. On busy systems 858 * hr_timeout_ms between disk writes. On busy systems
859 * this should result in a heartbeat which is less 859 * this should result in a heartbeat which is less
860 * likely to time itself out. */ 860 * likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
110 define_mask(QUORUM), 110 define_mask(QUORUM),
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA),
113 define_mask(ERROR), 114 define_mask(ERROR),
114 define_mask(NOTICE), 115 define_mask(NOTICE),
115 define_mask(KTHREAD), 116 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ 113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
115#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 115#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
116#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116/* bits that are infrequently given and frequently matched in the high word */ 117/* bits that are infrequently given and frequently matched in the high word */
117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
 118#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 119#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h>
43 44
44#define MLOG_MASK_PREFIX ML_NAMEI 45#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -47,6 +48,7 @@
47#include "ocfs2.h" 48#include "ocfs2.h"
48 49
49#include "alloc.h" 50#include "alloc.h"
51#include "blockcheck.h"
50#include "dir.h" 52#include "dir.h"
51#include "dlmglue.h" 53#include "dlmglue.h"
52#include "extent_map.h" 54#include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
82 struct ocfs2_alloc_context *meta_ac, 84 struct ocfs2_alloc_context *meta_ac,
83 struct buffer_head **new_bh); 85 struct buffer_head **new_bh);
84 86
85static struct buffer_head *ocfs2_bread(struct inode *inode, 87/*
86 int block, int *err, int reada) 88 * These are distinct checks because future versions of the file system will
89 * want to have a trailing dirent structure independent of indexing.
90 */
91static int ocfs2_dir_has_trailer(struct inode *dir)
87{ 92{
88 struct buffer_head *bh = NULL; 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
89 int tmperr; 94 return 0;
90 u64 p_blkno;
91 int readflags = 0;
92 95
93 if (reada) 96 return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
94 readflags |= OCFS2_BH_READAHEAD; 97}
95 98
96 if (((u64)block << inode->i_sb->s_blocksize_bits) >= 99static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
97 i_size_read(inode)) { 100{
98 BUG_ON(!reada); 101 return ocfs2_meta_ecc(osb);
99 return NULL; 102}
100 }
101 103
102 down_read(&OCFS2_I(inode)->ip_alloc_sem); 104static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
103 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, 105{
104 NULL); 106 return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
105 up_read(&OCFS2_I(inode)->ip_alloc_sem); 107}
106 if (tmperr < 0) {
107 mlog_errno(tmperr);
108 goto fail;
109 }
110 108
111 tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); 109#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
112 if (tmperr < 0)
113 goto fail;
114 110
115 tmperr = 0; 111/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
112 * them more consistent? */
113struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
114 void *data)
115{
116 char *p = data;
116 117
117 *err = 0; 118 p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
118 return bh; 119 return (struct ocfs2_dir_block_trailer *)p;
120}
119 121
120fail: 122/*
121 brelse(bh); 123 * XXX: This is executed once on every dirent. We should consider optimizing
122 bh = NULL; 124 * it.
125 */
126static int ocfs2_skip_dir_trailer(struct inode *dir,
127 struct ocfs2_dir_entry *de,
128 unsigned long offset,
129 unsigned long blklen)
130{
131 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
123 132
124 *err = -EIO; 133 if (!ocfs2_dir_has_trailer(dir))
125 return NULL; 134 return 0;
135
136 if (offset != toff)
137 return 0;
138
139 return 1;
140}
141
142static void ocfs2_init_dir_trailer(struct inode *inode,
143 struct buffer_head *bh)
144{
145 struct ocfs2_dir_block_trailer *trailer;
146
147 trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
148 strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
149 trailer->db_compat_rec_len =
150 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
151 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
152 trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
126} 153}
127 154
128/* 155/*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
231 struct ocfs2_dinode *di; 258 struct ocfs2_dinode *di;
232 struct ocfs2_inline_data *data; 259 struct ocfs2_inline_data *data;
233 260
234 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); 261 ret = ocfs2_read_inode_block(dir, &di_bh);
235 if (ret) { 262 if (ret) {
236 mlog_errno(ret); 263 mlog_errno(ret);
237 goto out; 264 goto out;
@@ -250,6 +277,108 @@ out:
250 return NULL; 277 return NULL;
251} 278}
252 279
280static int ocfs2_validate_dir_block(struct super_block *sb,
281 struct buffer_head *bh)
282{
283 int rc;
284 struct ocfs2_dir_block_trailer *trailer =
285 ocfs2_trailer_from_bh(bh, sb);
286
287
288 /*
289 * We don't validate dirents here, that's handled
290 * in-place when the code walks them.
291 */
292 mlog(0, "Validating dirblock %llu\n",
293 (unsigned long long)bh->b_blocknr);
294
295 BUG_ON(!buffer_uptodate(bh));
296
297 /*
298 * If the ecc fails, we return the error but otherwise
299 * leave the filesystem running. We know any error is
300 * local to this block.
301 *
302 * Note that we are safe to call this even if the directory
303 * doesn't have a trailer. Filesystems without metaecc will do
304 * nothing, and filesystems with it will have one.
305 */
306 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
307 if (rc)
 308 mlog(ML_ERROR, "Checksum failed for dir block %llu\n",
309 (unsigned long long)bh->b_blocknr);
310
311 return rc;
312}
313
314/*
315 * This function forces all errors to -EIO for consistency with its
316 * predecessor, ocfs2_bread(). We haven't audited what returning the
317 * real error codes would do to callers. We log the real codes with
318 * mlog_errno() before we squash them.
319 */
320static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
321 struct buffer_head **bh, int flags)
322{
323 int rc = 0;
324 struct buffer_head *tmp = *bh;
325 struct ocfs2_dir_block_trailer *trailer;
326
327 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
328 ocfs2_validate_dir_block);
329 if (rc) {
330 mlog_errno(rc);
331 goto out;
332 }
333
334 /*
335 * We check the trailer here rather than in
336 * ocfs2_validate_dir_block() because that function doesn't have
337 * the inode to test.
338 */
339 if (!(flags & OCFS2_BH_READAHEAD) &&
340 ocfs2_dir_has_trailer(inode)) {
341 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
342 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
343 rc = -EINVAL;
344 ocfs2_error(inode->i_sb,
345 "Invalid dirblock #%llu: "
346 "signature = %.*s\n",
347 (unsigned long long)tmp->b_blocknr, 7,
348 trailer->db_signature);
349 goto out;
350 }
351 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
352 rc = -EINVAL;
353 ocfs2_error(inode->i_sb,
354 "Directory block #%llu has an invalid "
355 "db_blkno of %llu",
356 (unsigned long long)tmp->b_blocknr,
357 (unsigned long long)le64_to_cpu(trailer->db_blkno));
358 goto out;
359 }
360 if (le64_to_cpu(trailer->db_parent_dinode) !=
361 OCFS2_I(inode)->ip_blkno) {
362 rc = -EINVAL;
363 ocfs2_error(inode->i_sb,
364 "Directory block #%llu on dinode "
365 "#%llu has an invalid parent_dinode "
366 "of %llu",
367 (unsigned long long)tmp->b_blocknr,
368 (unsigned long long)OCFS2_I(inode)->ip_blkno,
 369 (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
370 goto out;
371 }
372 }
373
374 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
375 if (!*bh)
376 *bh = tmp;
377
378out:
379 return rc ? -EIO : 0;
380}
381
253static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 382static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
254 struct inode *dir, 383 struct inode *dir,
255 struct ocfs2_dir_entry **res_dir) 384 struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
296 } 425 }
297 num++; 426 num++;
298 427
299 bh = ocfs2_bread(dir, b++, &err, 1); 428 bh = NULL;
429 err = ocfs2_read_dir_block(dir, b++, &bh,
430 OCFS2_BH_READAHEAD);
300 bh_use[ra_max] = bh; 431 bh_use[ra_max] = bh;
301 } 432 }
302 } 433 }
303 if ((bh = bh_use[ra_ptr++]) == NULL) 434 if ((bh = bh_use[ra_ptr++]) == NULL)
304 goto next; 435 goto next;
305 if (ocfs2_read_block(dir, block, &bh)) { 436 if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
306 /* read error, skip block & hope for the best. 437 /* read error, skip block & hope for the best.
307 * ocfs2_read_block() has released the bh. */ 438 * ocfs2_read_dir_block() has released the bh. */
308 ocfs2_error(dir->i_sb, "reading directory %llu, " 439 ocfs2_error(dir->i_sb, "reading directory %llu, "
309 "offset %lu\n", 440 "offset %lu\n",
310 (unsigned long long)OCFS2_I(dir)->ip_blkno, 441 (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
381 struct inode *new_entry_inode) 512 struct inode *new_entry_inode)
382{ 513{
383 int ret; 514 int ret;
515 ocfs2_journal_access_func access = ocfs2_journal_access_db;
384 516
385 /* 517 /*
386 * The same code works fine for both inline-data and extent 518 * The same code works fine for both inline-data and extent
387 * based directories, so no need to split this up. 519 * based directories, so no need to split this up. The only
520 * difference is the journal_access function.
388 */ 521 */
389 522
390 ret = ocfs2_journal_access(handle, dir, de_bh, 523 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
391 OCFS2_JOURNAL_ACCESS_WRITE); 524 access = ocfs2_journal_access_di;
525
526 ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
392 if (ret) { 527 if (ret) {
393 mlog_errno(ret); 528 mlog_errno(ret);
394 goto out; 529 goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
410{ 545{
411 struct ocfs2_dir_entry *de, *pde; 546 struct ocfs2_dir_entry *de, *pde;
412 int i, status = -ENOENT; 547 int i, status = -ENOENT;
548 ocfs2_journal_access_func access = ocfs2_journal_access_db;
413 549
414 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); 550 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
415 551
552 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
553 access = ocfs2_journal_access_di;
554
416 i = 0; 555 i = 0;
417 pde = NULL; 556 pde = NULL;
418 de = (struct ocfs2_dir_entry *) first_de; 557 de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
423 goto bail; 562 goto bail;
424 } 563 }
425 if (de == de_del) { 564 if (de == de_del) {
426 status = ocfs2_journal_access(handle, dir, bh, 565 status = access(handle, dir, bh,
427 OCFS2_JOURNAL_ACCESS_WRITE); 566 OCFS2_JOURNAL_ACCESS_WRITE);
428 if (status < 0) { 567 if (status < 0) {
429 status = -EIO; 568 status = -EIO;
430 mlog_errno(status); 569 mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
458 struct ocfs2_dinode *di; 597 struct ocfs2_dinode *di;
459 struct ocfs2_inline_data *data; 598 struct ocfs2_inline_data *data;
460 599
461 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); 600 ret = ocfs2_read_inode_block(dir, &di_bh);
462 if (ret) { 601 if (ret) {
463 mlog_errno(ret); 602 mlog_errno(ret);
464 goto out; 603 goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
576 goto bail; 715 goto bail;
577 } 716 }
578 717
718 /* We're guaranteed that we should have space, so we
719 * can't possibly have hit the trailer...right? */
720 mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
721 "Hit dir trailer trying to insert %.*s "
722 "(namelen %d) into directory %llu. "
723 "offset is %lu, trailer offset is %d\n",
724 namelen, name, namelen,
725 (unsigned long long)parent_fe_bh->b_blocknr,
726 offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
727
579 if (ocfs2_dirent_would_fit(de, rec_len)) { 728 if (ocfs2_dirent_would_fit(de, rec_len)) {
580 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 729 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
581 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 730 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
584 goto bail; 733 goto bail;
585 } 734 }
586 735
587 status = ocfs2_journal_access(handle, dir, insert_bh, 736 if (insert_bh == parent_fe_bh)
588 OCFS2_JOURNAL_ACCESS_WRITE); 737 status = ocfs2_journal_access_di(handle, dir,
738 insert_bh,
739 OCFS2_JOURNAL_ACCESS_WRITE);
740 else
741 status = ocfs2_journal_access_db(handle, dir,
742 insert_bh,
743 OCFS2_JOURNAL_ACCESS_WRITE);
589 /* By now the buffer is marked for journaling */ 744 /* By now the buffer is marked for journaling */
590 offset += le16_to_cpu(de->rec_len); 745 offset += le16_to_cpu(de->rec_len);
591 if (le64_to_cpu(de->inode)) { 746 if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
611 retval = 0; 766 retval = 0;
612 goto bail; 767 goto bail;
613 } 768 }
769
614 offset += le16_to_cpu(de->rec_len); 770 offset += le16_to_cpu(de->rec_len);
615 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); 771 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
616 } 772 }
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
636 struct ocfs2_inline_data *data; 792 struct ocfs2_inline_data *data;
637 struct ocfs2_dir_entry *de; 793 struct ocfs2_dir_entry *de;
638 794
639 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 795 ret = ocfs2_read_inode_block(inode, &di_bh);
640 if (ret) { 796 if (ret) {
641 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", 797 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
642 (unsigned long long)OCFS2_I(inode)->ip_blkno); 798 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
724 int i, stored; 880 int i, stored;
725 struct buffer_head * bh, * tmp; 881 struct buffer_head * bh, * tmp;
726 struct ocfs2_dir_entry * de; 882 struct ocfs2_dir_entry * de;
727 int err;
728 struct super_block * sb = inode->i_sb; 883 struct super_block * sb = inode->i_sb;
729 unsigned int ra_sectors = 16; 884 unsigned int ra_sectors = 16;
730 885
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
735 890
736 while (!error && !stored && *f_pos < i_size_read(inode)) { 891 while (!error && !stored && *f_pos < i_size_read(inode)) {
737 blk = (*f_pos) >> sb->s_blocksize_bits; 892 blk = (*f_pos) >> sb->s_blocksize_bits;
738 bh = ocfs2_bread(inode, blk, &err, 0); 893 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
739 if (!bh) { 894 /* Skip the corrupt dirblock and keep trying */
740 mlog(ML_ERROR,
741 "directory #%llu contains a hole at offset %lld\n",
742 (unsigned long long)OCFS2_I(inode)->ip_blkno,
743 *f_pos);
744 *f_pos += sb->s_blocksize - offset; 895 *f_pos += sb->s_blocksize - offset;
745 continue; 896 continue;
746 } 897 }
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
754 || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { 905 || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
755 for (i = ra_sectors >> (sb->s_blocksize_bits - 9); 906 for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
756 i > 0; i--) { 907 i > 0; i--) {
757 tmp = ocfs2_bread(inode, ++blk, &err, 1); 908 tmp = NULL;
758 brelse(tmp); 909 if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
910 OCFS2_BH_READAHEAD))
911 brelse(tmp);
759 } 912 }
760 last_ra_blk = blk; 913 last_ra_blk = blk;
761 ra_sectors = 8; 914 ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
828 } 981 }
829 offset = 0; 982 offset = 0;
830 brelse(bh); 983 brelse(bh);
984 bh = NULL;
831 } 985 }
832 986
833 stored = 0; 987 stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
1050 return !priv.seen_other; 1204 return !priv.seen_other;
1051} 1205}
1052 1206
1053static void ocfs2_fill_initial_dirents(struct inode *inode, 1207/*
1054 struct inode *parent, 1208 * Fills "." and ".." dirents in a new directory block. Returns dirent for
1055 char *start, unsigned int size) 1209 * "..", which might be used during creation of a directory with a trailing
1210 * header. It is otherwise safe to ignore the return code.
1211 */
1212static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
1213 struct inode *parent,
1214 char *start,
1215 unsigned int size)
1056{ 1216{
1057 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; 1217 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
1058 1218
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
1069 de->name_len = 2; 1229 de->name_len = 2;
1070 strcpy(de->name, ".."); 1230 strcpy(de->name, "..");
1071 ocfs2_set_de_type(de, S_IFDIR); 1231 ocfs2_set_de_type(de, S_IFDIR);
1232
1233 return de;
1072} 1234}
1073 1235
1074/* 1236/*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
1086 struct ocfs2_inline_data *data = &di->id2.i_data; 1248 struct ocfs2_inline_data *data = &di->id2.i_data;
1087 unsigned int size = le16_to_cpu(data->id_count); 1249 unsigned int size = le16_to_cpu(data->id_count);
1088 1250
1089 ret = ocfs2_journal_access(handle, inode, di_bh, 1251 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1090 OCFS2_JOURNAL_ACCESS_WRITE); 1252 OCFS2_JOURNAL_ACCESS_WRITE);
1091 if (ret) { 1253 if (ret) {
1092 mlog_errno(ret); 1254 mlog_errno(ret);
1093 goto out; 1255 goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1121 struct ocfs2_alloc_context *data_ac) 1283 struct ocfs2_alloc_context *data_ac)
1122{ 1284{
1123 int status; 1285 int status;
1286 unsigned int size = osb->sb->s_blocksize;
1124 struct buffer_head *new_bh = NULL; 1287 struct buffer_head *new_bh = NULL;
1288 struct ocfs2_dir_entry *de;
1125 1289
1126 mlog_entry_void(); 1290 mlog_entry_void();
1127 1291
1292 if (ocfs2_supports_dir_trailer(osb))
1293 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
1294
1128 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 1295 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
1129 data_ac, NULL, &new_bh); 1296 data_ac, NULL, &new_bh);
1130 if (status < 0) { 1297 if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1134 1301
1135 ocfs2_set_new_buffer_uptodate(inode, new_bh); 1302 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1136 1303
1137 status = ocfs2_journal_access(handle, inode, new_bh, 1304 status = ocfs2_journal_access_db(handle, inode, new_bh,
1138 OCFS2_JOURNAL_ACCESS_CREATE); 1305 OCFS2_JOURNAL_ACCESS_CREATE);
1139 if (status < 0) { 1306 if (status < 0) {
1140 mlog_errno(status); 1307 mlog_errno(status);
1141 goto bail; 1308 goto bail;
1142 } 1309 }
1143 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 1310 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1144 1311
1145 ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, 1312 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
1146 osb->sb->s_blocksize); 1313 if (ocfs2_supports_dir_trailer(osb))
1314 ocfs2_init_dir_trailer(inode, new_bh);
1147 1315
1148 status = ocfs2_journal_dirty(handle, new_bh); 1316 status = ocfs2_journal_dirty(handle, new_bh);
1149 if (status < 0) { 1317 if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1184 data_ac); 1352 data_ac);
1185} 1353}
1186 1354
1355/*
1356 * Expand rec_len of the rightmost dirent in a directory block so that it
1357 * contains the end of our valid space for dirents. We do this during
1358 * expansion from an inline directory to one with extents. The first dir block
1359 * in that case is taken from the inline data portion of the inode block.
1360 *
1361 * We add the dir trailer if this filesystem wants it.
1362 */
1187static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 1363static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1188 unsigned int new_size) 1364 struct super_block *sb)
1189{ 1365{
1190 struct ocfs2_dir_entry *de; 1366 struct ocfs2_dir_entry *de;
1191 struct ocfs2_dir_entry *prev_de; 1367 struct ocfs2_dir_entry *prev_de;
1192 char *de_buf, *limit; 1368 char *de_buf, *limit;
1193 unsigned int bytes = new_size - old_size; 1369 unsigned int new_size = sb->s_blocksize;
1370 unsigned int bytes;
1371
1372 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
1373 new_size = ocfs2_dir_trailer_blk_off(sb);
1374
1375 bytes = new_size - old_size;
1194 1376
1195 limit = start + old_size; 1377 limit = start + old_size;
1196 de_buf = start; 1378 de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1216 unsigned int blocks_wanted, 1398 unsigned int blocks_wanted,
1217 struct buffer_head **first_block_bh) 1399 struct buffer_head **first_block_bh)
1218{ 1400{
1219 int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
1220 u32 alloc, bit_off, len; 1401 u32 alloc, bit_off, len;
1221 struct super_block *sb = dir->i_sb; 1402 struct super_block *sb = dir->i_sb;
1403 int ret, credits = ocfs2_inline_to_extents_credits(sb);
1222 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 1404 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
1223 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 1405 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1224 struct ocfs2_inode_info *oi = OCFS2_I(dir); 1406 struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1228 handle_t *handle; 1410 handle_t *handle;
1229 struct ocfs2_extent_tree et; 1411 struct ocfs2_extent_tree et;
1412 int did_quota = 0;
1230 1413
1231 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 1414 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1232 1415
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1264 goto out_sem; 1447 goto out_sem;
1265 } 1448 }
1266 1449
1450 if (vfs_dq_alloc_space_nodirty(dir,
1451 ocfs2_clusters_to_bytes(osb->sb, alloc))) {
1452 ret = -EDQUOT;
1453 goto out_commit;
1454 }
1455 did_quota = 1;
1267 /* 1456 /*
1268 * Try to claim as many clusters as the bitmap can give though 1457 * Try to claim as many clusters as the bitmap can give though
1269 * if we only get one now, that's enough to continue. The rest 1458 * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1290 1479
1291 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); 1480 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
1292 1481
1293 ret = ocfs2_journal_access(handle, dir, dirdata_bh, 1482 ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
1294 OCFS2_JOURNAL_ACCESS_CREATE); 1483 OCFS2_JOURNAL_ACCESS_CREATE);
1295 if (ret) { 1484 if (ret) {
1296 mlog_errno(ret); 1485 mlog_errno(ret);
1297 goto out_commit; 1486 goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1300 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 1489 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1301 memset(dirdata_bh->b_data + i_size_read(dir), 0, 1490 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1302 sb->s_blocksize - i_size_read(dir)); 1491 sb->s_blocksize - i_size_read(dir));
1303 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), 1492 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
1304 sb->s_blocksize); 1493 if (ocfs2_supports_dir_trailer(osb))
1494 ocfs2_init_dir_trailer(dir, dirdata_bh);
1305 1495
1306 ret = ocfs2_journal_dirty(handle, dirdata_bh); 1496 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1307 if (ret) { 1497 if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1317 * We let the later dirent insert modify c/mtime - to the user 1507 * We let the later dirent insert modify c/mtime - to the user
1318 * the data hasn't changed. 1508 * the data hasn't changed.
1319 */ 1509 */
1320 ret = ocfs2_journal_access(handle, dir, di_bh, 1510 ret = ocfs2_journal_access_di(handle, dir, di_bh,
1321 OCFS2_JOURNAL_ACCESS_CREATE); 1511 OCFS2_JOURNAL_ACCESS_CREATE);
1322 if (ret) { 1512 if (ret) {
1323 mlog_errno(ret); 1513 mlog_errno(ret);
1324 goto out_commit; 1514 goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1386 dirdata_bh = NULL; 1576 dirdata_bh = NULL;
1387 1577
1388out_commit: 1578out_commit:
1579 if (ret < 0 && did_quota)
1580 vfs_dq_free_space_nodirty(dir,
 1581 ocfs2_clusters_to_bytes(osb->sb, alloc));
1389 ocfs2_commit_trans(osb, handle); 1582 ocfs2_commit_trans(osb, handle);
1390 1583
1391out_sem: 1584out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1410 struct buffer_head **new_bh) 1603 struct buffer_head **new_bh)
1411{ 1604{
1412 int status; 1605 int status;
1413 int extend; 1606 int extend, did_quota = 0;
1414 u64 p_blkno, v_blkno; 1607 u64 p_blkno, v_blkno;
1415 1608
1416 spin_lock(&OCFS2_I(dir)->ip_lock); 1609 spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1420 if (extend) { 1613 if (extend) {
1421 u32 offset = OCFS2_I(dir)->ip_clusters; 1614 u32 offset = OCFS2_I(dir)->ip_clusters;
1422 1615
1616 if (vfs_dq_alloc_space_nodirty(dir,
1617 ocfs2_clusters_to_bytes(sb, 1))) {
1618 status = -EDQUOT;
1619 goto bail;
1620 }
1621 did_quota = 1;
1622
1423 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 1623 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1424 1, 0, parent_fe_bh, handle, 1624 1, 0, parent_fe_bh, handle,
1425 data_ac, meta_ac, NULL); 1625 data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1445 } 1645 }
1446 status = 0; 1646 status = 0;
1447bail: 1647bail:
1648 if (did_quota && status < 0)
1649 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
1448 mlog_exit(status); 1650 mlog_exit(status);
1449 return status; 1651 return status;
1450} 1652}
@@ -1569,16 +1771,22 @@ do_extend:
1569 1771
1570 ocfs2_set_new_buffer_uptodate(dir, new_bh); 1772 ocfs2_set_new_buffer_uptodate(dir, new_bh);
1571 1773
1572 status = ocfs2_journal_access(handle, dir, new_bh, 1774 status = ocfs2_journal_access_db(handle, dir, new_bh,
1573 OCFS2_JOURNAL_ACCESS_CREATE); 1775 OCFS2_JOURNAL_ACCESS_CREATE);
1574 if (status < 0) { 1776 if (status < 0) {
1575 mlog_errno(status); 1777 mlog_errno(status);
1576 goto bail; 1778 goto bail;
1577 } 1779 }
1578 memset(new_bh->b_data, 0, sb->s_blocksize); 1780 memset(new_bh->b_data, 0, sb->s_blocksize);
1781
1579 de = (struct ocfs2_dir_entry *) new_bh->b_data; 1782 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1580 de->inode = 0; 1783 de->inode = 0;
1581 de->rec_len = cpu_to_le16(sb->s_blocksize); 1784 if (ocfs2_dir_has_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh);
1787 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 }
1582 status = ocfs2_journal_dirty(handle, new_bh); 1790 status = ocfs2_journal_dirty(handle, new_bh);
1583 if (status < 0) { 1791 if (status < 0) {
1584 mlog_errno(status); 1792 mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1620 unsigned int *blocks_wanted) 1828 unsigned int *blocks_wanted)
1621{ 1829{
1622 int ret; 1830 int ret;
1831 struct super_block *sb = dir->i_sb;
1623 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1832 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1624 struct ocfs2_dir_entry *de, *last_de = NULL; 1833 struct ocfs2_dir_entry *de, *last_de = NULL;
1625 char *de_buf, *limit; 1834 char *de_buf, *limit;
1626 unsigned long offset = 0; 1835 unsigned long offset = 0;
1627 unsigned int rec_len, new_rec_len; 1836 unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
1837
1838 /*
1839 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree.
1841 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
1628 1846
1629 de_buf = di->id2.i_data.id_data; 1847 de_buf = di->id2.i_data.id_data;
1630 limit = de_buf + i_size_read(dir); 1848 limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1641 ret = -EEXIST; 1859 ret = -EEXIST;
1642 goto out; 1860 goto out;
1643 } 1861 }
1862 /*
1863 * No need to check for a trailing dirent record here as
1864 * they're not used for inline dirs.
1865 */
1866
1644 if (ocfs2_dirent_would_fit(de, rec_len)) { 1867 if (ocfs2_dirent_would_fit(de, rec_len)) {
1645 /* Ok, we found a spot. Return this bh and let 1868 /* Ok, we found a spot. Return this bh and let
1646 * the caller actually fill it in. */ 1869 * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1661 * dirent can be found. 1884 * dirent can be found.
1662 */ 1885 */
1663 *blocks_wanted = 1; 1886 *blocks_wanted = 1;
1664 new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir)); 1887 new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
1665 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) 1888 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
1666 *blocks_wanted = 2; 1889 *blocks_wanted = 2;
1667 1890
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1679 struct ocfs2_dir_entry *de; 1902 struct ocfs2_dir_entry *de;
1680 struct super_block *sb = dir->i_sb; 1903 struct super_block *sb = dir->i_sb;
1681 int status; 1904 int status;
1905 int blocksize = dir->i_sb->s_blocksize;
1682 1906
1683 bh = ocfs2_bread(dir, 0, &status, 0); 1907 status = ocfs2_read_dir_block(dir, 0, &bh, 0);
1684 if (!bh) { 1908 if (status) {
1685 mlog_errno(status); 1909 mlog_errno(status);
1686 goto bail; 1910 goto bail;
1687 } 1911 }
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1702 status = -ENOSPC; 1926 status = -ENOSPC;
1703 goto bail; 1927 goto bail;
1704 } 1928 }
1705 bh = ocfs2_bread(dir, 1929 status = ocfs2_read_dir_block(dir,
1706 offset >> sb->s_blocksize_bits, 1930 offset >> sb->s_blocksize_bits,
1707 &status, 1931 &bh, 0);
1708 0); 1932 if (status) {
1709 if (!bh) {
1710 mlog_errno(status); 1933 mlog_errno(status);
1711 goto bail; 1934 goto bail;
1712 } 1935 }
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1721 status = -EEXIST; 1944 status = -EEXIST;
1722 goto bail; 1945 goto bail;
1723 } 1946 }
1947
1948 if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
1949 blocksize))
1950 goto next;
1951
1724 if (ocfs2_dirent_would_fit(de, rec_len)) { 1952 if (ocfs2_dirent_would_fit(de, rec_len)) {
1725 /* Ok, we found a spot. Return this bh and let 1953 /* Ok, we found a spot. Return this bh and let
1726 * the caller actually fill it in. */ 1954 * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1729 status = 0; 1957 status = 0;
1730 goto bail; 1958 goto bail;
1731 } 1959 }
1960next:
1732 offset += le16_to_cpu(de->rec_len); 1961 offset += le16_to_cpu(de->rec_len);
1733 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); 1962 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
1734 } 1963 }
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
83 struct buffer_head *fe_bh, 83 struct buffer_head *fe_bh,
84 struct ocfs2_alloc_context *data_ac); 84 struct ocfs2_alloc_context *data_ac);
85 85
86struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
87 void *data);
86#endif /* OCFS2_DIR_H */ 88#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
275 struct list_head *iter, *head=NULL; 275 struct list_head *iter, *head=NULL;
276 u64 cookie; 276 u64 cookie;
277 u32 flags; 277 u32 flags;
278 u8 node;
278 279
279 if (!dlm_grab(dlm)) { 280 if (!dlm_grab(dlm)) {
280 dlm_error(DLM_REJECTED); 281 dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
286 287
287 name = past->name; 288 name = past->name;
288 locklen = past->namelen; 289 locklen = past->namelen;
289 cookie = be64_to_cpu(past->cookie); 290 cookie = past->cookie;
290 flags = be32_to_cpu(past->flags); 291 flags = be32_to_cpu(past->flags);
292 node = past->node_idx;
291 293
292 if (locklen > DLM_LOCKID_NAME_MAX) { 294 if (locklen > DLM_LOCKID_NAME_MAX) {
293 ret = DLM_IVBUFLEN; 295 ret = DLM_IVBUFLEN;
294 mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); 296 mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
297 "handler!\n", locklen);
295 goto leave; 298 goto leave;
296 } 299 }
297 300
298 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == 301 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
299 (LKM_PUT_LVB|LKM_GET_LVB)) { 302 (LKM_PUT_LVB|LKM_GET_LVB)) {
300 mlog(ML_ERROR, "both PUT and GET lvb specified\n"); 303 mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
304 flags);
301 ret = DLM_BADARGS; 305 ret = DLM_BADARGS;
302 goto leave; 306 goto leave;
303 } 307 }
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
310 if (past->type != DLM_AST && 314 if (past->type != DLM_AST &&
311 past->type != DLM_BAST) { 315 past->type != DLM_BAST) {
312 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 316 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
313 "name=%.*s\n", past->type, 317 "name=%.*s, node=%u\n", past->type,
314 dlm_get_lock_cookie_node(cookie), 318 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
315 dlm_get_lock_cookie_seq(cookie), 319 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
316 locklen, name); 320 locklen, name, node);
317 ret = DLM_IVLOCKID; 321 ret = DLM_IVLOCKID;
318 goto leave; 322 goto leave;
319 } 323 }
320 324
321 res = dlm_lookup_lockres(dlm, name, locklen); 325 res = dlm_lookup_lockres(dlm, name, locklen);
322 if (!res) { 326 if (!res) {
323 mlog(0, "got %sast for unknown lockres! " 327 mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
324 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 328 "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
325 past->type == DLM_AST ? "" : "b", 329 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
326 dlm_get_lock_cookie_node(cookie), 330 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
327 dlm_get_lock_cookie_seq(cookie), 331 locklen, name, node);
328 locklen, name, locklen);
329 ret = DLM_IVLOCKID; 332 ret = DLM_IVLOCKID;
330 goto leave; 333 goto leave;
331 } 334 }
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
337 340
338 spin_lock(&res->spinlock); 341 spin_lock(&res->spinlock);
339 if (res->state & DLM_LOCK_RES_RECOVERING) { 342 if (res->state & DLM_LOCK_RES_RECOVERING) {
340 mlog(0, "responding with DLM_RECOVERING!\n"); 343 mlog(0, "Responding with DLM_RECOVERING!\n");
341 ret = DLM_RECOVERING; 344 ret = DLM_RECOVERING;
342 goto unlock_out; 345 goto unlock_out;
343 } 346 }
344 if (res->state & DLM_LOCK_RES_MIGRATING) { 347 if (res->state & DLM_LOCK_RES_MIGRATING) {
345 mlog(0, "responding with DLM_MIGRATING!\n"); 348 mlog(0, "Responding with DLM_MIGRATING!\n");
346 ret = DLM_MIGRATING; 349 ret = DLM_MIGRATING;
347 goto unlock_out; 350 goto unlock_out;
348 } 351 }
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
351 lock = NULL; 354 lock = NULL;
352 list_for_each(iter, head) { 355 list_for_each(iter, head) {
353 lock = list_entry (iter, struct dlm_lock, list); 356 lock = list_entry (iter, struct dlm_lock, list);
354 if (be64_to_cpu(lock->ml.cookie) == cookie) 357 if (lock->ml.cookie == cookie)
355 goto do_ast; 358 goto do_ast;
356 } 359 }
357 360
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
363 366
364 list_for_each(iter, head) { 367 list_for_each(iter, head) {
365 lock = list_entry (iter, struct dlm_lock, list); 368 lock = list_entry (iter, struct dlm_lock, list);
366 if (be64_to_cpu(lock->ml.cookie) == cookie) 369 if (lock->ml.cookie == cookie)
367 goto do_ast; 370 goto do_ast;
368 } 371 }
369 372
370 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 373 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
371 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 374 "node=%u\n", past->type == DLM_AST ? "" : "b",
372 dlm_get_lock_cookie_node(cookie), 375 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
373 dlm_get_lock_cookie_seq(cookie), 376 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
374 locklen, name, locklen); 377 locklen, name, node);
375 378
376 ret = DLM_NORMAL; 379 ret = DLM_NORMAL;
377unlock_out: 380unlock_out:
@@ -383,8 +386,8 @@ do_ast:
383 if (past->type == DLM_AST) { 386 if (past->type == DLM_AST) {
384 /* do not alter lock refcount. switching lists. */ 387 /* do not alter lock refcount. switching lists. */
385 list_move_tail(&lock->list, &res->granted); 388 list_move_tail(&lock->list, &res->granted);
386 mlog(0, "ast: adding to granted list... type=%d, " 389 mlog(0, "ast: Adding to granted list... type=%d, "
387 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 390 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
388 if (lock->ml.convert_type != LKM_IVMODE) { 391 if (lock->ml.convert_type != LKM_IVMODE) {
389 lock->ml.type = lock->ml.convert_type; 392 lock->ml.type = lock->ml.convert_type;
390 lock->ml.convert_type = LKM_IVMODE; 393 lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
408 dlm_do_local_bast(dlm, res, lock, past->blocked_type); 411 dlm_do_local_bast(dlm, res, lock, past->blocked_type);
409 412
410leave: 413leave:
411
412 if (res) 414 if (res)
413 dlm_lockres_put(res); 415 dlm_lockres_put(res);
414 416
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
140 unsigned int purge_count; 140 unsigned int purge_count;
141 spinlock_t spinlock; 141 spinlock_t spinlock;
142 spinlock_t ast_lock; 142 spinlock_t ast_lock;
143 spinlock_t track_lock;
143 char *name; 144 char *name;
144 u8 node_num; 145 u8 node_num;
145 u32 key; 146 u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
316 * put on a list for the dlm thread to run. */ 317 * put on a list for the dlm thread to run. */
317 unsigned long last_used; 318 unsigned long last_used;
318 319
320 struct dlm_ctxt *dlm;
321
319 unsigned migration_pending:1; 322 unsigned migration_pending:1;
320 atomic_t asts_reserved; 323 atomic_t asts_reserved;
321 spinlock_t spinlock; 324 spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
630{ 630{
631 struct debug_lockres *dl = m->private; 631 struct debug_lockres *dl = m->private;
632 struct dlm_ctxt *dlm = dl->dl_ctxt; 632 struct dlm_ctxt *dlm = dl->dl_ctxt;
633 struct dlm_lock_resource *oldres = dl->dl_res;
633 struct dlm_lock_resource *res = NULL; 634 struct dlm_lock_resource *res = NULL;
635 struct list_head *track_list;
634 636
635 spin_lock(&dlm->spinlock); 637 spin_lock(&dlm->track_lock);
638 if (oldres)
639 track_list = &oldres->tracking;
640 else
641 track_list = &dlm->tracking_list;
636 642
637 if (dl->dl_res) { 643 list_for_each_entry(res, track_list, tracking) {
638 list_for_each_entry(res, &dl->dl_res->tracking, tracking) { 644 if (&res->tracking == &dlm->tracking_list)
639 if (dl->dl_res) { 645 res = NULL;
640 dlm_lockres_put(dl->dl_res); 646 else
641 dl->dl_res = NULL;
642 }
643 if (&res->tracking == &dlm->tracking_list) {
644 mlog(0, "End of list found, %p\n", res);
645 dl = NULL;
646 break;
647 }
648 dlm_lockres_get(res); 647 dlm_lockres_get(res);
649 dl->dl_res = res; 648 break;
650 break;
651 }
652 } else {
653 if (!list_empty(&dlm->tracking_list)) {
654 list_for_each_entry(res, &dlm->tracking_list, tracking)
655 break;
656 dlm_lockres_get(res);
657 dl->dl_res = res;
658 } else
659 dl = NULL;
660 } 649 }
650 spin_unlock(&dlm->track_lock);
661 651
662 if (dl) { 652 if (oldres)
663 spin_lock(&dl->dl_res->spinlock); 653 dlm_lockres_put(oldres);
664 dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
665 spin_unlock(&dl->dl_res->spinlock);
666 }
667 654
668 spin_unlock(&dlm->spinlock); 655 dl->dl_res = res;
656
657 if (res) {
658 spin_lock(&res->spinlock);
659 dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
660 spin_unlock(&res->spinlock);
661 } else
662 dl = NULL;
669 663
664 /* passed to seq_show */
670 return dl; 665 return dl;
671} 666}
672 667
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 spin_lock_init(&dlm->spinlock); 1550 spin_lock_init(&dlm->spinlock);
1551 spin_lock_init(&dlm->master_lock); 1551 spin_lock_init(&dlm->master_lock);
1552 spin_lock_init(&dlm->ast_lock); 1552 spin_lock_init(&dlm->ast_lock);
1553 spin_lock_init(&dlm->track_lock);
1553 INIT_LIST_HEAD(&dlm->list); 1554 INIT_LIST_HEAD(&dlm->list);
1554 INIT_LIST_HEAD(&dlm->dirty_list); 1555 INIT_LIST_HEAD(&dlm->dirty_list);
1555 INIT_LIST_HEAD(&dlm->reco.resources); 1556 INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
341 inode->i_mode = mode; 341 inode->i_mode = mode;
342 inode->i_uid = current_fsuid(); 342 inode->i_uid = current_fsuid();
343 inode->i_gid = current_fsgid(); 343 inode->i_gid = current_fsgid();
344 inode->i_blocks = 0;
345 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 344 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
346 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 345 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
347 inc_nlink(inode); 346 inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
367 inode->i_mode = mode; 366 inode->i_mode = mode;
368 inode->i_uid = current_fsuid(); 367 inode->i_uid = current_fsuid();
369 inode->i_gid = current_fsgid(); 368 inode->i_gid = current_fsgid();
370 inode->i_blocks = 0;
371 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 369 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
372 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 370 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
373 371
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
505static void dlm_lockres_release(struct kref *kref) 505static void dlm_lockres_release(struct kref *kref)
506{ 506{
507 struct dlm_lock_resource *res; 507 struct dlm_lock_resource *res;
508 struct dlm_ctxt *dlm;
508 509
509 res = container_of(kref, struct dlm_lock_resource, refs); 510 res = container_of(kref, struct dlm_lock_resource, refs);
511 dlm = res->dlm;
510 512
511 /* This should not happen -- all lockres' have a name 513 /* This should not happen -- all lockres' have a name
512 * associated with them at init time. */ 514 * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
515 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 517 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
516 res->lockname.name); 518 res->lockname.name);
517 519
520 spin_lock(&dlm->track_lock);
518 if (!list_empty(&res->tracking)) 521 if (!list_empty(&res->tracking))
519 list_del_init(&res->tracking); 522 list_del_init(&res->tracking);
520 else { 523 else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
522 res->lockname.len, res->lockname.name); 525 res->lockname.len, res->lockname.name);
523 dlm_print_one_lock_resource(res); 526 dlm_print_one_lock_resource(res);
524 } 527 }
528 spin_unlock(&dlm->track_lock);
529
530 dlm_put(dlm);
525 531
526 if (!hlist_unhashed(&res->hash_node) || 532 if (!hlist_unhashed(&res->hash_node) ||
527 !list_empty(&res->granted) || 533 !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
595 res->migration_pending = 0; 601 res->migration_pending = 0;
596 res->inflight_locks = 0; 602 res->inflight_locks = 0;
597 603
604 /* put in dlm_lockres_release */
605 dlm_grab(dlm);
606 res->dlm = dlm;
607
598 kref_init(&res->refs); 608 kref_init(&res->refs);
599 609
600 /* just for consistency */ 610 /* just for consistency */
@@ -722,14 +732,21 @@ lookup:
722 if (tmpres) { 732 if (tmpres) {
723 int dropping_ref = 0; 733 int dropping_ref = 0;
724 734
735 spin_unlock(&dlm->spinlock);
736
725 spin_lock(&tmpres->spinlock); 737 spin_lock(&tmpres->spinlock);
738 /* We wait for the other thread that is mastering the resource */
739 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
740 __dlm_wait_on_lockres(tmpres);
741 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
742 }
743
726 if (tmpres->owner == dlm->node_num) { 744 if (tmpres->owner == dlm->node_num) {
727 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 745 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
728 dlm_lockres_grab_inflight_ref(dlm, tmpres); 746 dlm_lockres_grab_inflight_ref(dlm, tmpres);
729 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 747 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
730 dropping_ref = 1; 748 dropping_ref = 1;
731 spin_unlock(&tmpres->spinlock); 749 spin_unlock(&tmpres->spinlock);
732 spin_unlock(&dlm->spinlock);
733 750
734 /* wait until done messaging the master, drop our ref to allow 751 /* wait until done messaging the master, drop our ref to allow
735 * the lockres to be purged, start over. */ 752 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2949 struct dlm_node_iter *iter) 2966 struct dlm_node_iter *iter)
2950{ 2967{
2951 struct dlm_migrate_request migrate; 2968 struct dlm_migrate_request migrate;
2952 int ret, status = 0; 2969 int ret, skip, status = 0;
2953 int nodenum; 2970 int nodenum;
2954 2971
2955 memset(&migrate, 0, sizeof(migrate)); 2972 memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2966 nodenum == new_master) 2983 nodenum == new_master)
2967 continue; 2984 continue;
2968 2985
2986 /* We could race exit domain. If exited, skip. */
2987 spin_lock(&dlm->spinlock);
2988 skip = (!test_bit(nodenum, dlm->domain_map));
2989 spin_unlock(&dlm->spinlock);
2990 if (skip) {
2991 clear_bit(nodenum, iter->node_map);
2992 continue;
2993 }
2994
2969 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, 2995 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2970 &migrate, sizeof(migrate), nodenum, 2996 &migrate, sizeof(migrate), nodenum,
2971 &status); 2997 &status);
2972 if (ret < 0) 2998 if (ret < 0) {
2973 mlog_errno(ret); 2999 mlog(0, "migrate_request returned %d!\n", ret);
2974 else if (status < 0) { 3000 if (!dlm_is_host_down(ret)) {
3001 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
3002 BUG();
3003 }
3004 clear_bit(nodenum, iter->node_map);
3005 ret = 0;
3006 } else if (status < 0) {
2975 mlog(0, "migrate request (node %u) returned %d!\n", 3007 mlog(0, "migrate request (node %u) returned %d!\n",
2976 nodenum, status); 3008 nodenum, status);
2977 ret = status; 3009 ret = status;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
181 181
182 spin_lock(&res->spinlock); 182 spin_lock(&res->spinlock);
183 /* This ensures that clear refmap is sent after the set */ 183 /* This ensures that clear refmap is sent after the set */
184 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 184 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
185 DLM_LOCK_RES_MIGRATING));
185 spin_unlock(&res->spinlock); 186 spin_unlock(&res->spinlock);
186 187
187 /* clear our bit from the master's refmap, ignore errors */ 188 /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..b0c4cadd4c45 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/time.h> 34#include <linux/time.h>
35#include <linux/quotaops.h>
35 36
36#define MLOG_MASK_PREFIX ML_DLM_GLUE 37#define MLOG_MASK_PREFIX ML_DLM_GLUE
37#include <cluster/masklog.h> 38#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
51#include "slot_map.h" 52#include "slot_map.h"
52#include "super.h" 53#include "super.h"
53#include "uptodate.h" 54#include "uptodate.h"
55#include "quota.h"
54 56
55#include "buffer_head_io.h" 57#include "buffer_head_io.h"
56 58
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
68static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
69static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
70static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); 72static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
73static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
71 74
72/* 75/*
73 * Return value from ->downconvert_worker functions. 76 * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
102static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 105static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
103 struct ocfs2_lock_res *lockres); 106 struct ocfs2_lock_res *lockres);
104 107
108static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
105 109
106#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) 110#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
107 111
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
111 unsigned int line, 115 unsigned int line,
112 struct ocfs2_lock_res *lockres) 116 struct ocfs2_lock_res *lockres)
113{ 117{
114 struct ocfs2_meta_lvb *lvb = 118 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
115 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
116 119
117 mlog(level, "LVB information for %s (called from %s:%u):\n", 120 mlog(level, "LVB information for %s (called from %s:%u):\n",
118 lockres->l_name, function, line); 121 lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
258 .flags = 0, 261 .flags = 0,
259}; 262};
260 263
264static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
265 .set_lvb = ocfs2_set_qinfo_lvb,
266 .get_osb = ocfs2_get_qinfo_osb,
267 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
268};
269
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 270static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 271{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 272 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
279 return (struct ocfs2_dentry_lock *)lockres->l_priv; 288 return (struct ocfs2_dentry_lock *)lockres->l_priv;
280} 289}
281 290
291static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
292{
293 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
294
295 return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
296}
297
282static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) 298static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
283{ 299{
284 if (lockres->l_ops->get_osb) 300 if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
507 return OCFS2_SB(inode->i_sb); 523 return OCFS2_SB(inode->i_sb);
508} 524}
509 525
526static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
527{
528 struct ocfs2_mem_dqinfo *info = lockres->l_priv;
529
530 return OCFS2_SB(info->dqi_gi.dqi_sb);
531}
532
510static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) 533static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
511{ 534{
512 struct ocfs2_file_private *fp = lockres->l_priv; 535 struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
609 lockres->l_flags |= OCFS2_LOCK_NOCACHE; 632 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
610} 633}
611 634
635void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
636 struct ocfs2_mem_dqinfo *info)
637{
638 ocfs2_lock_res_init_once(lockres);
639 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
640 0, lockres->l_name);
641 ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
642 OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
643 info);
644}
645
612void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 646void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
613{ 647{
614 mlog_entry_void(); 648 mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
1290 goto out; 1324 goto out;
1291 } 1325 }
1292 1326
1293 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", 1327 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
1294 lockres->l_name); 1328 lockres->l_name);
1295 1329
1296 /* At this point we've gone inside the dlm and need to 1330 /* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1829 1863
1830 mlog_entry_void(); 1864 mlog_entry_void();
1831 1865
1832 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1866 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1833 1867
1834 /* 1868 /*
1835 * Invalidate the LVB of a deleted inode - this way other 1869 * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1881 1915
1882 mlog_meta_lvb(0, lockres); 1916 mlog_meta_lvb(0, lockres);
1883 1917
1884 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1918 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1885 1919
1886 /* We're safe here without the lockres lock... */ 1920 /* We're safe here without the lockres lock... */
1887 spin_lock(&oi->ip_lock); 1921 spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1916static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 1950static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1917 struct ocfs2_lock_res *lockres) 1951 struct ocfs2_lock_res *lockres)
1918{ 1952{
1919 struct ocfs2_meta_lvb *lvb = 1953 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1920 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1921 1954
1922 if (lvb->lvb_version == OCFS2_LVB_VERSION 1955 if (lvb->lvb_version == OCFS2_LVB_VERSION
1923 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 1956 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2024 } else { 2057 } else {
2025 /* Boo, we have to go to disk. */ 2058 /* Boo, we have to go to disk. */
2026 /* read bh, cast, ocfs2_refresh_inode */ 2059 /* read bh, cast, ocfs2_refresh_inode */
2027 status = ocfs2_read_block(inode, oi->ip_blkno, bh); 2060 status = ocfs2_read_inode_block(inode, bh);
2028 if (status < 0) { 2061 if (status < 0) {
2029 mlog_errno(status); 2062 mlog_errno(status);
2030 goto bail_refresh; 2063 goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2032 fe = (struct ocfs2_dinode *) (*bh)->b_data; 2065 fe = (struct ocfs2_dinode *) (*bh)->b_data;
2033 2066
2034 /* This is a good chance to make sure we're not 2067 /* This is a good chance to make sure we're not
2035 * locking an invalid object. 2068 * locking an invalid object. ocfs2_read_inode_block()
2069 * already checked that the inode block is sane.
2036 * 2070 *
2037 * We bug on a stale inode here because we checked 2071 * We bug on a stale inode here because we checked
2038 * above whether it was wiped from disk. The wiping 2072 * above whether it was wiped from disk. The wiping
2039 * node provides a guarantee that we receive that 2073 * node provides a guarantee that we receive that
2040 * message and can mark the inode before dropping any 2074 * message and can mark the inode before dropping any
2041 * locks associated with it. */ 2075 * locks associated with it. */
2042 if (!OCFS2_IS_VALID_DINODE(fe)) {
2043 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
2044 status = -EIO;
2045 goto bail_refresh;
2046 }
2047 mlog_bug_on_msg(inode->i_generation != 2076 mlog_bug_on_msg(inode->i_generation !=
2048 le32_to_cpu(fe->i_generation), 2077 le32_to_cpu(fe->i_generation),
2049 "Invalid dinode %llu disk generation: %u " 2078 "Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
2085 return 0; 2114 return 0;
2086 } 2115 }
2087 2116
2088 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); 2117 status = ocfs2_read_inode_block(inode, ret_bh);
2089 if (status < 0) 2118 if (status < 0)
2090 mlog_errno(status); 2119 mlog_errno(status);
2091 2120
@@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2922 ocfs2_dlm_dump_lksb(&lockres->l_lksb); 2951 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2923 BUG(); 2952 BUG();
2924 } 2953 }
2925 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", 2954 mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
2926 lockres->l_name); 2955 lockres->l_name);
2927 2956
2928 ocfs2_wait_on_busy_lock(lockres); 2957 ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3449 return UNBLOCK_CONTINUE_POST; 3478 return UNBLOCK_CONTINUE_POST;
3450} 3479}
3451 3480
3481static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3482{
3483 struct ocfs2_qinfo_lvb *lvb;
3484 struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
3485 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3486 oinfo->dqi_gi.dqi_type);
3487
3488 mlog_entry_void();
3489
3490 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3491 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
3492 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
3493 lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
3494 lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
3495 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
3496 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
3497 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
3498
3499 mlog_exit_void();
3500}
3501
3502void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3503{
3504 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3505 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3506 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3507
3508 mlog_entry_void();
3509 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
3510 ocfs2_cluster_unlock(osb, lockres, level);
3511 mlog_exit_void();
3512}
3513
3514static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3515{
3516 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3517 oinfo->dqi_gi.dqi_type);
3518 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3519 struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3520 struct buffer_head *bh = NULL;
3521 struct ocfs2_global_disk_dqinfo *gdinfo;
3522 int status = 0;
3523
3524 if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
3525 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
3526 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
3527 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
3528 oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
3529 oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
3530 oinfo->dqi_gi.dqi_free_entry =
3531 be32_to_cpu(lvb->lvb_free_entry);
3532 } else {
3533 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
3534 if (status) {
3535 mlog_errno(status);
3536 goto bail;
3537 }
3538 gdinfo = (struct ocfs2_global_disk_dqinfo *)
3539 (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
3540 info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
3541 info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
3542 oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
3543 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
3544 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
3545 oinfo->dqi_gi.dqi_free_entry =
3546 le32_to_cpu(gdinfo->dqi_free_entry);
3547 brelse(bh);
3548 ocfs2_track_lock_refresh(lockres);
3549 }
3550
3551bail:
3552 return status;
3553}
3554
3555/* Lock quota info, this function expects at least shared lock on the quota file
3556 * so that we can safely refresh quota info from disk. */
3557int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3558{
3559 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3560 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3561 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3562 int status = 0;
3563
3564 mlog_entry_void();
3565
3566 /* On RO devices, locking really isn't needed... */
3567 if (ocfs2_is_hard_readonly(osb)) {
3568 if (ex)
3569 status = -EROFS;
3570 goto bail;
3571 }
3572 if (ocfs2_mount_local(osb))
3573 goto bail;
3574
3575 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
3576 if (status < 0) {
3577 mlog_errno(status);
3578 goto bail;
3579 }
3580 if (!ocfs2_should_refresh_lock_res(lockres))
3581 goto bail;
3582 /* OK, we have the lock but we need to refresh the quota info */
3583 status = ocfs2_refresh_qinfo(oinfo);
3584 if (status)
3585 ocfs2_qinfo_unlock(oinfo, ex);
3586 ocfs2_complete_lock_res_refresh(lockres, status);
3587bail:
3588 mlog_exit(status);
3589 return status;
3590}
3591
3452/* 3592/*
3453 * This is the filesystem locking protocol. It provides the lock handling 3593 * This is the filesystem locking protocol. It provides the lock handling
3454 * hooks for the underlying DLM. It has a maximum version number. 3594 * hooks for the underlying DLM. It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52#define OCFS2_QINFO_LVB_VERSION 1
53
/* Lock value block carried on the global quota info cluster lock.  The
 * refresh path copies these fields into the in-memory dqinfo (big-endian
 * on the wire, converted with le32_to_cpu-style helpers on read).
 * NOTE(review): the on-wire byte order here is declared __be32 while the
 * reader visible in this patch uses le32_to_cpu on a separate disk
 * structure — the LVB unpacking code is not visible here; confirm. */
54struct ocfs2_qinfo_lvb {
55 __u8 lvb_version; /* OCFS2_QINFO_LVB_VERSION; guards layout changes */
56 __u8 lvb_reserved[3]; /* padding / future use */
57 __be32 lvb_bgrace; /* block grace time (dqi_bgrace) */
58 __be32 lvb_igrace; /* inode grace time (dqi_igrace) */
59 __be32 lvb_syncms; /* quota sync interval in ms (dqi_syncms) */
60 __be32 lvb_blocks; /* blocks in the global quota file */
61 __be32 lvb_free_blk; /* head of the free-block list */
62 __be32 lvb_free_entry; /* head of the free-entry list */
63};
64
52/* ocfs2_inode_lock_full() 'arg_flags' flags */ 65/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 66/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 67#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
69struct ocfs2_file_private; 82struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 83void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp); 84 struct ocfs2_file_private *fp);
85struct ocfs2_mem_dqinfo;
86void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
87 struct ocfs2_mem_dqinfo *info);
72void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 88void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
73int ocfs2_create_new_inode_locks(struct inode *inode); 89int ocfs2_create_new_inode_locks(struct inode *inode);
74int ocfs2_drop_inode_locks(struct inode *inode); 90int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
103void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 119void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock); 120int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file); 121void ocfs2_file_unlock(struct file *file);
122int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
123void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
124
106 125
107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 126void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 127void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
293 struct ocfs2_extent_block *eb; 293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el; 294 struct ocfs2_extent_list *el;
295 295
296 ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); 296 ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
297 if (ret) { 297 if (ret) {
298 mlog_errno(ret); 298 mlog_errno(ret);
299 goto out; 299 goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
302 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 302 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
303 el = &eb->h_list; 303 el = &eb->h_list;
304 304
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
306 ret = -EROFS;
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
308 goto out;
309 }
310
311 if (el->l_tree_depth) { 305 if (el->l_tree_depth) {
312 ocfs2_error(inode->i_sb, 306 ocfs2_error(inode->i_sb,
313 "Inode %lu has non zero tree depth in " 307 "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
381 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
382 goto no_more_extents; 376 goto no_more_extents;
383 377
384 ret = ocfs2_read_block(inode, 378 ret = ocfs2_read_extent_block(inode,
385 le64_to_cpu(eb->h_next_leaf_blk), 379 le64_to_cpu(eb->h_next_leaf_blk),
386 &next_eb_bh); 380 &next_eb_bh);
387 if (ret) { 381 if (ret) {
388 mlog_errno(ret); 382 mlog_errno(ret);
389 goto out; 383 goto out;
390 } 384 }
391 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
392
393 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
394 ret = -EROFS;
395 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
396 goto out;
397 }
398 385
386 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
399 el = &next_eb->h_list; 387 el = &next_eb->h_list;
400
401 i = ocfs2_search_for_hole_index(el, v_cluster); 388 i = ocfs2_search_for_hole_index(el, v_cluster);
402 } 389 }
403 390
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
630 if (ret == 0) 617 if (ret == 0)
631 goto out; 618 goto out;
632 619
633 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 620 ret = ocfs2_read_inode_block(inode, &di_bh);
634 if (ret) { 621 if (ret) {
635 mlog_errno(ret); 622 mlog_errno(ret);
636 goto out; 623 goto out;
@@ -819,3 +806,74 @@ out:
819 806
820 return ret; 807 return ret;
821} 808}
809
/* Read 'nr' file-logical ("virtual") blocks starting at v_block.  Each
 * iteration maps a virtual offset to a physically contiguous run via the
 * extent map and reads that run with a single ocfs2_read_blocks() call,
 * applying 'validate' to each buffer.  Returns 0 or a negative errno. */
810int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
811 struct buffer_head *bhs[], int flags,
812 int (*validate)(struct super_block *sb,
813 struct buffer_head *bh))
814{
815 int rc = 0;
816 u64 p_block, p_count;
817 int i, count, done = 0;
818
819 mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
820 "flags = %x, validate = %p)\n",
821 inode, (unsigned long long)v_block, nr, bhs, flags,
822 validate);
823
 /* A request extending past i_size is only legal for readahead;
  * anything else is a caller bug, hence the BUG_ON. */
824 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
825 i_size_read(inode)) {
826 BUG_ON(!(flags & OCFS2_BH_READAHEAD));
827 goto out;
828 }
829
830 while (done < nr) {
 /* ip_alloc_sem protects the extent map during the lookup only */
831 down_read(&OCFS2_I(inode)->ip_alloc_sem);
832 rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
833 &p_block, &p_count, NULL);
834 up_read(&OCFS2_I(inode)->ip_alloc_sem);
835 if (rc) {
836 mlog_errno(rc);
837 break;
838 }
839
 /* A zero physical block means a hole, which this path treats
  * as corruption (-EIO) rather than zero-filling. */
840 if (!p_block) {
841 rc = -EIO;
842 mlog(ML_ERROR,
843 "Inode #%llu contains a hole at offset %llu\n",
844 (unsigned long long)OCFS2_I(inode)->ip_blkno,
845 (unsigned long long)(v_block + done) <<
846 inode->i_sb->s_blocksize_bits);
847 break;
848 }
849
 /* Clamp this pass to the contiguous run length from the map */
850 count = nr - done;
851 if (p_count < count)
852 count = p_count;
853
854 /*
855 * If the caller passed us bhs, they should have come
856 * from a previous readahead call to this function. Thus,
857 * they should have the right b_blocknr.
858 */
859 for (i = 0; i < count; i++) {
860 if (!bhs[done + i])
861 continue;
862 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
863 }
864
865 rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
866 flags, validate);
867 if (rc) {
868 mlog_errno(rc);
869 break;
870 }
871 done += count;
872 }
873
874out:
875 mlog_exit(rc);
876 return rc;
877}
878
879
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el); 58 struct ocfs2_extent_list *el);
59 59
60int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
61 struct buffer_head *bhs[], int flags,
62 int (*validate)(struct super_block *sb,
63 struct buffer_head *bh));
64static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
65 struct buffer_head **bh,
66 int (*validate)(struct super_block *sb,
67 struct buffer_head *bh))
68{
69 int status = 0;
70
71 if (bh == NULL) {
72 printk("ocfs2: bh == NULL\n");
73 status = -EINVAL;
74 goto bail;
75 }
76
77 status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
78
79bail:
80 return status;
81}
82
83
60#endif /* _EXTENT_MAP_H */ 84#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h>
38 39
39#define MLOG_MASK_PREFIX ML_INODE 40#define MLOG_MASK_PREFIX ML_INODE
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
@@ -56,6 +57,8 @@
56#include "suballoc.h" 57#include "suballoc.h"
57#include "super.h" 58#include "super.h"
58#include "xattr.h" 59#include "xattr.h"
60#include "acl.h"
61#include "quota.h"
59 62
60#include "buffer_head_io.h" 63#include "buffer_head_io.h"
61 64
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
253 goto out; 256 goto out;
254 } 257 }
255 258
256 ret = ocfs2_journal_access(handle, inode, bh, 259 ret = ocfs2_journal_access_di(handle, inode, bh,
257 OCFS2_JOURNAL_ACCESS_WRITE); 260 OCFS2_JOURNAL_ACCESS_WRITE);
258 if (ret) { 261 if (ret) {
259 mlog_errno(ret); 262 mlog_errno(ret);
260 goto out_commit; 263 goto out_commit;
@@ -303,9 +306,9 @@ bail:
303 return status; 306 return status;
304} 307}
305 308
306static int ocfs2_simple_size_update(struct inode *inode, 309int ocfs2_simple_size_update(struct inode *inode,
307 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
308 u64 new_i_size) 311 u64 new_i_size)
309{ 312{
310 int ret; 313 int ret;
311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
350 goto out; 353 goto out;
351 } 354 }
352 355
353 status = ocfs2_journal_access(handle, inode, fe_bh, 356 status = ocfs2_journal_access_di(handle, inode, fe_bh,
354 OCFS2_JOURNAL_ACCESS_WRITE); 357 OCFS2_JOURNAL_ACCESS_WRITE);
355 if (status < 0) { 358 if (status < 0) {
356 mlog_errno(status); 359 mlog_errno(status);
357 goto out_commit; 360 goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
401 (unsigned long long)OCFS2_I(inode)->ip_blkno, 404 (unsigned long long)OCFS2_I(inode)->ip_blkno,
402 (unsigned long long)new_i_size); 405 (unsigned long long)new_i_size);
403 406
407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
408 * already validated it */
404 fe = (struct ocfs2_dinode *) di_bh->b_data; 409 fe = (struct ocfs2_dinode *) di_bh->b_data;
405 if (!OCFS2_IS_VALID_DINODE(fe)) {
406 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
407 status = -EIO;
408 goto bail;
409 }
410 410
411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
412 "Inode %llu, inode i_size = %lld != di " 412 "Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
536 enum ocfs2_alloc_restarted why; 536 enum ocfs2_alloc_restarted why;
537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
538 struct ocfs2_extent_tree et; 538 struct ocfs2_extent_tree et;
539 int did_quota = 0;
539 540
540 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 541 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
541 542
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
545 */ 546 */
546 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 547 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
547 548
548 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 549 status = ocfs2_read_inode_block(inode, &bh);
549 if (status < 0) { 550 if (status < 0) {
550 mlog_errno(status); 551 mlog_errno(status);
551 goto leave; 552 goto leave;
552 } 553 }
553
554 fe = (struct ocfs2_dinode *) bh->b_data; 554 fe = (struct ocfs2_dinode *) bh->b_data;
555 if (!OCFS2_IS_VALID_DINODE(fe)) {
556 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
557 status = -EIO;
558 goto leave;
559 }
560 555
561restart_all: 556restart_all:
562 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 557 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
585 } 580 }
586 581
587restarted_transaction: 582restarted_transaction:
583 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
584 clusters_to_add))) {
585 status = -EDQUOT;
586 goto leave;
587 }
588 did_quota = 1;
589
588 /* reserve a write to the file entry early on - that we if we 590 /* reserve a write to the file entry early on - that we if we
589 * run out of credits in the allocation path, we can still 591 * run out of credits in the allocation path, we can still
590 * update i_size. */ 592 * update i_size. */
591 status = ocfs2_journal_access(handle, inode, bh, 593 status = ocfs2_journal_access_di(handle, inode, bh,
592 OCFS2_JOURNAL_ACCESS_WRITE); 594 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) { 595 if (status < 0) {
594 mlog_errno(status); 596 mlog_errno(status);
595 goto leave; 597 goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
622 spin_lock(&OCFS2_I(inode)->ip_lock); 624 spin_lock(&OCFS2_I(inode)->ip_lock);
623 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 625 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
624 spin_unlock(&OCFS2_I(inode)->ip_lock); 626 spin_unlock(&OCFS2_I(inode)->ip_lock);
627 /* Release unused quota reservation */
628 vfs_dq_free_space(inode,
629 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
630 did_quota = 0;
625 631
626 if (why != RESTART_NONE && clusters_to_add) { 632 if (why != RESTART_NONE && clusters_to_add) {
627 if (why == RESTART_META) { 633 if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
654 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 660 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
655 661
656leave: 662leave:
663 if (status < 0 && did_quota)
664 vfs_dq_free_space(inode,
665 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
657 if (handle) { 666 if (handle) {
658 ocfs2_commit_trans(osb, handle); 667 ocfs2_commit_trans(osb, handle);
659 handle = NULL; 668 handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
885 struct ocfs2_super *osb = OCFS2_SB(sb); 894 struct ocfs2_super *osb = OCFS2_SB(sb);
886 struct buffer_head *bh = NULL; 895 struct buffer_head *bh = NULL;
887 handle_t *handle = NULL; 896 handle_t *handle = NULL;
897 int locked[MAXQUOTAS] = {0, 0};
898 int credits, qtype;
899 struct ocfs2_mem_dqinfo *oinfo;
888 900
889 mlog_entry("(0x%p, '%.*s')\n", dentry, 901 mlog_entry("(0x%p, '%.*s')\n", dentry,
890 dentry->d_name.len, dentry->d_name.name); 902 dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
955 } 967 }
956 } 968 }
957 969
958 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 970 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
959 if (IS_ERR(handle)) { 971 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
960 status = PTR_ERR(handle); 972 credits = OCFS2_INODE_UPDATE_CREDITS;
961 mlog_errno(status); 973 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
962 goto bail_unlock; 974 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
975 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
976 oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
977 status = ocfs2_lock_global_qf(oinfo, 1);
978 if (status < 0)
979 goto bail_unlock;
980 credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
981 ocfs2_calc_qdel_credits(sb, USRQUOTA);
982 locked[USRQUOTA] = 1;
983 }
984 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
985 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
986 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
987 oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
988 status = ocfs2_lock_global_qf(oinfo, 1);
989 if (status < 0)
990 goto bail_unlock;
991 credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
992 ocfs2_calc_qdel_credits(sb, GRPQUOTA);
993 locked[GRPQUOTA] = 1;
994 }
995 handle = ocfs2_start_trans(osb, credits);
996 if (IS_ERR(handle)) {
997 status = PTR_ERR(handle);
998 mlog_errno(status);
999 goto bail_unlock;
1000 }
1001 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
1002 if (status < 0)
1003 goto bail_commit;
1004 } else {
1005 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1006 if (IS_ERR(handle)) {
1007 status = PTR_ERR(handle);
1008 mlog_errno(status);
1009 goto bail_unlock;
1010 }
963 } 1011 }
964 1012
965 /* 1013 /*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
982bail_commit: 1030bail_commit:
983 ocfs2_commit_trans(osb, handle); 1031 ocfs2_commit_trans(osb, handle);
984bail_unlock: 1032bail_unlock:
1033 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1034 if (!locked[qtype])
1035 continue;
1036 oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
1037 ocfs2_unlock_global_qf(oinfo, 1);
1038 }
985 ocfs2_inode_unlock(inode, 1); 1039 ocfs2_inode_unlock(inode, 1);
986bail_unlock_rw: 1040bail_unlock_rw:
987 if (size_change) 1041 if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
989bail: 1043bail:
990 brelse(bh); 1044 brelse(bh);
991 1045
1046 if (!status && attr->ia_valid & ATTR_MODE) {
1047 status = ocfs2_acl_chmod(inode);
1048 if (status < 0)
1049 mlog_errno(status);
1050 }
1051
992 mlog_exit(status); 1052 mlog_exit(status);
993 return status; 1053 return status;
994} 1054}
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1035 goto out; 1095 goto out;
1036 } 1096 }
1037 1097
1038 ret = generic_permission(inode, mask, NULL); 1098 ret = generic_permission(inode, mask, ocfs2_check_acl);
1039 1099
1040 ocfs2_inode_unlock(inode, 0); 1100 ocfs2_inode_unlock(inode, 0);
1041out: 1101out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1061 goto out; 1121 goto out;
1062 } 1122 }
1063 1123
1064 ret = ocfs2_journal_access(handle, inode, bh, 1124 ret = ocfs2_journal_access_di(handle, inode, bh,
1065 OCFS2_JOURNAL_ACCESS_WRITE); 1125 OCFS2_JOURNAL_ACCESS_WRITE);
1066 if (ret < 0) { 1126 if (ret < 0) {
1067 mlog_errno(ret); 1127 mlog_errno(ret);
1068 goto out_trans; 1128 goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1128{ 1188{
1129 int ret; 1189 int ret;
1130 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
1131 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1132 1191
1133 ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); 1192 ret = ocfs2_read_inode_block(inode, &bh);
1134 if (ret < 0) { 1193 if (ret < 0) {
1135 mlog_errno(ret); 1194 mlog_errno(ret);
1136 goto out; 1195 goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1156 struct buffer_head *di_bh = NULL; 1215 struct buffer_head *di_bh = NULL;
1157 1216
1158 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1217 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1159 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, 1218 ret = ocfs2_read_inode_block(inode, &di_bh);
1160 &di_bh);
1161 if (ret) { 1219 if (ret) {
1162 mlog_errno(ret); 1220 mlog_errno(ret);
1163 goto out; 1221 goto out;
@@ -1226,83 +1284,6 @@ out:
1226 return ret; 1284 return ret;
1227} 1285}
1228 1286
/* Remove 'len' clusters at logical cluster 'cpos' (physical 'phys_cpos')
 * from the inode's btree, deferring block freeing to the truncate log and
 * to 'dealloc'.  Updates i_clusters in both the in-memory inode and the
 * on-disk dinode under a journal transaction. */
1229static int __ocfs2_remove_inode_range(struct inode *inode,
1230 struct buffer_head *di_bh,
1231 u32 cpos, u32 phys_cpos, u32 len,
1232 struct ocfs2_cached_dealloc_ctxt *dealloc)
1233{
1234 int ret;
1235 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1236 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1237 struct inode *tl_inode = osb->osb_tl_inode;
1238 handle_t *handle;
1239 struct ocfs2_alloc_context *meta_ac = NULL;
1240 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1241 struct ocfs2_extent_tree et;
1242
1243 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1244
 /* Reserve metadata blocks the extent removal may need */
1245 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
1246 if (ret) {
1247 mlog_errno(ret);
1248 return ret;
1249 }
1250
 /* The truncate log inode's mutex serializes truncate-log users */
1251 mutex_lock(&tl_inode->i_mutex);
1252
1253 if (ocfs2_truncate_log_needs_flush(osb)) {
1254 ret = __ocfs2_flush_truncate_log(osb);
1255 if (ret < 0) {
1256 mlog_errno(ret);
1257 goto out;
1258 }
1259 }
1260
1261 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1262 if (IS_ERR(handle)) {
1263 ret = PTR_ERR(handle);
1264 mlog_errno(ret);
1265 goto out;
1266 }
1267
1268 ret = ocfs2_journal_access(handle, inode, di_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (ret) {
1271 mlog_errno(ret);
 /* NOTE(review): this jumps past out_commit with the handle
  * started, apparently leaking the transaction — confirm
  * whether this should be 'goto out_commit'. */
1272 goto out;
1273 }
1274
1275 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
1276 dealloc);
1277 if (ret) {
1278 mlog_errno(ret);
1279 goto out_commit;
1280 }
1281
 /* Shrink the cluster count in memory and on disk together */
1282 OCFS2_I(inode)->ip_clusters -= len;
1283 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1284
1285 ret = ocfs2_journal_dirty(handle, di_bh);
1286 if (ret) {
1287 mlog_errno(ret);
1288 goto out_commit;
1289 }
1290
 /* Queue the freed physical clusters onto the truncate log */
1291 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1292 if (ret)
1293 mlog_errno(ret);
1294
1295out_commit:
1296 ocfs2_commit_trans(osb, handle);
1297out:
1298 mutex_unlock(&tl_inode->i_mutex);
1299
1300 if (meta_ac)
1301 ocfs2_free_alloc_context(meta_ac);
1302
1303 return ret;
1304}
1305
1306/* 1287/*
1307 * Truncate a byte range, avoiding pages within partial clusters. This 1288 * Truncate a byte range, avoiding pages within partial clusters. This
1308 * preserves those pages for the zeroing code to write to. 1289 * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1402 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1383 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1403 struct ocfs2_cached_dealloc_ctxt dealloc; 1384 struct ocfs2_cached_dealloc_ctxt dealloc;
1404 struct address_space *mapping = inode->i_mapping; 1385 struct address_space *mapping = inode->i_mapping;
1386 struct ocfs2_extent_tree et;
1405 1387
1388 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1406 ocfs2_init_dealloc_ctxt(&dealloc); 1389 ocfs2_init_dealloc_ctxt(&dealloc);
1407 1390
1408 if (byte_len == 0) 1391 if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1458 1441
1459 /* Only do work for non-holes */ 1442 /* Only do work for non-holes */
1460 if (phys_cpos != 0) { 1443 if (phys_cpos != 0) {
1461 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, 1444 ret = ocfs2_remove_btree_range(inode, &et, cpos,
1462 phys_cpos, alloc_size, 1445 phys_cpos, alloc_size,
1463 &dealloc); 1446 &dealloc);
1464 if (ret) { 1447 if (ret) {
1465 mlog_errno(ret); 1448 mlog_errno(ret);
1466 goto out; 1449 goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1622 struct ocfs2_space_resv *sr) 1605 struct ocfs2_space_resv *sr)
1623{ 1606{
1624 struct inode *inode = file->f_path.dentry->d_inode; 1607 struct inode *inode = file->f_path.dentry->d_inode;
1625 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; 1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1626 1609
1627 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1628 !ocfs2_writes_unwritten_extents(osb)) 1611 !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
51 struct ocfs2_alloc_context *data_ac, 51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac, 52 struct ocfs2_alloc_context *meta_ac,
53 enum ocfs2_alloc_restarted *reason_ret); 53 enum ocfs2_alloc_restarted *reason_ret);
54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh,
56 u64 new_i_size);
54int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
55 u64 zero_to); 58 u64 zero_to);
56int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/quotaops.h>
31 32
32#include <asm/byteorder.h> 33#include <asm/byteorder.h>
33 34
@@ -37,6 +38,7 @@
37#include "ocfs2.h" 38#include "ocfs2.h"
38 39
39#include "alloc.h" 40#include "alloc.h"
41#include "blockcheck.h"
40#include "dlmglue.h" 42#include "dlmglue.h"
41#include "extent_map.h" 43#include "extent_map.h"
42#include "file.h" 44#include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
214 return 0; 216 return 0;
215} 217}
216 218
217int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 219void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
218 int create_ino) 220 int create_ino)
219{ 221{
220 struct super_block *sb; 222 struct super_block *sb;
221 struct ocfs2_super *osb; 223 struct ocfs2_super *osb;
222 int status = -EINVAL;
223 int use_plocks = 1; 224 int use_plocks = 1;
224 225
225 mlog_entry("(0x%p, size:%llu)\n", inode, 226 mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
232 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) 233 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
233 use_plocks = 0; 234 use_plocks = 0;
234 235
235 /* this means that read_inode cannot create a superblock inode 236 /*
236 * today. change if needed. */ 237 * These have all been checked by ocfs2_read_inode_block() or set
237 if (!OCFS2_IS_VALID_DINODE(fe) || 238 * by ocfs2_mknod_locked(), so a failure is a code bug.
238 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 239 */
239 mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " 240 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode
240 "signature = %.*s, flags = 0x%x\n", 241 cannot create a superblock
241 inode->i_ino, 242 inode today. change if
242 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 243 that is needed. */
243 fe->i_signature, le32_to_cpu(fe->i_flags)); 244 BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
244 goto bail; 245 BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
245 }
246 246
247 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
248 mlog(ML_ERROR, "file entry generation does not match "
249 "superblock! osb->fs_generation=%x, "
250 "fe->i_fs_generation=%x\n",
251 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
252 goto bail;
253 }
254 247
255 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 248 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
256 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 249 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
284 277
285 inode->i_nlink = le16_to_cpu(fe->i_links_count); 278 inode->i_nlink = le16_to_cpu(fe->i_links_count);
286 279
287 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
288 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
282 inode->i_flags |= S_NOQUOTA;
283 }
289 284
290 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
291 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
292 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); 287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
293 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
294 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
291 inode->i_flags |= S_NOQUOTA;
295 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 292 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
296 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); 293 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
297 /* we can't actually hit this as read_inode can't 294 /* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
354 351
355 ocfs2_set_inode_flags(inode); 352 ocfs2_set_inode_flags(inode);
356 353
357 status = 0; 354 mlog_exit_void();
358bail:
359 mlog_exit(status);
360 return status;
361} 355}
362 356
363static int ocfs2_read_locked_inode(struct inode *inode, 357static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
460 } 454 }
461 } 455 }
462 456
463 if (can_lock) 457 if (can_lock) {
464 status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, 458 status = ocfs2_read_inode_block_full(inode, &bh,
465 OCFS2_BH_IGNORE_CACHE); 459 OCFS2_BH_IGNORE_CACHE);
466 else 460 } else {
467 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 461 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
462 if (!status)
463 status = ocfs2_validate_inode_block(osb->sb, bh);
464 }
468 if (status < 0) { 465 if (status < 0) {
469 mlog_errno(status); 466 mlog_errno(status);
470 goto bail; 467 goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
472 469
473 status = -EINVAL; 470 status = -EINVAL;
474 fe = (struct ocfs2_dinode *) bh->b_data; 471 fe = (struct ocfs2_dinode *) bh->b_data;
475 if (!OCFS2_IS_VALID_DINODE(fe)) {
476 mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
477 (unsigned long long)args->fi_blkno, 7,
478 fe->i_signature);
479 goto bail;
480 }
481 472
482 /* 473 /*
483 * This is a code bug. Right now the caller needs to 474 * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
491 482
492 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 483 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
493 S_ISBLK(le16_to_cpu(fe->i_mode))) 484 S_ISBLK(le16_to_cpu(fe->i_mode)))
494 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 485 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
495 486
496 if (ocfs2_populate_inode(inode, fe, 0) < 0) 487 ocfs2_populate_inode(inode, fe, 0);
497 goto bail;
498 488
499 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 489 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
500 490
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
547 goto out; 537 goto out;
548 } 538 }
549 539
550 status = ocfs2_journal_access(handle, inode, fe_bh, 540 status = ocfs2_journal_access_di(handle, inode, fe_bh,
551 OCFS2_JOURNAL_ACCESS_WRITE); 541 OCFS2_JOURNAL_ACCESS_WRITE);
552 if (status < 0) { 542 if (status < 0) {
553 mlog_errno(status); 543 mlog_errno(status);
554 goto out; 544 goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
615 goto bail; 605 goto bail;
616 } 606 }
617 607
618 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); 608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb));
619 if (IS_ERR(handle)) { 610 if (IS_ERR(handle)) {
620 status = PTR_ERR(handle); 611 status = PTR_ERR(handle);
621 mlog_errno(status); 612 mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
630 } 621 }
631 622
632 /* set the inodes dtime */ 623 /* set the inodes dtime */
633 status = ocfs2_journal_access(handle, inode, di_bh, 624 status = ocfs2_journal_access_di(handle, inode, di_bh,
634 OCFS2_JOURNAL_ACCESS_WRITE); 625 OCFS2_JOURNAL_ACCESS_WRITE);
635 if (status < 0) { 626 if (status < 0) {
636 mlog_errno(status); 627 mlog_errno(status);
637 goto bail_commit; 628 goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
647 } 638 }
648 639
649 ocfs2_remove_from_cache(inode, di_bh); 640 ocfs2_remove_from_cache(inode, di_bh);
641 vfs_dq_free_inode(inode);
650 642
651 status = ocfs2_free_dinode(handle, inode_alloc_inode, 643 status = ocfs2_free_dinode(handle, inode_alloc_inode,
652 inode_alloc_bh, di); 644 inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
929 921
930 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 922 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
931 923
932 if (is_bad_inode(inode)) { 924 /* When we fail in read_inode() we mark inode as bad. The second test
925 * catches the case when inode allocation fails before allocating
926 * a block for inode. */
927 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
933 mlog(0, "Skipping delete of bad inode\n"); 928 mlog(0, "Skipping delete of bad inode\n");
934 goto bail; 929 goto bail;
935 } 930 }
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1195 mlog_entry("(inode %llu)\n", 1190 mlog_entry("(inode %llu)\n",
1196 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1191 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1197 1192
1198 status = ocfs2_journal_access(handle, inode, bh, 1193 status = ocfs2_journal_access_di(handle, inode, bh,
1199 OCFS2_JOURNAL_ACCESS_WRITE); 1194 OCFS2_JOURNAL_ACCESS_WRITE);
1200 if (status < 0) { 1195 if (status < 0) {
1201 mlog_errno(status); 1196 mlog_errno(status);
1202 goto leave; 1197 goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
1264 1259
1265 spin_unlock(&OCFS2_I(inode)->ip_lock); 1260 spin_unlock(&OCFS2_I(inode)->ip_lock);
1266} 1261}
1262
1263int ocfs2_validate_inode_block(struct super_block *sb,
1264 struct buffer_head *bh)
1265{
1266 int rc;
1267 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1268
1269 mlog(0, "Validating dinode %llu\n",
1270 (unsigned long long)bh->b_blocknr);
1271
1272 BUG_ON(!buffer_uptodate(bh));
1273
1274 /*
1275 * If the ecc fails, we return the error but otherwise
1276 * leave the filesystem running. We know any error is
1277 * local to this block.
1278 */
1279 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
1280 if (rc) {
1281 mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
1282 (unsigned long long)bh->b_blocknr);
1283 goto bail;
1284 }
1285
1286 /*
1287 * Errors after here are fatal.
1288 */
1289
1290 rc = -EINVAL;
1291
1292 if (!OCFS2_IS_VALID_DINODE(di)) {
1293 ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
1294 (unsigned long long)bh->b_blocknr, 7,
1295 di->i_signature);
1296 goto bail;
1297 }
1298
1299 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1300 ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
1301 (unsigned long long)bh->b_blocknr,
1302 (unsigned long long)le64_to_cpu(di->i_blkno));
1303 goto bail;
1304 }
1305
1306 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1307 ocfs2_error(sb,
1308 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
1309 (unsigned long long)bh->b_blocknr);
1310 goto bail;
1311 }
1312
1313 if (le32_to_cpu(di->i_fs_generation) !=
1314 OCFS2_SB(sb)->fs_generation) {
1315 ocfs2_error(sb,
1316 "Invalid dinode #%llu: fs_generation is %u\n",
1317 (unsigned long long)bh->b_blocknr,
1318 le32_to_cpu(di->i_fs_generation));
1319 goto bail;
1320 }
1321
1322 rc = 0;
1323
1324bail:
1325 return rc;
1326}
1327
1328int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1329 int flags)
1330{
1331 int rc;
1332 struct buffer_head *tmp = *bh;
1333
1334 rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
1335 flags, ocfs2_validate_inode_block);
1336
1337 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1338 if (!rc && !*bh)
1339 *bh = tmp;
1340
1341 return rc;
1342}
1343
1344int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
1345{
1346 return ocfs2_read_inode_block_full(inode, bh, 0);
1347}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 128 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 129int ocfs2_inode_init_private(struct inode *inode);
130int ocfs2_inode_revalidate(struct dentry *dentry); 130int ocfs2_inode_revalidate(struct dentry *dentry);
131int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 131void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
132 int create_ino); 132 int create_ino);
133void ocfs2_read_inode(struct inode *inode); 133void ocfs2_read_inode(struct inode *inode);
134void ocfs2_read_inode2(struct inode *inode, void *opaque); 134void ocfs2_read_inode2(struct inode *inode, void *opaque);
135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, 135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
142 struct buffer_head *bh); 142 struct buffer_head *bh);
143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); 143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); 144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
145struct buffer_head *ocfs2_bread(struct inode *inode,
146 int block, int *err, int reada);
145 147
146void ocfs2_set_inode_flags(struct inode *inode); 148void ocfs2_set_inode_flags(struct inode *inode);
147void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); 149void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
153 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); 155 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
154} 156}
155 157
158/* Validate that a bh contains a valid inode */
159int ocfs2_validate_inode_block(struct super_block *sb,
160 struct buffer_head *bh);
161/*
162 * Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
163 * This is a cached read. The inode will be validated with
164 * ocfs2_validate_inode_block().
165 */
166int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
167/* The same, but can be passed OCFS2_BH_* flags */
168int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
169 int flags);
156#endif /* OCFS2_INODE_H */ 170#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "blockcheck.h"
38#include "dir.h" 39#include "dir.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "extent_map.h" 41#include "extent_map.h"
@@ -45,6 +46,7 @@
45#include "slot_map.h" 46#include "slot_map.h"
46#include "super.h" 47#include "super.h"
47#include "sysfile.h" 48#include "sysfile.h"
49#include "quota.h"
48 50
49#include "buffer_head_io.h" 51#include "buffer_head_io.h"
50 52
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
52 54
53static int ocfs2_force_read_journal(struct inode *inode); 55static int ocfs2_force_read_journal(struct inode *inode);
54static int ocfs2_recover_node(struct ocfs2_super *osb, 56static int ocfs2_recover_node(struct ocfs2_super *osb,
55 int node_num); 57 int node_num, int slot_num);
56static int __ocfs2_recovery_thread(void *arg); 58static int __ocfs2_recovery_thread(void *arg);
57static int ocfs2_commit_cache(struct ocfs2_super *osb); 59static int ocfs2_commit_cache(struct ocfs2_super *osb);
58static int ocfs2_wait_on_mount(struct ocfs2_super *osb); 60static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
59static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 61static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
60 int dirty, int replayed); 62 int dirty, int replayed);
61static int ocfs2_trylock_journal(struct ocfs2_super *osb, 63static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
64 int slot); 66 int slot);
65static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
66 68
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{
71 return __ocfs2_wait_on_mount(osb, 0);
72}
73
74static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
75{
76 return __ocfs2_wait_on_mount(osb, 1);
77}
78
79
67 80
68/* 81/*
69 * The recovery_list is a simple linked list of node numbers to recover. 82 * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
256 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); 269 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
257 BUG_ON(max_buffs <= 0); 270 BUG_ON(max_buffs <= 0);
258 271
259 /* JBD might support this, but our journalling code doesn't yet. */ 272 /* Nested transaction? Just return the handle... */
260 if (journal_current_handle()) { 273 if (journal_current_handle())
261 mlog(ML_ERROR, "Recursive transaction attempted!\n"); 274 return jbd2_journal_start(journal, max_buffs);
262 BUG();
263 }
264 275
265 down_read(&osb->journal->j_trans_barrier); 276 down_read(&osb->journal->j_trans_barrier);
266 277
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
285int ocfs2_commit_trans(struct ocfs2_super *osb, 296int ocfs2_commit_trans(struct ocfs2_super *osb,
286 handle_t *handle) 297 handle_t *handle)
287{ 298{
288 int ret; 299 int ret, nested;
289 struct ocfs2_journal *journal = osb->journal; 300 struct ocfs2_journal *journal = osb->journal;
290 301
291 BUG_ON(!handle); 302 BUG_ON(!handle);
292 303
304 nested = handle->h_ref > 1;
293 ret = jbd2_journal_stop(handle); 305 ret = jbd2_journal_stop(handle);
294 if (ret < 0) 306 if (ret < 0)
295 mlog_errno(ret); 307 mlog_errno(ret);
296 308
297 up_read(&journal->j_trans_barrier); 309 if (!nested)
310 up_read(&journal->j_trans_barrier);
298 311
299 return ret; 312 return ret;
300} 313}
@@ -357,10 +370,137 @@ bail:
357 return status; 370 return status;
358} 371}
359 372
360int ocfs2_journal_access(handle_t *handle, 373struct ocfs2_triggers {
361 struct inode *inode, 374 struct jbd2_buffer_trigger_type ot_triggers;
362 struct buffer_head *bh, 375 int ot_offset;
363 int type) 376};
377
378static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
379{
380 return container_of(triggers, struct ocfs2_triggers, ot_triggers);
381}
382
383static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
384 struct buffer_head *bh,
385 void *data, size_t size)
386{
387 struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
388
389 /*
390 * We aren't guaranteed to have the superblock here, so we
391 * must unconditionally compute the ecc data.
392 * __ocfs2_journal_access() will only set the triggers if
393 * metaecc is enabled.
394 */
395 ocfs2_block_check_compute(data, size, data + ot->ot_offset);
396}
397
398/*
399 * Quota blocks have their own trigger because the struct ocfs2_block_check
400 * offset depends on the blocksize.
401 */
402static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
403 struct buffer_head *bh,
404 void *data, size_t size)
405{
406 struct ocfs2_disk_dqtrailer *dqt =
407 ocfs2_block_dqtrailer(size, data);
408
409 /*
410 * We aren't guaranteed to have the superblock here, so we
411 * must unconditionally compute the ecc data.
412 * __ocfs2_journal_access() will only set the triggers if
413 * metaecc is enabled.
414 */
415 ocfs2_block_check_compute(data, size, &dqt->dq_check);
416}
417
418/*
419 * Directory blocks also have their own trigger because the
420 * struct ocfs2_block_check offset depends on the blocksize.
421 */
422static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
423 struct buffer_head *bh,
424 void *data, size_t size)
425{
426 struct ocfs2_dir_block_trailer *trailer =
427 ocfs2_dir_trailer_from_size(size, data);
428
429 /*
430 * We aren't guaranteed to have the superblock here, so we
431 * must unconditionally compute the ecc data.
432 * __ocfs2_journal_access() will only set the triggers if
433 * metaecc is enabled.
434 */
435 ocfs2_block_check_compute(data, size, &trailer->db_check);
436}
437
438static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
439 struct buffer_head *bh)
440{
441 mlog(ML_ERROR,
442 "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
443 "bh->b_blocknr = %llu\n",
444 (unsigned long)bh,
445 (unsigned long long)bh->b_blocknr);
446
447 /* We aren't guaranteed to have the superblock here - but if we
448 * don't, it'll just crash. */
449 ocfs2_error(bh->b_assoc_map->host->i_sb,
450 "JBD2 has aborted our journal, ocfs2 cannot continue\n");
451}
452
453static struct ocfs2_triggers di_triggers = {
454 .ot_triggers = {
455 .t_commit = ocfs2_commit_trigger,
456 .t_abort = ocfs2_abort_trigger,
457 },
458 .ot_offset = offsetof(struct ocfs2_dinode, i_check),
459};
460
461static struct ocfs2_triggers eb_triggers = {
462 .ot_triggers = {
463 .t_commit = ocfs2_commit_trigger,
464 .t_abort = ocfs2_abort_trigger,
465 },
466 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
467};
468
469static struct ocfs2_triggers gd_triggers = {
470 .ot_triggers = {
471 .t_commit = ocfs2_commit_trigger,
472 .t_abort = ocfs2_abort_trigger,
473 },
474 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
475};
476
477static struct ocfs2_triggers db_triggers = {
478 .ot_triggers = {
479 .t_commit = ocfs2_db_commit_trigger,
480 .t_abort = ocfs2_abort_trigger,
481 },
482};
483
484static struct ocfs2_triggers xb_triggers = {
485 .ot_triggers = {
486 .t_commit = ocfs2_commit_trigger,
487 .t_abort = ocfs2_abort_trigger,
488 },
489 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
490};
491
492static struct ocfs2_triggers dq_triggers = {
493 .ot_triggers = {
494 .t_commit = ocfs2_dq_commit_trigger,
495 .t_abort = ocfs2_abort_trigger,
496 },
497};
498
499static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode,
501 struct buffer_head *bh,
502 struct ocfs2_triggers *triggers,
503 int type)
364{ 504{
365 int status; 505 int status;
366 506
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
406 status = -EINVAL; 546 status = -EINVAL;
407 mlog(ML_ERROR, "Uknown access type!\n"); 547 mlog(ML_ERROR, "Uknown access type!\n");
408 } 548 }
549 if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
550 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
409 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 551 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
410 552
411 if (status < 0) 553 if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
416 return status; 558 return status;
417} 559}
418 560
561int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
562 struct buffer_head *bh, int type)
563{
564 return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
565 type);
566}
567
568int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
569 struct buffer_head *bh, int type)
570{
571 return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
572 type);
573}
574
575int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
576 struct buffer_head *bh, int type)
577{
578 return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
579 type);
580}
581
582int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
583 struct buffer_head *bh, int type)
584{
585 return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
586 type);
587}
588
589int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
590 struct buffer_head *bh, int type)
591{
592 return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
593 type);
594}
595
596int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
597 struct buffer_head *bh, int type)
598{
599 return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
600 type);
601}
602
603int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type)
605{
606 return __ocfs2_journal_access(handle, inode, bh, NULL, type);
607}
608
419int ocfs2_journal_dirty(handle_t *handle, 609int ocfs2_journal_dirty(handle_t *handle,
420 struct buffer_head *bh) 610 struct buffer_head *bh)
421{ 611{
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
434 return status; 624 return status;
435} 625}
436 626
437#ifdef CONFIG_OCFS2_COMPAT_JBD
438int ocfs2_journal_dirty_data(handle_t *handle,
439 struct buffer_head *bh)
440{
441 int err = journal_dirty_data(handle, bh);
442 if (err)
443 mlog_errno(err);
444 /* TODO: When we can handle it, abort the handle and go RO on
445 * error here. */
446
447 return err;
448}
449#endif
450
451#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 627#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
452 628
453void ocfs2_set_journal_params(struct ocfs2_super *osb) 629void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
587 mlog_entry_void(); 763 mlog_entry_void();
588 764
589 fe = (struct ocfs2_dinode *)bh->b_data; 765 fe = (struct ocfs2_dinode *)bh->b_data;
590 if (!OCFS2_IS_VALID_DINODE(fe)) { 766
591 /* This is called from startup/shutdown which will 767 /* The journal bh on the osb always comes from ocfs2_journal_init()
592 * handle the errors in a specific manner, so no need 768 * and was validated there inside ocfs2_inode_lock_full(). It's a
593 * to call ocfs2_error() here. */ 769 * code bug if we mess it up. */
594 mlog(ML_ERROR, "Journal dinode %llu has invalid " 770 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
595 "signature: %.*s",
596 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
597 fe->i_signature);
598 status = -EIO;
599 goto out;
600 }
601 771
602 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 772 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
603 if (dirty) 773 if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
609 if (replayed) 779 if (replayed)
610 ocfs2_bump_recovery_generation(fe); 780 ocfs2_bump_recovery_generation(fe);
611 781
782 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
612 status = ocfs2_write_block(osb, bh, journal->j_inode); 783 status = ocfs2_write_block(osb, bh, journal->j_inode);
613 if (status < 0) 784 if (status < 0)
614 mlog_errno(status); 785 mlog_errno(status);
615 786
616out:
617 mlog_exit(status); 787 mlog_exit(status);
618 return status; 788 return status;
619} 789}
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
878 int lri_slot; 1048 int lri_slot;
879 struct ocfs2_dinode *lri_la_dinode; 1049 struct ocfs2_dinode *lri_la_dinode;
880 struct ocfs2_dinode *lri_tl_dinode; 1050 struct ocfs2_dinode *lri_tl_dinode;
1051 struct ocfs2_quota_recovery *lri_qrec;
881}; 1052};
882 1053
883/* Does the second half of the recovery process. By this point, the 1054/* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
898 struct ocfs2_super *osb = journal->j_osb; 1069 struct ocfs2_super *osb = journal->j_osb;
899 struct ocfs2_dinode *la_dinode, *tl_dinode; 1070 struct ocfs2_dinode *la_dinode, *tl_dinode;
900 struct ocfs2_la_recovery_item *item, *n; 1071 struct ocfs2_la_recovery_item *item, *n;
1072 struct ocfs2_quota_recovery *qrec;
901 LIST_HEAD(tmp_la_list); 1073 LIST_HEAD(tmp_la_list);
902 1074
903 mlog_entry_void(); 1075 mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
913 1085
914 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 1086 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
915 1087
1088 ocfs2_wait_on_quotas(osb);
1089
916 la_dinode = item->lri_la_dinode; 1090 la_dinode = item->lri_la_dinode;
917 if (la_dinode) { 1091 if (la_dinode) {
918 mlog(0, "Clean up local alloc %llu\n", 1092 mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
943 if (ret < 0) 1117 if (ret < 0)
944 mlog_errno(ret); 1118 mlog_errno(ret);
945 1119
1120 qrec = item->lri_qrec;
1121 if (qrec) {
1122 mlog(0, "Recovering quota files");
1123 ret = ocfs2_finish_quota_recovery(osb, qrec,
1124 item->lri_slot);
1125 if (ret < 0)
1126 mlog_errno(ret);
1127 /* Recovery info is already freed now */
1128 }
1129
946 kfree(item); 1130 kfree(item);
947 } 1131 }
948 1132
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
956static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 1140static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
957 int slot_num, 1141 int slot_num,
958 struct ocfs2_dinode *la_dinode, 1142 struct ocfs2_dinode *la_dinode,
959 struct ocfs2_dinode *tl_dinode) 1143 struct ocfs2_dinode *tl_dinode,
1144 struct ocfs2_quota_recovery *qrec)
960{ 1145{
961 struct ocfs2_la_recovery_item *item; 1146 struct ocfs2_la_recovery_item *item;
962 1147
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
971 if (tl_dinode) 1156 if (tl_dinode)
972 kfree(tl_dinode); 1157 kfree(tl_dinode);
973 1158
1159 if (qrec)
1160 ocfs2_free_quota_recovery(qrec);
1161
974 mlog_errno(-ENOMEM); 1162 mlog_errno(-ENOMEM);
975 return; 1163 return;
976 } 1164 }
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
979 item->lri_la_dinode = la_dinode; 1167 item->lri_la_dinode = la_dinode;
980 item->lri_slot = slot_num; 1168 item->lri_slot = slot_num;
981 item->lri_tl_dinode = tl_dinode; 1169 item->lri_tl_dinode = tl_dinode;
1170 item->lri_qrec = qrec;
982 1171
983 spin_lock(&journal->j_lock); 1172 spin_lock(&journal->j_lock);
984 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1173 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
998 ocfs2_queue_recovery_completion(journal, 1187 ocfs2_queue_recovery_completion(journal,
999 osb->slot_num, 1188 osb->slot_num,
1000 osb->local_alloc_copy, 1189 osb->local_alloc_copy,
1190 NULL,
1001 NULL); 1191 NULL);
1002 ocfs2_schedule_truncate_log_flush(osb, 0); 1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1003 1193
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1006 } 1196 }
1007} 1197}
1008 1198
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
1200{
1201 if (osb->quota_rec) {
1202 ocfs2_queue_recovery_completion(osb->journal,
1203 osb->slot_num,
1204 NULL,
1205 NULL,
1206 osb->quota_rec);
1207 osb->quota_rec = NULL;
1208 }
1209}
1210
1009static int __ocfs2_recovery_thread(void *arg) 1211static int __ocfs2_recovery_thread(void *arg)
1010{ 1212{
1011 int status, node_num; 1213 int status, node_num, slot_num;
1012 struct ocfs2_super *osb = arg; 1214 struct ocfs2_super *osb = arg;
1013 struct ocfs2_recovery_map *rm = osb->recovery_map; 1215 struct ocfs2_recovery_map *rm = osb->recovery_map;
1216 int *rm_quota = NULL;
1217 int rm_quota_used = 0, i;
1218 struct ocfs2_quota_recovery *qrec;
1014 1219
1015 mlog_entry_void(); 1220 mlog_entry_void();
1016 1221
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
1019 goto bail; 1224 goto bail;
1020 } 1225 }
1021 1226
1227 rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
1228 if (!rm_quota) {
1229 status = -ENOMEM;
1230 goto bail;
1231 }
1022restart: 1232restart:
1023 status = ocfs2_super_lock(osb, 1); 1233 status = ocfs2_super_lock(osb, 1);
1024 if (status < 0) { 1234 if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
1032 * clear it until ocfs2_recover_node() has succeeded. */ 1242 * clear it until ocfs2_recover_node() has succeeded. */
1033 node_num = rm->rm_entries[0]; 1243 node_num = rm->rm_entries[0];
1034 spin_unlock(&osb->osb_lock); 1244 spin_unlock(&osb->osb_lock);
1035 1245 mlog(0, "checking node %d\n", node_num);
1036 status = ocfs2_recover_node(osb, node_num); 1246 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1247 if (slot_num == -ENOENT) {
1248 status = 0;
1249 mlog(0, "no slot for this node, so no recovery"
1250 "required.\n");
1251 goto skip_recovery;
1252 }
1253 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1254
1255 /* It is a bit subtle with quota recovery. We cannot do it
1256 * immediately because we have to obtain cluster locks from
1257 * quota files and we also don't want to just skip it because
1258 * then quota usage would be out of sync until some node takes
1259 * the slot. So we remember which nodes need quota recovery
1260 * and when everything else is done, we recover quotas. */
1261 for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
1262 if (i == rm_quota_used)
1263 rm_quota[rm_quota_used++] = slot_num;
1264
1265 status = ocfs2_recover_node(osb, node_num, slot_num);
1266skip_recovery:
1037 if (!status) { 1267 if (!status) {
1038 ocfs2_recovery_map_clear(osb, node_num); 1268 ocfs2_recovery_map_clear(osb, node_num);
1039 } else { 1269 } else {
@@ -1055,13 +1285,27 @@ restart:
1055 if (status < 0) 1285 if (status < 0)
1056 mlog_errno(status); 1286 mlog_errno(status);
1057 1287
1288 /* Now it is right time to recover quotas... We have to do this under
1289 * superblock lock so that no one can start using the slot (and crash)
1290 * before we recover it */
1291 for (i = 0; i < rm_quota_used; i++) {
1292 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
1293 if (IS_ERR(qrec)) {
1294 status = PTR_ERR(qrec);
1295 mlog_errno(status);
1296 continue;
1297 }
1298 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
1299 NULL, NULL, qrec);
1300 }
1301
1058 ocfs2_super_unlock(osb, 1); 1302 ocfs2_super_unlock(osb, 1);
1059 1303
1060 /* We always run recovery on our own orphan dir - the dead 1304 /* We always run recovery on our own orphan dir - the dead
1061 * node(s) may have disallowd a previos inode delete. Re-processing 1305 * node(s) may have disallowd a previos inode delete. Re-processing
1062 * is therefore required. */ 1306 * is therefore required. */
1063 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1064 NULL); 1308 NULL, NULL);
1065 1309
1066bail: 1310bail:
1067 mutex_lock(&osb->recovery_lock); 1311 mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
1076 1320
1077 mutex_unlock(&osb->recovery_lock); 1321 mutex_unlock(&osb->recovery_lock);
1078 1322
1323 if (rm_quota)
1324 kfree(rm_quota);
1325
1079 mlog_exit(status); 1326 mlog_exit(status);
1080 /* no one is calling kthread_stop() for us so the kthread() api 1327
1081 * requires that we call do_exit(). And it isn't exported, but 1328 * requires that we call do_exit(). And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
1135 } 1382 }
1136 SET_INODE_JOURNAL(inode); 1383 SET_INODE_JOURNAL(inode);
1137 1384
1138 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, 1385 status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
1139 OCFS2_BH_IGNORE_CACHE);
1140 if (status < 0) { 1386 if (status < 0) {
1141 mlog_errno(status); 1387 mlog_errno(status);
1142 goto bail; 1388 goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1268 osb->slot_recovery_generations[slot_num] = 1514 osb->slot_recovery_generations[slot_num] =
1269 ocfs2_get_recovery_generation(fe); 1515 ocfs2_get_recovery_generation(fe);
1270 1516
1517 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
1271 status = ocfs2_write_block(osb, bh, inode); 1518 status = ocfs2_write_block(osb, bh, inode);
1272 if (status < 0) 1519 if (status < 0)
1273 mlog_errno(status); 1520 mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
1304 * far less concerning. 1551 * far less concerning.
1305 */ 1552 */
1306static int ocfs2_recover_node(struct ocfs2_super *osb, 1553static int ocfs2_recover_node(struct ocfs2_super *osb,
1307 int node_num) 1554 int node_num, int slot_num)
1308{ 1555{
1309 int status = 0; 1556 int status = 0;
1310 int slot_num;
1311 struct ocfs2_dinode *la_copy = NULL; 1557 struct ocfs2_dinode *la_copy = NULL;
1312 struct ocfs2_dinode *tl_copy = NULL; 1558 struct ocfs2_dinode *tl_copy = NULL;
1313 1559
1314 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1560 mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
1315 node_num, osb->node_num); 1561 node_num, slot_num, osb->node_num);
1316
1317 mlog(0, "checking node %d\n", node_num);
1318 1562
1319 /* Should not ever be called to recover ourselves -- in that 1563 /* Should not ever be called to recover ourselves -- in that
1320 * case we should've called ocfs2_journal_load instead. */ 1564 * case we should've called ocfs2_journal_load instead. */
1321 BUG_ON(osb->node_num == node_num); 1565 BUG_ON(osb->node_num == node_num);
1322 1566
1323 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1324 if (slot_num == -ENOENT) {
1325 status = 0;
1326 mlog(0, "no slot for this node, so no recovery required.\n");
1327 goto done;
1328 }
1329
1330 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1331
1332 status = ocfs2_replay_journal(osb, node_num, slot_num); 1567 status = ocfs2_replay_journal(osb, node_num, slot_num);
1333 if (status < 0) { 1568 if (status < 0) {
1334 if (status == -EBUSY) { 1569 if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1364 1599
1365 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1600 /* This will kfree the memory pointed to by la_copy and tl_copy */
1366 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1601 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1367 tl_copy); 1602 tl_copy, NULL);
1368 1603
1369 status = 0; 1604 status = 0;
1370done: 1605done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1659 return ret; 1894 return ret;
1660} 1895}
1661 1896
1662static int ocfs2_wait_on_mount(struct ocfs2_super *osb) 1897static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
1663{ 1898{
1664 /* This check is good because ocfs2 will wait on our recovery 1899 /* This check is good because ocfs2 will wait on our recovery
1665 * thread before changing it to something other than MOUNTED 1900 * thread before changing it to something other than MOUNTED
1666 * or DISABLED. */ 1901 * or DISABLED. */
1667 wait_event(osb->osb_mount_event, 1902 wait_event(osb->osb_mount_event,
1668 atomic_read(&osb->vol_state) == VOLUME_MOUNTED || 1903 (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
1904 atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
1669 atomic_read(&osb->vol_state) == VOLUME_DISABLED); 1905 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1670 1906
1671 /* If there's an error on mount, then we may never get to the 1907 /* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
27#define OCFS2_JOURNAL_H 27#define OCFS2_JOURNAL_H
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#ifndef CONFIG_OCFS2_COMPAT_JBD 30#include <linux/jbd2.h>
31# include <linux/jbd2.h>
32#else
33# include <linux/jbd.h>
34# include "ocfs2_jbd_compat.h"
35#endif
36 31
37enum ocfs2_journal_state { 32enum ocfs2_journal_state {
38 OCFS2_JOURNAL_FREE = 0, 33 OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb,
173 int node_num); 168 int node_num);
174int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); 169int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
175void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); 170void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
171void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
176 172
177static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 173static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
178{ 174{
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
216 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may 212 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
217 * commit the handle to disk in the process, but will 213 * commit the handle to disk in the process, but will
218 * not release any locks taken during the transaction. 214 * not release any locks taken during the transaction.
219 * ocfs2_journal_access - Notify the handle that we want to journal this 215 * ocfs2_journal_access* - Notify the handle that we want to journal this
220 * buffer. Will have to call ocfs2_journal_dirty once 216 * buffer. Will have to call ocfs2_journal_dirty once
221 * we've actually dirtied it. Type is one of . or . 217 * we've actually dirtied it. Type is one of . or .
218 * Always call the specific flavor of
219 * ocfs2_journal_access_*() unless you intend to
220 * manage the checksum by hand.
222 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 221 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
223 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before 222 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
224 * the current handle commits. 223 * the current handle commits.
@@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
248#define OCFS2_JOURNAL_ACCESS_WRITE 1 247#define OCFS2_JOURNAL_ACCESS_WRITE 1
249#define OCFS2_JOURNAL_ACCESS_UNDO 2 248#define OCFS2_JOURNAL_ACCESS_UNDO 2
250 249
251int ocfs2_journal_access(handle_t *handle, 250
252 struct inode *inode, 251/* ocfs2_inode */
253 struct buffer_head *bh, 252int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
254 int type); 253 struct buffer_head *bh, int type);
254/* ocfs2_extent_block */
255int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
256 struct buffer_head *bh, int type);
257/* ocfs2_group_desc */
258int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
259 struct buffer_head *bh, int type);
260/* ocfs2_xattr_block */
261int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
262 struct buffer_head *bh, int type);
263/* quota blocks */
264int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
265 struct buffer_head *bh, int type);
266/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type);
269/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type);
272
255/* 273/*
256 * A word about the journal_access/journal_dirty "dance". It is 274 * A word about the journal_access/journal_dirty "dance". It is
257 * entirely legal to journal_access a buffer more than once (as long 275 * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle,
273 */ 291 */
274int ocfs2_journal_dirty(handle_t *handle, 292int ocfs2_journal_dirty(handle_t *handle,
275 struct buffer_head *bh); 293 struct buffer_head *bh);
276#ifdef CONFIG_OCFS2_COMPAT_JBD
277int ocfs2_journal_dirty_data(handle_t *handle,
278 struct buffer_head *bh);
279#endif
280 294
281/* 295/*
282 * Credit Macros: 296 * Credit Macros:
@@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle,
293/* extended attribute block update */ 307/* extended attribute block update */
294#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 308#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
295 309
310/* global quotafile inode update, data block */
311#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
312
313/*
314 * The two writes below can accidentally see global info dirty due
315 * to set_info() quotactl so make them prepared for the writes.
316 */
317/* quota data block, global info */
318/* Write to local quota file */
319#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
320
321/* global quota data block, local quota data block, global quota inode,
322 * global quota info */
323#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
324
325static inline int ocfs2_quota_trans_credits(struct super_block *sb)
326{
327 int credits = 0;
328
329 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
330 credits += OCFS2_QWRITE_CREDITS;
331 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
332 credits += OCFS2_QWRITE_CREDITS;
333 return credits;
334}
335
336/* Number of credits needed for removing quota structure from file */
337int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
338/* Number of credits needed for initialization of new quota structure */
339int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
340
296/* group extend. inode update and last group update. */ 341/* group extend. inode update and last group update. */
297#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 342#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
298 343
@@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle,
303 * prev. group desc. if we relink. */ 348 * prev. group desc. if we relink. */
304#define OCFS2_SUBALLOC_ALLOC (3) 349#define OCFS2_SUBALLOC_ALLOC (3)
305 350
306#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \ 351static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
307 + OCFS2_INODE_UPDATE_CREDITS) 352{
353 return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
354 ocfs2_quota_trans_credits(sb);
355}
308 356
309/* dinode + group descriptor update. We don't relink on free yet. */ 357/* dinode + group descriptor update. We don't relink on free yet. */
310#define OCFS2_SUBALLOC_FREE (2) 358#define OCFS2_SUBALLOC_FREE (2)
@@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
313#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 361#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
314 + OCFS2_TRUNCATE_LOG_UPDATE) 362 + OCFS2_TRUNCATE_LOG_UPDATE)
315 363
316#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) 364static inline int ocfs2_remove_extent_credits(struct super_block *sb)
365{
366 return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
367 ocfs2_quota_trans_credits(sb);
368}
317 369
318/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
319 * bitmap block for the new bit) */ 371 * bitmap block for the new bit) */
320#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
321 373
322/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
323 * group descriptor + mkdir/symlink blocks */ 375 * group descriptor + mkdir/symlink blocks + quota update */
324#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ 376static inline int ocfs2_mknod_credits(struct super_block *sb)
325 + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) 377{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
379 ocfs2_quota_trans_credits(sb);
380}
326 381
327/* local alloc metadata change + main bitmap updates */ 382/* local alloc metadata change + main bitmap updates */
328#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ 383#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
@@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle,
332 * for the dinode, one for the new block. */ 387 * for the dinode, one for the new block. */
333#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
334 389
335/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ 390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
336#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) 391 * update on dir */
392static inline int ocfs2_link_credits(struct super_block *sb)
393{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
395 ocfs2_quota_trans_credits(sb);
396}
337 397
338/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
339 * dir inode link */ 399 * dir inode link */
340#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ 400static inline int ocfs2_unlink_credits(struct super_block *sb)
341 + OCFS2_LINK_CREDITS) 401{
402 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
404}
342 405
343/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
344 * inode alloc group descriptor */ 407 * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle,
347/* dinode update, old dir dinode update, new dir dinode update, old 410/* dinode update, old dir dinode update, new dir dinode update, old
348 * dir dir entry, new dir dir entry, dir entry update for renaming 411 * dir dir entry, new dir dir entry, dir entry update for renaming
349 * directory + target unlink */ 412 * directory + target unlink */
350#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ 413static inline int ocfs2_rename_credits(struct super_block *sb)
351 + OCFS2_UNLINK_CREDITS) 414{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
416}
352 417
353/* global bitmap dinode, group desc., relinked group, 418/* global bitmap dinode, group desc., relinked group,
354 * suballocator dinode, group desc., relinked group, 419 * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
386 * credit for the dinode there. */ 451 * credit for the dinode there. */
387 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); 452 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
388 453
389 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; 454 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
455 ocfs2_quota_trans_credits(sb);
390} 456}
391 457
392static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 458static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
393{ 459{
394 int blocks = OCFS2_MKNOD_CREDITS; 460 int blocks = ocfs2_mknod_credits(sb);
395 461
396 /* links can be longer than one block so we may update many 462 /* links can be longer than one block so we may update many
397 * within our single allocated extent. */ 463 * within our single allocated extent. */
398 blocks += ocfs2_clusters_to_blocks(sb, 1); 464 blocks += ocfs2_clusters_to_blocks(sb, 1);
399 465
400 return blocks; 466 return blocks + ocfs2_quota_trans_credits(sb);
401} 467}
402 468
403static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, 469static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
434 /* update to the truncate log. */ 500 /* update to the truncate log. */
435 credits += OCFS2_TRUNCATE_LOG_UPDATE; 501 credits += OCFS2_TRUNCATE_LOG_UPDATE;
436 502
503 credits += ocfs2_quota_trans_credits(sb);
504
437 return credits; 505 return credits;
438} 506}
439 507
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
36#include "ocfs2.h" 36#include "ocfs2.h"
37 37
38#include "alloc.h" 38#include "alloc.h"
39#include "blockcheck.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "inode.h" 41#include "inode.h"
41#include "journal.h" 42#include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
248 goto bail; 249 goto bail;
249 } 250 }
250 251
251 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 252 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
252 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 253 OCFS2_BH_IGNORE_CACHE);
253 if (status < 0) { 254 if (status < 0) {
254 mlog_errno(status); 255 mlog_errno(status);
255 goto bail; 256 goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
382 } 383 }
383 memcpy(alloc_copy, alloc, bh->b_size); 384 memcpy(alloc_copy, alloc, bh->b_size);
384 385
385 status = ocfs2_journal_access(handle, local_alloc_inode, bh, 386 status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
386 OCFS2_JOURNAL_ACCESS_WRITE); 387 OCFS2_JOURNAL_ACCESS_WRITE);
387 if (status < 0) { 388 if (status < 0) {
388 mlog_errno(status); 389 mlog_errno(status);
389 goto out_commit; 390 goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
459 460
460 mutex_lock(&inode->i_mutex); 461 mutex_lock(&inode->i_mutex);
461 462
462 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 463 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
463 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 464 OCFS2_BH_IGNORE_CACHE);
464 if (status < 0) { 465 if (status < 0) {
465 mlog_errno(status); 466 mlog_errno(status);
466 goto bail; 467 goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
476 alloc = (struct ocfs2_dinode *) alloc_bh->b_data; 477 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
477 ocfs2_clear_local_alloc(alloc); 478 ocfs2_clear_local_alloc(alloc);
478 479
480 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
479 status = ocfs2_write_block(osb, alloc_bh, inode); 481 status = ocfs2_write_block(osb, alloc_bh, inode);
480 if (status < 0) 482 if (status < 0)
481 mlog_errno(status); 483 mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
762 * delete bits from it! */ 764 * delete bits from it! */
763 *num_bits = bits_wanted; 765 *num_bits = bits_wanted;
764 766
765 status = ocfs2_journal_access(handle, local_alloc_inode, 767 status = ocfs2_journal_access_di(handle, local_alloc_inode,
766 osb->local_alloc_bh, 768 osb->local_alloc_bh,
767 OCFS2_JOURNAL_ACCESS_WRITE); 769 OCFS2_JOURNAL_ACCESS_WRITE);
768 if (status < 0) { 770 if (status < 0) {
769 mlog_errno(status); 771 mlog_errno(status);
770 goto bail; 772 goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1240 } 1242 }
1241 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); 1243 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
1242 1244
1243 status = ocfs2_journal_access(handle, local_alloc_inode, 1245 status = ocfs2_journal_access_di(handle, local_alloc_inode,
1244 osb->local_alloc_bh, 1246 osb->local_alloc_bh,
1245 OCFS2_JOURNAL_ACCESS_WRITE); 1247 OCFS2_JOURNAL_ACCESS_WRITE);
1246 if (status < 0) { 1248 if (status < 0) {
1247 mlog_errno(status); 1249 mlog_errno(status);
1248 goto bail; 1250 goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h>
43 44
44#define MLOG_MASK_PREFIX ML_NAMEI 45#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -61,17 +62,18 @@
61#include "sysfile.h" 62#include "sysfile.h"
62#include "uptodate.h" 63#include "uptodate.h"
63#include "xattr.h" 64#include "xattr.h"
65#include "acl.h"
64 66
65#include "buffer_head_io.h" 67#include "buffer_head_io.h"
66 68
67static int ocfs2_mknod_locked(struct ocfs2_super *osb, 69static int ocfs2_mknod_locked(struct ocfs2_super *osb,
68 struct inode *dir, 70 struct inode *dir,
69 struct dentry *dentry, int mode, 71 struct inode *inode,
72 struct dentry *dentry,
70 dev_t dev, 73 dev_t dev,
71 struct buffer_head **new_fe_bh, 74 struct buffer_head **new_fe_bh,
72 struct buffer_head *parent_fe_bh, 75 struct buffer_head *parent_fe_bh,
73 handle_t *handle, 76 handle_t *handle,
74 struct inode **ret_inode,
75 struct ocfs2_alloc_context *inode_ac); 77 struct ocfs2_alloc_context *inode_ac);
76 78
77static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 79static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
186 return ret; 188 return ret;
187} 189}
188 190
191static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
192{
193 struct inode *inode;
194
195 inode = new_inode(dir->i_sb);
196 if (!inode) {
197 mlog(ML_ERROR, "new_inode failed!\n");
198 return NULL;
199 }
200
201 /* populate as many fields early on as possible - many of
202 * these are used by the support functions here and in
203 * callers. */
204 if (S_ISDIR(mode))
205 inode->i_nlink = 2;
206 else
207 inode->i_nlink = 1;
208 inode->i_uid = current_fsuid();
209 if (dir->i_mode & S_ISGID) {
210 inode->i_gid = dir->i_gid;
211 if (S_ISDIR(mode))
212 mode |= S_ISGID;
213 } else
214 inode->i_gid = current_fsgid();
215 inode->i_mode = mode;
216 vfs_dq_init(inode);
217 return inode;
218}
219
189static int ocfs2_mknod(struct inode *dir, 220static int ocfs2_mknod(struct inode *dir,
190 struct dentry *dentry, 221 struct dentry *dentry,
191 int mode, 222 int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
201 struct inode *inode = NULL; 232 struct inode *inode = NULL;
202 struct ocfs2_alloc_context *inode_ac = NULL; 233 struct ocfs2_alloc_context *inode_ac = NULL;
203 struct ocfs2_alloc_context *data_ac = NULL; 234 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL;
236 int want_clusters = 0;
237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = {
239 .enable = 1,
240 };
241 int did_quota_inode = 0;
204 242
205 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
206 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
250 goto leave; 288 goto leave;
251 } 289 }
252 290
253 /* Reserve a cluster if creating an extent based directory. */ 291 inode = ocfs2_get_init_inode(dir, mode);
254 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { 292 if (!inode) {
255 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 293 status = -ENOMEM;
256 if (status < 0) { 294 mlog_errno(status);
257 if (status != -ENOSPC) 295 goto leave;
258 mlog_errno(status); 296 }
297
298 /* get security xattr */
299 status = ocfs2_init_security_get(inode, dir, &si);
300 if (status) {
301 if (status == -EOPNOTSUPP)
302 si.enable = 0;
303 else {
304 mlog_errno(status);
259 goto leave; 305 goto leave;
260 } 306 }
261 } 307 }
262 308
263 handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); 309 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters,
312 &xattr_credits, &xattr_ac);
313 if (status < 0) {
314 mlog_errno(status);
315 goto leave;
316 }
317
318 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
320 want_clusters += 1;
321
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) {
324 if (status != -ENOSPC)
325 mlog_errno(status);
326 goto leave;
327 }
328
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
330 xattr_credits);
264 if (IS_ERR(handle)) { 331 if (IS_ERR(handle)) {
265 status = PTR_ERR(handle); 332 status = PTR_ERR(handle);
266 handle = NULL; 333 handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
268 goto leave; 335 goto leave;
269 } 336 }
270 337
338 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
339 * to be called. */
340 if (sb_any_quota_active(osb->sb) &&
341 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
342 status = -EDQUOT;
343 goto leave;
344 }
345 did_quota_inode = 1;
346
271 /* do the real work now. */ 347 /* do the real work now. */
272 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, 348 status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
273 &new_fe_bh, parent_fe_bh, handle, 349 &new_fe_bh, parent_fe_bh, handle,
274 &inode, inode_ac); 350 inode_ac);
275 if (status < 0) { 351 if (status < 0) {
276 mlog_errno(status); 352 mlog_errno(status);
277 goto leave; 353 goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
285 goto leave; 361 goto leave;
286 } 362 }
287 363
288 status = ocfs2_journal_access(handle, dir, parent_fe_bh, 364 status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
289 OCFS2_JOURNAL_ACCESS_WRITE); 365 OCFS2_JOURNAL_ACCESS_WRITE);
290 if (status < 0) { 366 if (status < 0) {
291 mlog_errno(status); 367 mlog_errno(status);
292 goto leave; 368 goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
300 inc_nlink(dir); 376 inc_nlink(dir);
301 } 377 }
302 378
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac);
381 if (status < 0) {
382 mlog_errno(status);
383 goto leave;
384 }
385
386 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac);
389 if (status < 0) {
390 mlog_errno(status);
391 goto leave;
392 }
393 }
394
303 status = ocfs2_add_entry(handle, dentry, inode, 395 status = ocfs2_add_entry(handle, dentry, inode,
304 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 396 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
305 de_bh); 397 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
320 d_instantiate(dentry, inode); 412 d_instantiate(dentry, inode);
321 status = 0; 413 status = 0;
322leave: 414leave:
415 if (status < 0 && did_quota_inode)
416 vfs_dq_free_inode(inode);
323 if (handle) 417 if (handle)
324 ocfs2_commit_trans(osb, handle); 418 ocfs2_commit_trans(osb, handle);
325 419
@@ -331,9 +425,13 @@ leave:
331 brelse(new_fe_bh); 425 brelse(new_fe_bh);
332 brelse(de_bh); 426 brelse(de_bh);
333 brelse(parent_fe_bh); 427 brelse(parent_fe_bh);
428 kfree(si.name);
429 kfree(si.value);
334 430
335 if ((status < 0) && inode) 431 if ((status < 0) && inode) {
432 clear_nlink(inode);
336 iput(inode); 433 iput(inode);
434 }
337 435
338 if (inode_ac) 436 if (inode_ac)
339 ocfs2_free_alloc_context(inode_ac); 437 ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
341 if (data_ac) 439 if (data_ac)
342 ocfs2_free_alloc_context(data_ac); 440 ocfs2_free_alloc_context(data_ac);
343 441
442 if (xattr_ac)
443 ocfs2_free_alloc_context(xattr_ac);
444
344 mlog_exit(status); 445 mlog_exit(status);
345 446
346 return status; 447 return status;
@@ -348,12 +449,12 @@ leave:
348 449
349static int ocfs2_mknod_locked(struct ocfs2_super *osb, 450static int ocfs2_mknod_locked(struct ocfs2_super *osb,
350 struct inode *dir, 451 struct inode *dir,
351 struct dentry *dentry, int mode, 452 struct inode *inode,
453 struct dentry *dentry,
352 dev_t dev, 454 dev_t dev,
353 struct buffer_head **new_fe_bh, 455 struct buffer_head **new_fe_bh,
354 struct buffer_head *parent_fe_bh, 456 struct buffer_head *parent_fe_bh,
355 handle_t *handle, 457 handle_t *handle,
356 struct inode **ret_inode,
357 struct ocfs2_alloc_context *inode_ac) 458 struct ocfs2_alloc_context *inode_ac)
358{ 459{
359 int status = 0; 460 int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
361 struct ocfs2_extent_list *fel; 462 struct ocfs2_extent_list *fel;
362 u64 fe_blkno = 0; 463 u64 fe_blkno = 0;
363 u16 suballoc_bit; 464 u16 suballoc_bit;
364 struct inode *inode = NULL;
365 465
366 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
367 (unsigned long)dev, dentry->d_name.len, 467 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
368 dentry->d_name.name); 468 dentry->d_name.name);
369 469
370 *new_fe_bh = NULL; 470 *new_fe_bh = NULL;
371 *ret_inode = NULL;
372 471
373 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
374 &fe_blkno); 473 &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
377 goto leave; 476 goto leave;
378 } 477 }
379 478
380 inode = new_inode(dir->i_sb);
381 if (!inode) {
382 status = -ENOMEM;
383 mlog(ML_ERROR, "new_inode failed!\n");
384 goto leave;
385 }
386
387 /* populate as many fields early on as possible - many of 479 /* populate as many fields early on as possible - many of
388 * these are used by the support functions here and in 480 * these are used by the support functions here and in
389 * callers. */ 481 * callers. */
390 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); 482 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
391 OCFS2_I(inode)->ip_blkno = fe_blkno; 483 OCFS2_I(inode)->ip_blkno = fe_blkno;
392 if (S_ISDIR(mode))
393 inode->i_nlink = 2;
394 else
395 inode->i_nlink = 1;
396 inode->i_mode = mode;
397 spin_lock(&osb->osb_lock); 484 spin_lock(&osb->osb_lock);
398 inode->i_generation = osb->s_next_generation++; 485 inode->i_generation = osb->s_next_generation++;
399 spin_unlock(&osb->osb_lock); 486 spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
406 } 493 }
407 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 494 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
408 495
409 status = ocfs2_journal_access(handle, inode, *new_fe_bh, 496 status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
410 OCFS2_JOURNAL_ACCESS_CREATE); 497 OCFS2_JOURNAL_ACCESS_CREATE);
411 if (status < 0) { 498 if (status < 0) {
412 mlog_errno(status); 499 mlog_errno(status);
413 goto leave; 500 goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
421 fe->i_blkno = cpu_to_le64(fe_blkno); 508 fe->i_blkno = cpu_to_le64(fe_blkno);
422 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 509 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
423 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 510 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
424 fe->i_uid = cpu_to_le32(current_fsuid()); 511 fe->i_uid = cpu_to_le32(inode->i_uid);
425 if (dir->i_mode & S_ISGID) { 512 fe->i_gid = cpu_to_le32(inode->i_gid);
426 fe->i_gid = cpu_to_le32(dir->i_gid); 513 fe->i_mode = cpu_to_le16(inode->i_mode);
427 if (S_ISDIR(mode)) 514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
428 mode |= S_ISGID;
429 } else
430 fe->i_gid = cpu_to_le32(current_fsgid());
431 fe->i_mode = cpu_to_le16(mode);
432 if (S_ISCHR(mode) || S_ISBLK(mode))
433 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
434
435 fe->i_links_count = cpu_to_le16(inode->i_nlink); 516 fe->i_links_count = cpu_to_le16(inode->i_nlink);
436 517
437 fe->i_last_eb_blk = 0; 518 fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
446 /* 527 /*
447 * If supported, directories start with inline data. 528 * If supported, directories start with inline data.
448 */ 529 */
449 if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) { 530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
450 u16 feat = le16_to_cpu(fe->i_dyn_features); 531 u16 feat = le16_to_cpu(fe->i_dyn_features);
451 532
452 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
465 goto leave; 546 goto leave;
466 } 547 }
467 548
468 if (ocfs2_populate_inode(inode, fe, 1) < 0) { 549 ocfs2_populate_inode(inode, fe, 1);
469 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
470 "i_blkno=%llu, i_ino=%lu\n",
471 (unsigned long long)(*new_fe_bh)->b_blocknr,
472 (unsigned long long)le64_to_cpu(fe->i_blkno),
473 inode->i_ino);
474 BUG();
475 }
476
477 ocfs2_inode_set_new(osb, inode); 550 ocfs2_inode_set_new(osb, inode);
478 if (!ocfs2_mount_local(osb)) { 551 if (!ocfs2_mount_local(osb)) {
479 status = ocfs2_create_new_inode_locks(inode); 552 status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
484 status = 0; /* error in ocfs2_create_new_inode_locks is not 557 status = 0; /* error in ocfs2_create_new_inode_locks is not
485 * critical */ 558 * critical */
486 559
487 *ret_inode = inode;
488leave: 560leave:
489 if (status < 0) { 561 if (status < 0) {
490 if (*new_fe_bh) { 562 if (*new_fe_bh) {
491 brelse(*new_fe_bh); 563 brelse(*new_fe_bh);
492 *new_fe_bh = NULL; 564 *new_fe_bh = NULL;
493 } 565 }
494 if (inode) {
495 clear_nlink(inode);
496 iput(inode);
497 }
498 } 566 }
499 567
500 mlog_exit(status); 568 mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
588 goto out_unlock_inode; 656 goto out_unlock_inode;
589 } 657 }
590 658
591 handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); 659 handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
592 if (IS_ERR(handle)) { 660 if (IS_ERR(handle)) {
593 err = PTR_ERR(handle); 661 err = PTR_ERR(handle);
594 handle = NULL; 662 handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
596 goto out_unlock_inode; 664 goto out_unlock_inode;
597 } 665 }
598 666
599 err = ocfs2_journal_access(handle, inode, fe_bh, 667 err = ocfs2_journal_access_di(handle, inode, fe_bh,
600 OCFS2_JOURNAL_ACCESS_WRITE); 668 OCFS2_JOURNAL_ACCESS_WRITE);
601 if (err < 0) { 669 if (err < 0) {
602 mlog_errno(err); 670 mlog_errno(err);
603 goto out_commit; 671 goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
775 } 843 }
776 } 844 }
777 845
778 handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); 846 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
779 if (IS_ERR(handle)) { 847 if (IS_ERR(handle)) {
780 status = PTR_ERR(handle); 848 status = PTR_ERR(handle);
781 handle = NULL; 849 handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
783 goto leave; 851 goto leave;
784 } 852 }
785 853
786 status = ocfs2_journal_access(handle, inode, fe_bh, 854 status = ocfs2_journal_access_di(handle, inode, fe_bh,
787 OCFS2_JOURNAL_ACCESS_WRITE); 855 OCFS2_JOURNAL_ACCESS_WRITE);
788 if (status < 0) { 856 if (status < 0) {
789 mlog_errno(status); 857 mlog_errno(status);
790 goto leave; 858 goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
1181 } 1249 }
1182 } 1250 }
1183 1251
1184 handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); 1252 handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
1185 if (IS_ERR(handle)) { 1253 if (IS_ERR(handle)) {
1186 status = PTR_ERR(handle); 1254 status = PTR_ERR(handle);
1187 handle = NULL; 1255 handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
1197 goto bail; 1265 goto bail;
1198 } 1266 }
1199 } 1267 }
1200 status = ocfs2_journal_access(handle, new_inode, newfe_bh, 1268 status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
1201 OCFS2_JOURNAL_ACCESS_WRITE); 1269 OCFS2_JOURNAL_ACCESS_WRITE);
1202 if (status < 0) { 1270 if (status < 0) {
1203 mlog_errno(status); 1271 mlog_errno(status);
1204 goto bail; 1272 goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
1244 old_inode->i_ctime = CURRENT_TIME; 1312 old_inode->i_ctime = CURRENT_TIME;
1245 mark_inode_dirty(old_inode); 1313 mark_inode_dirty(old_inode);
1246 1314
1247 status = ocfs2_journal_access(handle, old_inode, old_inode_bh, 1315 status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
1248 OCFS2_JOURNAL_ACCESS_WRITE); 1316 OCFS2_JOURNAL_ACCESS_WRITE);
1249 if (status >= 0) { 1317 if (status >= 0) {
1250 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; 1318 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
1251 1319
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
1321 (int)old_dir_nlink, old_dir->i_nlink); 1389 (int)old_dir_nlink, old_dir->i_nlink);
1322 } else { 1390 } else {
1323 struct ocfs2_dinode *fe; 1391 struct ocfs2_dinode *fe;
1324 status = ocfs2_journal_access(handle, old_dir, 1392 status = ocfs2_journal_access_di(handle, old_dir,
1325 old_dir_bh, 1393 old_dir_bh,
1326 OCFS2_JOURNAL_ACCESS_WRITE); 1394 OCFS2_JOURNAL_ACCESS_WRITE);
1327 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1395 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1328 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1396 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1329 status = ocfs2_journal_dirty(handle, old_dir_bh); 1397 status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
1496 handle_t *handle = NULL; 1564 handle_t *handle = NULL;
1497 struct ocfs2_alloc_context *inode_ac = NULL; 1565 struct ocfs2_alloc_context *inode_ac = NULL;
1498 struct ocfs2_alloc_context *data_ac = NULL; 1566 struct ocfs2_alloc_context *data_ac = NULL;
1567 struct ocfs2_alloc_context *xattr_ac = NULL;
1568 int want_clusters = 0;
1569 int xattr_credits = 0;
1570 struct ocfs2_security_xattr_info si = {
1571 .enable = 1,
1572 };
1573 int did_quota = 0, did_quota_inode = 0;
1499 1574
1500 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1575 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1501 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1576 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
1542 goto bail; 1617 goto bail;
1543 } 1618 }
1544 1619
1545 /* don't reserve bitmap space for fast symlinks. */ 1620 inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
1546 if (l > ocfs2_fast_symlink_chars(sb)) { 1621 if (!inode) {
1547 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 1622 status = -ENOMEM;
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 /* get security xattr */
1628 status = ocfs2_init_security_get(inode, dir, &si);
1629 if (status) {
1630 if (status == -EOPNOTSUPP)
1631 si.enable = 0;
1632 else {
1633 mlog_errno(status);
1634 goto bail;
1635 }
1636 }
1637
1638 /* calculate meta data/clusters for setting security xattr */
1639 if (si.enable) {
1640 status = ocfs2_calc_security_init(dir, &si, &want_clusters,
1641 &xattr_credits, &xattr_ac);
1548 if (status < 0) { 1642 if (status < 0) {
1549 if (status != -ENOSPC) 1643 mlog_errno(status);
1550 mlog_errno(status);
1551 goto bail; 1644 goto bail;
1552 } 1645 }
1553 } 1646 }
1554 1647
1555 handle = ocfs2_start_trans(osb, credits); 1648 /* don't reserve bitmap space for fast symlinks. */
1649 if (l > ocfs2_fast_symlink_chars(sb))
1650 want_clusters += 1;
1651
1652 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
1653 if (status < 0) {
1654 if (status != -ENOSPC)
1655 mlog_errno(status);
1656 goto bail;
1657 }
1658
1659 handle = ocfs2_start_trans(osb, credits + xattr_credits);
1556 if (IS_ERR(handle)) { 1660 if (IS_ERR(handle)) {
1557 status = PTR_ERR(handle); 1661 status = PTR_ERR(handle);
1558 handle = NULL; 1662 handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
1560 goto bail; 1664 goto bail;
1561 } 1665 }
1562 1666
1563 status = ocfs2_mknod_locked(osb, dir, dentry, 1667 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
1564 S_IFLNK | S_IRWXUGO, 0, 1668 * to be called. */
1565 &new_fe_bh, parent_fe_bh, handle, 1669 if (sb_any_quota_active(osb->sb) &&
1566 &inode, inode_ac); 1670 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1671 status = -EDQUOT;
1672 goto bail;
1673 }
1674 did_quota_inode = 1;
1675
1676 status = ocfs2_mknod_locked(osb, dir, inode, dentry,
1677 0, &new_fe_bh, parent_fe_bh, handle,
1678 inode_ac);
1567 if (status < 0) { 1679 if (status < 0) {
1568 mlog_errno(status); 1680 mlog_errno(status);
1569 goto bail; 1681 goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
1576 u32 offset = 0; 1688 u32 offset = 0;
1577 1689
1578 inode->i_op = &ocfs2_symlink_inode_operations; 1690 inode->i_op = &ocfs2_symlink_inode_operations;
1691 if (vfs_dq_alloc_space_nodirty(inode,
1692 ocfs2_clusters_to_bytes(osb->sb, 1))) {
1693 status = -EDQUOT;
1694 goto bail;
1695 }
1696 did_quota = 1;
1579 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1697 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1580 new_fe_bh, 1698 new_fe_bh,
1581 handle, data_ac, NULL, 1699 handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
1614 } 1732 }
1615 } 1733 }
1616 1734
1735 if (si.enable) {
1736 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
1737 xattr_ac, data_ac);
1738 if (status < 0) {
1739 mlog_errno(status);
1740 goto bail;
1741 }
1742 }
1743
1617 status = ocfs2_add_entry(handle, dentry, inode, 1744 status = ocfs2_add_entry(handle, dentry, inode,
1618 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1745 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1619 de_bh); 1746 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
1632 dentry->d_op = &ocfs2_dentry_ops; 1759 dentry->d_op = &ocfs2_dentry_ops;
1633 d_instantiate(dentry, inode); 1760 d_instantiate(dentry, inode);
1634bail: 1761bail:
1762 if (status < 0 && did_quota)
1763 vfs_dq_free_space_nodirty(inode,
1764 ocfs2_clusters_to_bytes(osb->sb, 1));
1765 if (status < 0 && did_quota_inode)
1766 vfs_dq_free_inode(inode);
1635 if (handle) 1767 if (handle)
1636 ocfs2_commit_trans(osb, handle); 1768 ocfs2_commit_trans(osb, handle);
1637 1769
@@ -1640,12 +1772,18 @@ bail:
1640 brelse(new_fe_bh); 1772 brelse(new_fe_bh);
1641 brelse(parent_fe_bh); 1773 brelse(parent_fe_bh);
1642 brelse(de_bh); 1774 brelse(de_bh);
1775 kfree(si.name);
1776 kfree(si.value);
1643 if (inode_ac) 1777 if (inode_ac)
1644 ocfs2_free_alloc_context(inode_ac); 1778 ocfs2_free_alloc_context(inode_ac);
1645 if (data_ac) 1779 if (data_ac)
1646 ocfs2_free_alloc_context(data_ac); 1780 ocfs2_free_alloc_context(data_ac);
1647 if ((status < 0) && inode) 1781 if (xattr_ac)
1782 ocfs2_free_alloc_context(xattr_ac);
1783 if ((status < 0) && inode) {
1784 clear_nlink(inode);
1648 iput(inode); 1785 iput(inode);
1786 }
1649 1787
1650 mlog_exit(status); 1788 mlog_exit(status);
1651 1789
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1754 1892
1755 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1893 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1756 1894
1757 status = ocfs2_read_block(orphan_dir_inode, 1895 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
1758 OCFS2_I(orphan_dir_inode)->ip_blkno,
1759 &orphan_dir_bh);
1760 if (status < 0) { 1896 if (status < 0) {
1761 mlog_errno(status); 1897 mlog_errno(status);
1762 goto leave; 1898 goto leave;
1763 } 1899 }
1764 1900
1765 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, 1901 status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
1766 OCFS2_JOURNAL_ACCESS_WRITE); 1902 OCFS2_JOURNAL_ACCESS_WRITE);
1767 if (status < 0) { 1903 if (status < 0) {
1768 mlog_errno(status); 1904 mlog_errno(status);
1769 goto leave; 1905 goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1850 goto leave; 1986 goto leave;
1851 } 1987 }
1852 1988
1853 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, 1989 status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
1854 OCFS2_JOURNAL_ACCESS_WRITE); 1990 OCFS2_JOURNAL_ACCESS_WRITE);
1855 if (status < 0) { 1991 if (status < 0) {
1856 mlog_errno(status); 1992 mlog_errno(status);
1857 goto leave; 1993 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..ad5c24a29edd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
161{ 161{
162 VOLUME_INIT = 0, 162 VOLUME_INIT = 0,
163 VOLUME_MOUNTED, 163 VOLUME_MOUNTED,
164 VOLUME_MOUNTED_QUOTAS,
164 VOLUME_DISMOUNTED, 165 VOLUME_DISMOUNTED,
165 VOLUME_DISABLED 166 VOLUME_DISABLED
166}; 167};
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
195 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 196 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
196 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ 197 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
197 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ 198 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
199 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
200 OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
201 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
198}; 202};
199 203
200#define OCFS2_OSB_SOFT_RO 0x0001 204#define OCFS2_OSB_SOFT_RO 0x0001
@@ -205,6 +209,7 @@ enum ocfs2_mount_options
205struct ocfs2_journal; 209struct ocfs2_journal;
206struct ocfs2_slot_info; 210struct ocfs2_slot_info;
207struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_quota_recovery;
208struct ocfs2_super 213struct ocfs2_super
209{ 214{
210 struct task_struct *commit_task; 215 struct task_struct *commit_task;
@@ -286,10 +291,11 @@ struct ocfs2_super
286 char *local_alloc_debug_buf; 291 char *local_alloc_debug_buf;
287#endif 292#endif
288 293
289 /* Next two fields are for local node slot recovery during 294 /* Next three fields are for local node slot recovery during
290 * mount. */ 295 * mount. */
291 int dirty; 296 int dirty;
292 struct ocfs2_dinode *local_alloc_copy; 297 struct ocfs2_dinode *local_alloc_copy;
298 struct ocfs2_quota_recovery *quota_rec;
293 299
294 struct ocfs2_alloc_stats alloc_stats; 300 struct ocfs2_alloc_stats alloc_stats;
295 char dev_str[20]; /* "major,minor" of the device */ 301 char dev_str[20]; /* "major,minor" of the device */
@@ -333,6 +339,10 @@ struct ocfs2_super
333 339
334#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 340#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
335 341
342/* Useful typedef for passing around journal access functions */
343typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
344 struct buffer_head *bh, int type);
345
336static inline int ocfs2_should_order_data(struct inode *inode) 346static inline int ocfs2_should_order_data(struct inode *inode)
337{ 347{
338 if (!S_ISREG(inode->i_mode)) 348 if (!S_ISREG(inode->i_mode))
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
376 return 0; 386 return 0;
377} 387}
378 388
389static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
390{
391 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
392 return 1;
393 return 0;
394}
395
379/* set / clear functions because cluster events can make these happen 396/* set / clear functions because cluster events can make these happen
380 * in parallel so we want the transitions to be atomic. this also 397 * in parallel so we want the transitions to be atomic. this also
381 * means that any future flags osb_flags must be protected by spinlock 398 * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
443#define OCFS2_IS_VALID_DINODE(ptr) \ 460#define OCFS2_IS_VALID_DINODE(ptr) \
444 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) 461 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
445 462
446#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
447 typeof(__di) ____di = (__di); \
448 ocfs2_error((__sb), \
449 "Dinode # %llu has bad signature %.*s", \
450 (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
451 (____di)->i_signature); \
452} while (0)
453
454#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ 463#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
455 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) 464 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
456 465
457#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
458 typeof(__eb) ____eb = (__eb); \
459 ocfs2_error((__sb), \
460 "Extent Block # %llu has bad signature %.*s", \
461 (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
462 (____eb)->h_signature); \
463} while (0)
464
465#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ 466#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
466 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) 467 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
467 468
468#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
469 typeof(__gd) ____gd = (__gd); \
470 ocfs2_error((__sb), \
471 "Group Descriptor # %llu has bad signature %.*s", \
472 (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
473 (____gd)->bg_signature); \
474} while (0)
475 469
476#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ 470#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
477 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) 471 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
478 472
473#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
474 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
475
479static inline unsigned long ino_from_blkno(struct super_block *sb, 476static inline unsigned long ino_from_blkno(struct super_block *sb,
480 u64 blkno) 477 u64 blkno)
481{ 478{
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
632#define ocfs2_clear_bit ext2_clear_bit 629#define ocfs2_clear_bit ext2_clear_bit
633#define ocfs2_test_bit ext2_test_bit 630#define ocfs2_test_bit ext2_test_bit
634#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 631#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
632#define ocfs2_find_next_bit ext2_find_next_bit
635#endif /* OCFS2_H */ 633#endif /* OCFS2_H */
636 634
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" 65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
68 69
69/* Compatibility flags */ 70/* Compatibility flags */
70#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -93,8 +94,11 @@
93 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 94 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
94 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
95 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
96 | OCFS2_FEATURE_INCOMPAT_XATTR) 97 | OCFS2_FEATURE_INCOMPAT_XATTR \
97#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 98 | OCFS2_FEATURE_INCOMPAT_META_ECC)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
98 102
99/* 103/*
100 * Heartbeat-only devices are missing journals and other files. The 104 * Heartbeat-only devices are missing journals and other files. The
@@ -147,6 +151,9 @@
147/* Support for extended attributes */ 151/* Support for extended attributes */
148#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
149 153
154/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156
150/* 157/*
151 * backup superblock flag is used to indicate that this volume 158 * backup superblock flag is used to indicate that this volume
152 * has backup superblocks. 159 * has backup superblocks.
@@ -163,6 +170,12 @@
163 */ 170 */
164#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 171#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
165 172
173/*
174 * Maintain quota information for this filesystem
175 */
176#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
177#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
178
166/* The byte offset of the first backup block will be 1G. 179/* The byte offset of the first backup block will be 1G.
167 * The following will be 4G, 16G, 64G, 256G and 1T. 180 * The following will be 4G, 16G, 64G, 256G and 1T.
168 */ 181 */
@@ -192,6 +205,7 @@
192#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ 205#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
193#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 206#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
194#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 207#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
208#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
195 209
196/* 210/*
197 * Flags on ocfs2_dinode.i_dyn_features 211 * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
329#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE 343#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
330 HEARTBEAT_SYSTEM_INODE, 344 HEARTBEAT_SYSTEM_INODE,
331 GLOBAL_BITMAP_SYSTEM_INODE, 345 GLOBAL_BITMAP_SYSTEM_INODE,
332#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE 346 USER_QUOTA_SYSTEM_INODE,
347 GROUP_QUOTA_SYSTEM_INODE,
348#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
333 ORPHAN_DIR_SYSTEM_INODE, 349 ORPHAN_DIR_SYSTEM_INODE,
334 EXTENT_ALLOC_SYSTEM_INODE, 350 EXTENT_ALLOC_SYSTEM_INODE,
335 INODE_ALLOC_SYSTEM_INODE, 351 INODE_ALLOC_SYSTEM_INODE,
336 JOURNAL_SYSTEM_INODE, 352 JOURNAL_SYSTEM_INODE,
337 LOCAL_ALLOC_SYSTEM_INODE, 353 LOCAL_ALLOC_SYSTEM_INODE,
338 TRUNCATE_LOG_SYSTEM_INODE, 354 TRUNCATE_LOG_SYSTEM_INODE,
355 LOCAL_USER_QUOTA_SYSTEM_INODE,
356 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
339 NUM_SYSTEM_INODES 357 NUM_SYSTEM_INODES
340}; 358};
341 359
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
349 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, 367 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
350 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, 368 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
351 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, 369 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
370 [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
371 [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
352 372
353 /* Slot-specific system inodes (one copy per slot) */ 373 /* Slot-specific system inodes (one copy per slot) */
354 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, 374 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
356 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, 376 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
357 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, 377 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
358 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, 378 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
359 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } 379 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
380 [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
381 [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
360}; 382};
361 383
362/* Parameter passed from mount.ocfs2 to module */ 384/* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
410#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) 432#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
411 433
412/* 434/*
435 * Block checking structure. This is used in metadata to validate the
436 * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
437 * zeros.
438 */
439struct ocfs2_block_check {
440/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
441 __le16 bc_ecc; /* Single-error-correction parity vector.
442 This is a simple Hamming code dependant
443 on the blocksize. OCFS2's maximum
444 blocksize, 4K, requires 16 parity bits,
445 so we fit in __le16. */
446 __le16 bc_reserved1;
447/*08*/
448};
449
450/*
413 * On disk extent record for OCFS2 451 * On disk extent record for OCFS2
414 * It describes a range of clusters on disk. 452 * It describes a range of clusters on disk.
415 * 453 *
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
496struct ocfs2_extent_block 534struct ocfs2_extent_block
497{ 535{
498/*00*/ __u8 h_signature[8]; /* Signature for verification */ 536/*00*/ __u8 h_signature[8]; /* Signature for verification */
499 __le64 h_reserved1; 537 struct ocfs2_block_check h_check; /* Error checking */
500/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this 538/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
501 extent_header belongs to */ 539 extent_header belongs to */
502 __le16 h_suballoc_bit; /* Bit offset in suballocator 540 __le16 h_suballoc_bit; /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
666 was set in i_flags */ 704 was set in i_flags */
667 __le16 i_dyn_features; 705 __le16 i_dyn_features;
668 __le64 i_xattr_loc; 706 __le64 i_xattr_loc;
669/*80*/ __le64 i_reserved2[7]; 707/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6];
670/*B8*/ union { 709/*B8*/ union {
671 __le64 i_pad1; /* Generic way to refer to this 710 __le64 i_pad1; /* Generic way to refer to this
672 64bit union */ 711 64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
715} __attribute__ ((packed)); 754} __attribute__ ((packed));
716 755
717/* 756/*
757 * Per-block record for the unindexed directory btree. This is carefully
758 * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
759 * mirrored. That way, the directory manipulation code needs a minimal amount
760 * of update.
761 *
762 * NOTE: Keep this structure aligned to a multiple of 4 bytes.
763 */
764struct ocfs2_dir_block_trailer {
765/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */
766
767 __le16 db_compat_rec_len; /* Backwards compatible with
768 * ocfs2_dir_entry. */
769 __u8 db_compat_name_len; /* Always zero. Was name_len */
770 __u8 db_reserved0;
771 __le16 db_reserved1;
772 __le16 db_free_rec_len; /* Size of largest empty hole
773 * in this block. (unused) */
774/*10*/ __u8 db_signature[8]; /* Signature for verification */
775 __le64 db_reserved2;
776 __le64 db_free_next; /* Next block in list (unused) */
777/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
778 __le64 db_parent_dinode; /* dinode which owns me, in
779 blocks */
780/*30*/ struct ocfs2_block_check db_check; /* Error checking */
781/*40*/
782};
783
784/*
718 * On disk allocator group structure for OCFS2 785 * On disk allocator group structure for OCFS2
719 */ 786 */
720struct ocfs2_group_desc 787struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
733/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in 800/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
734 blocks */ 801 blocks */
735 __le64 bg_blkno; /* Offset on disk, in blocks */ 802 __le64 bg_blkno; /* Offset on disk, in blocks */
736/*30*/ __le64 bg_reserved2[2]; 803/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
804 __le64 bg_reserved2;
737/*40*/ __u8 bg_bitmap[0]; 805/*40*/ __u8 bg_bitmap[0];
738}; 806};
739 807
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
776 in this extent record, 844 in this extent record,
777 only valid in the first 845 only valid in the first
778 bucket. */ 846 bucket. */
779 __le64 xh_csum; 847 struct ocfs2_block_check xh_check; /* Error checking
848 (Note, this is only
849 used for xattr
850 buckets. A block uses
851 xb_check and sets
852 this field to zero.) */
780 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ 853 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
781}; 854};
782 855
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
827 block group */ 900 block group */
828 __le32 xb_fs_generation; /* Must match super block */ 901 __le32 xb_fs_generation; /* Must match super block */
829/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ 902/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
830 __le64 xb_csum; 903 struct ocfs2_block_check xb_check; /* Error checking */
831/*20*/ __le16 xb_flags; /* Indicates whether this block contains 904/*20*/ __le16 xb_flags; /* Indicates whether this block contains
832 real xattr or a xattr tree. */ 905 real xattr or a xattr tree. */
833 __le16 xb_reserved0; 906 __le16 xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
868 return xe->xe_type & OCFS2_XATTR_TYPE_MASK; 941 return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
869} 942}
870 943
944/*
945 * On disk structures for global quota file
946 */
947
948/* Magic numbers and known versions for global quota files */
949#define OCFS2_GLOBAL_QMAGICS {\
950 0x0cf52470, /* USRQUOTA */ \
951 0x0cf52471 /* GRPQUOTA */ \
952}
953
954#define OCFS2_GLOBAL_QVERSIONS {\
955 0, \
956 0, \
957}
958
959
960/* Each block of each quota file has a certain fixed number of bytes reserved
961 * for OCFS2 internal use at its end. OCFS2 can use it for things like
962 * checksums, etc. */
963#define OCFS2_QBLK_RESERVED_SPACE 8
964
965/* Generic header of all quota files */
966struct ocfs2_disk_dqheader {
967 __le32 dqh_magic; /* Magic number identifying file */
968 __le32 dqh_version; /* Quota format version */
969};
970
971#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
972
973/* Information header of global quota file (immediately follows the generic
974 * header) */
975struct ocfs2_global_disk_dqinfo {
976/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */
977 __le32 dqi_igrace; /* Grace time for inode softlimit excess */
978 __le32 dqi_syncms; /* Time after which we sync local changes to
979 * global quota file */
980 __le32 dqi_blocks; /* Number of blocks in quota file */
981/*10*/ __le32 dqi_free_blk; /* First free block in quota file */
982 __le32 dqi_free_entry; /* First block with free dquot entry in quota
983 * file */
984};
985
986/* Structure with global user / group information. We reserve some space
987 * for future use. */
988struct ocfs2_global_disk_dqblk {
989/*00*/ __le32 dqb_id; /* ID the structure belongs to */
990 __le32 dqb_use_count; /* Number of nodes having reference to this structure */
991 __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
992/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */
993 __le64 dqb_curinodes; /* current # allocated inodes */
994/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */
995 __le64 dqb_bsoftlimit; /* preferred limit on disk space */
996/*30*/ __le64 dqb_curspace; /* current space occupied */
997 __le64 dqb_btime; /* time limit for excessive disk use */
998/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */
999 __le64 dqb_pad1;
1000/*50*/ __le64 dqb_pad2;
1001};
1002
1003/*
1004 * On-disk structures for local quota file
1005 */
1006
1007/* Magic numbers and known versions for local quota files */
1008#define OCFS2_LOCAL_QMAGICS {\
1009 0x0cf524c0, /* USRQUOTA */ \
1010 0x0cf524c1 /* GRPQUOTA */ \
1011}
1012
1013#define OCFS2_LOCAL_QVERSIONS {\
1014 0, \
1015 0, \
1016}
1017
1018/* Quota flags in dqinfo header */
1019#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\
1020 * quota has been cleanly turned off) */
1021
1022#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
1023
1024/* Information header of local quota file (immediately follows the generic
1025 * header) */
1026struct ocfs2_local_disk_dqinfo {
1027 __le32 dqi_flags; /* Flags for quota file */
1028 __le32 dqi_chunks; /* Number of chunks of quota structures
1029 * with a bitmap */
1030 __le32 dqi_blocks; /* Number of blocks allocated for quota file */
1031};
1032
1033/* Header of one chunk of a quota file */
1034struct ocfs2_local_disk_chunk {
1035 __le32 dqc_free; /* Number of free entries in the bitmap */
1036 u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
1037 * chunk of quota file */
1038};
1039
1040/* One entry in local quota file */
1041struct ocfs2_local_disk_dqblk {
1042/*00*/ __le64 dqb_id; /* id this quota applies to */
1043 __le64 dqb_spacemod; /* Change in the amount of used space */
1044/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */
1045};
1046
1047
1048/*
1049 * The quota trailer lives at the end of each quota block.
1050 */
1051
1052struct ocfs2_disk_dqtrailer {
1053/*00*/ struct ocfs2_block_check dq_check; /* Error checking */
1054/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
1055};
1056
1057static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
1058 void *buf)
1059{
1060 char *ptr = buf;
1061 ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
1062
1063 return (struct ocfs2_disk_dqtrailer *)ptr;
1064}
1065
871#ifdef __KERNEL__ 1066#ifdef __KERNEL__
872static inline int ocfs2_fast_symlink_chars(struct super_block *sb) 1067static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
873{ 1068{
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_jbd_compat.h
5 *
6 * Compatibility defines for JBD.
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_JBD_COMPAT_H
21#define OCFS2_JBD_COMPAT_H
22
23#ifndef CONFIG_OCFS2_COMPAT_JBD
24# error Should not have been included
25#endif
26
27struct jbd2_inode {
28 unsigned int dummy;
29};
30
31#define JBD2_BARRIER JFS_BARRIER
32#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
33
34#define jbd2_journal_ack_err journal_ack_err
35#define jbd2_journal_clear_err journal_clear_err
36#define jbd2_journal_destroy journal_destroy
37#define jbd2_journal_dirty_metadata journal_dirty_metadata
38#define jbd2_journal_errno journal_errno
39#define jbd2_journal_extend journal_extend
40#define jbd2_journal_flush journal_flush
41#define jbd2_journal_force_commit journal_force_commit
42#define jbd2_journal_get_write_access journal_get_write_access
43#define jbd2_journal_get_undo_access journal_get_undo_access
44#define jbd2_journal_init_inode journal_init_inode
45#define jbd2_journal_invalidatepage journal_invalidatepage
46#define jbd2_journal_load journal_load
47#define jbd2_journal_lock_updates journal_lock_updates
48#define jbd2_journal_restart journal_restart
49#define jbd2_journal_start journal_start
50#define jbd2_journal_start_commit journal_start_commit
51#define jbd2_journal_stop journal_stop
52#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
53#define jbd2_journal_unlock_updates journal_unlock_updates
54#define jbd2_journal_wipe journal_wipe
55#define jbd2_log_wait_commit log_wait_commit
56
57static inline int jbd2_journal_file_inode(handle_t *handle,
58 struct jbd2_inode *inode)
59{
60 return 0;
61}
62
63static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
64 loff_t new_size)
65{
66 return 0;
67}
68
69static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
70 struct inode *inode)
71{
72 return;
73}
74
75static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
76 struct jbd2_inode *jinode)
77{
78 return;
79}
80
81
82#endif /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO,
49 OCFS2_NUM_LOCK_TYPES 50 OCFS2_NUM_LOCK_TYPES
50}; 51};
51 52
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
77 case OCFS2_LOCK_TYPE_FLOCK: 78 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F'; 79 c = 'F';
79 break; 80 break;
81 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q';
83 break;
80 default: 84 default:
81 c = '\0'; 85 c = '\0';
82 } 86 }
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 99 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
96 [OCFS2_LOCK_TYPE_OPEN] = "Open", 100 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock", 101 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
102 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
98}; 103};
99 104
100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 105static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
1/*
2 * quota.h for OCFS2
3 *
4 * On disk quota structures for local and global quota file, in-memory
5 * structures.
6 *
7 */
8
9#ifndef _OCFS2_QUOTA_H
10#define _OCFS2_QUOTA_H
11
12#include <linux/types.h>
13#include <linux/slab.h>
14#include <linux/quota.h>
15#include <linux/list.h>
16#include <linux/dqblk_qtree.h>
17
18#include "ocfs2.h"
19
20/* Common stuff */
21/* id number of quota format */
22#define QFMT_OCFS2 3
23
24/*
25 * In-memory structures
26 */
27struct ocfs2_dquot {
28 struct dquot dq_dquot; /* Generic VFS dquot */
29 loff_t dq_local_off; /* Offset in the local quota file */
30 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
31 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
32 s64 dq_origspace; /* Last globally synced space usage */
33 s64 dq_originodes; /* Last globally synced inode usage */
34};
35
36/* Description of one chunk to recover in memory */
37struct ocfs2_recovery_chunk {
38 struct list_head rc_list; /* List of chunks */
39 int rc_chunk; /* Chunk number */
40 unsigned long *rc_bitmap; /* Bitmap of entries to recover */
41};
42
43struct ocfs2_quota_recovery {
44 struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
45};
46
47/* In-memory structure with quota header information */
48struct ocfs2_mem_dqinfo {
49 unsigned int dqi_type; /* Quota type this structure describes */
50 unsigned int dqi_chunks; /* Number of chunks in local quota file */
51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
52 unsigned int dqi_syncms; /* How often should we sync with other nodes */
53 unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
54 struct list_head dqi_chunk; /* List of chunks */
55 struct inode *dqi_gqinode; /* Global quota file inode */
56 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
57 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
58 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
59 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
60 struct buffer_head *dqi_ibh; /* Buffer with information header */
61 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
62 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
63 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
64 * information, in case we
65 * enable quotas on file
66 * needing it */
67};
68
69static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
70{
71 return container_of(dquot, struct ocfs2_dquot, dq_dquot);
72}
73
74struct ocfs2_quota_chunk {
75 struct list_head qc_chunk; /* List of quotafile chunks */
76 int qc_num; /* Number of quota chunk */
77 struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
78};
79
80extern struct kmem_cache *ocfs2_dquot_cachep;
81extern struct kmem_cache *ocfs2_qf_chunk_cachep;
82
83extern struct qtree_fmt_operations ocfs2_global_ops;
84
85struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
86 struct ocfs2_super *osb, int slot_num);
87int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
88 struct ocfs2_quota_recovery *rec,
89 int slot_num);
90void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
91ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
92 size_t len, loff_t off);
93ssize_t ocfs2_quota_write(struct super_block *sb, int type,
94 const char *data, size_t len, loff_t off);
95int ocfs2_global_read_info(struct super_block *sb, int type);
96int ocfs2_global_write_info(struct super_block *sb, int type);
97int ocfs2_global_read_dquot(struct dquot *dquot);
98int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
/* Sync a dquot with the global file, keeping this node's reference. */
static inline int ocfs2_sync_dquot(struct dquot *dquot)
{
	return __ocfs2_sync_dquot(dquot, 0);
}
/* Sync a dquot and drop this node's reference in the global file. */
static inline int ocfs2_global_release_dquot(struct dquot *dquot)
{
	return __ocfs2_sync_dquot(dquot, 1);
}
107
108int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
109void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh);
112
113extern struct dquot_operations ocfs2_quota_operations;
114extern struct quota_format_type ocfs2_quota_format;
115
116int ocfs2_quota_setup(void);
117void ocfs2_quota_shutdown(void);
118
119#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..6aff8f2d3e49
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,1025 @@
1/*
2 * Implementation of operations over global quota file
3 */
4#include <linux/spinlock.h>
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h>
9#include <linux/jiffies.h>
10#include <linux/writeback.h>
11#include <linux/workqueue.h>
12
13#define MLOG_MASK_PREFIX ML_QUOTA
14#include <cluster/masklog.h>
15
16#include "ocfs2_fs.h"
17#include "ocfs2.h"
18#include "alloc.h"
19#include "blockcheck.h"
20#include "inode.h"
21#include "journal.h"
22#include "file.h"
23#include "sysfile.h"
24#include "dlmglue.h"
25#include "uptodate.h"
26#include "quota.h"
27
28static struct workqueue_struct *ocfs2_quota_wq = NULL;
29
30static void qsync_work_fn(struct work_struct *work);
31
32static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
33{
34 struct ocfs2_global_disk_dqblk *d = dp;
35 struct mem_dqblk *m = &dquot->dq_dqb;
36
37 /* Update from disk only entries not set by the admin */
38 if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
39 m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
40 m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
41 }
42 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
43 m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
44 if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
45 m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
46 m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
47 }
48 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
49 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
50 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
51 m->dqb_btime = le64_to_cpu(d->dqb_btime);
52 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
53 m->dqb_itime = le64_to_cpu(d->dqb_itime);
54 OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
55}
56
57static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
58{
59 struct ocfs2_global_disk_dqblk *d = dp;
60 struct mem_dqblk *m = &dquot->dq_dqb;
61
62 d->dqb_id = cpu_to_le32(dquot->dq_id);
63 d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
64 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
65 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
66 d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
67 d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
68 d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
69 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
70 d->dqb_btime = cpu_to_le64(m->dqb_btime);
71 d->dqb_itime = cpu_to_le64(m->dqb_itime);
72}
73
74static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
75{
76 struct ocfs2_global_disk_dqblk *d = dp;
77 struct ocfs2_mem_dqinfo *oinfo =
78 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
79
80 if (qtree_entry_unused(&oinfo->dqi_gi, dp))
81 return 0;
82 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
83}
84
85struct qtree_fmt_operations ocfs2_global_ops = {
86 .mem2disk_dqblk = ocfs2_global_mem2diskdqb,
87 .disk2mem_dqblk = ocfs2_global_disk2memdqb,
88 .is_id = ocfs2_global_is_id,
89};
90
91static int ocfs2_validate_quota_block(struct super_block *sb,
92 struct buffer_head *bh)
93{
94 struct ocfs2_disk_dqtrailer *dqt =
95 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
96
97 mlog(0, "Validating quota block %llu\n",
98 (unsigned long long)bh->b_blocknr);
99
100 BUG_ON(!buffer_uptodate(bh));
101
102 /*
103 * If the ecc fails, we return the error but otherwise
104 * leave the filesystem running. We know any error is
105 * local to this block.
106 */
107 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
108}
109
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh)
112{
113 int rc = 0;
114 struct buffer_head *tmp = *bh;
115
116 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
117 ocfs2_validate_quota_block);
118 if (rc)
119 mlog_errno(rc);
120
121 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
122 if (!rc && !*bh)
123 *bh = tmp;
124
125 return rc;
126}
127
128static int ocfs2_get_quota_block(struct inode *inode, int block,
129 struct buffer_head **bh)
130{
131 u64 pblock, pcount;
132 int err;
133
134 down_read(&OCFS2_I(inode)->ip_alloc_sem);
135 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
136 up_read(&OCFS2_I(inode)->ip_alloc_sem);
137 if (err) {
138 mlog_errno(err);
139 return err;
140 }
141 *bh = sb_getblk(inode->i_sb, pblock);
142 if (!*bh) {
143 err = -EIO;
144 mlog_errno(err);
145 }
146 return err;;
147}
148
149/* Read data from global quotafile - avoid pagecache and such because we cannot
150 * afford acquiring the locks... We use quota cluster lock to serialize
151 * operations. Caller is responsible for acquiring it. */
152ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
153 size_t len, loff_t off)
154{
155 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
156 struct inode *gqinode = oinfo->dqi_gqinode;
157 loff_t i_size = i_size_read(gqinode);
158 int offset = off & (sb->s_blocksize - 1);
159 sector_t blk = off >> sb->s_blocksize_bits;
160 int err = 0;
161 struct buffer_head *bh;
162 size_t toread, tocopy;
163
164 if (off > i_size)
165 return 0;
166 if (off + len > i_size)
167 len = i_size - off;
168 toread = len;
169 while (toread > 0) {
170 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
171 bh = NULL;
172 err = ocfs2_read_quota_block(gqinode, blk, &bh);
173 if (err) {
174 mlog_errno(err);
175 return err;
176 }
177 memcpy(data, bh->b_data + offset, tocopy);
178 brelse(bh);
179 offset = 0;
180 toread -= tocopy;
181 data += tocopy;
182 blk++;
183 }
184 return len;
185}
186
187/* Write to quotafile (we know the transaction is already started and has
188 * enough credits) */
189ssize_t ocfs2_quota_write(struct super_block *sb, int type,
190 const char *data, size_t len, loff_t off)
191{
192 struct mem_dqinfo *info = sb_dqinfo(sb, type);
193 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
194 struct inode *gqinode = oinfo->dqi_gqinode;
195 int offset = off & (sb->s_blocksize - 1);
196 sector_t blk = off >> sb->s_blocksize_bits;
197 int err = 0, new = 0, ja_type;
198 struct buffer_head *bh = NULL;
199 handle_t *handle = journal_current_handle();
200
201 if (!handle) {
202 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
203 "because transaction was not started.\n",
204 (unsigned long long)off, (unsigned long long)len);
205 return -EIO;
206 }
207 if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
208 WARN_ON(1);
209 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
210 }
211
212 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
213 if (gqinode->i_size < off + len) {
214 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
215 err = ocfs2_extend_no_holes(gqinode, off + len, off);
216 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
217 if (err < 0)
218 goto out;
219 err = ocfs2_simple_size_update(gqinode,
220 oinfo->dqi_gqi_bh,
221 off + len);
222 if (err < 0)
223 goto out;
224 new = 1;
225 }
226 /* Not rewriting whole block? */
227 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
228 !new) {
229 err = ocfs2_read_quota_block(gqinode, blk, &bh);
230 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
231 } else {
232 err = ocfs2_get_quota_block(gqinode, blk, &bh);
233 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
234 }
235 if (err) {
236 mlog_errno(err);
237 return err;
238 }
239 lock_buffer(bh);
240 if (new)
241 memset(bh->b_data, 0, sb->s_blocksize);
242 memcpy(bh->b_data + offset, data, len);
243 flush_dcache_page(bh->b_page);
244 set_buffer_uptodate(bh);
245 unlock_buffer(bh);
246 ocfs2_set_buffer_uptodate(gqinode, bh);
247 err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
248 if (err < 0) {
249 brelse(bh);
250 goto out;
251 }
252 err = ocfs2_journal_dirty(handle, bh);
253 brelse(bh);
254 if (err < 0)
255 goto out;
256out:
257 if (err) {
258 mutex_unlock(&gqinode->i_mutex);
259 mlog_errno(err);
260 return err;
261 }
262 gqinode->i_version++;
263 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
264 mutex_unlock(&gqinode->i_mutex);
265 return len;
266}
267
268int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
269{
270 int status;
271 struct buffer_head *bh = NULL;
272
273 status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
274 if (status < 0)
275 return status;
276 spin_lock(&dq_data_lock);
277 if (!oinfo->dqi_gqi_count++)
278 oinfo->dqi_gqi_bh = bh;
279 else
280 WARN_ON(bh != oinfo->dqi_gqi_bh);
281 spin_unlock(&dq_data_lock);
282 return 0;
283}
284
285void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
286{
287 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
288 brelse(oinfo->dqi_gqi_bh);
289 spin_lock(&dq_data_lock);
290 if (!--oinfo->dqi_gqi_count)
291 oinfo->dqi_gqi_bh = NULL;
292 spin_unlock(&dq_data_lock);
293}
294
295/* Read information header from global quota file */
296int ocfs2_global_read_info(struct super_block *sb, int type)
297{
298 struct inode *gqinode = NULL;
299 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
300 GROUP_QUOTA_SYSTEM_INODE };
301 struct ocfs2_global_disk_dqinfo dinfo;
302 struct mem_dqinfo *info = sb_dqinfo(sb, type);
303 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
304 int status;
305
306 mlog_entry_void();
307
308 /* Read global header */
309 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
310 OCFS2_INVALID_SLOT);
311 if (!gqinode) {
312 mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
313 type);
314 status = -EINVAL;
315 goto out_err;
316 }
317 oinfo->dqi_gi.dqi_sb = sb;
318 oinfo->dqi_gi.dqi_type = type;
319 ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
320 oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
321 oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
322 oinfo->dqi_gqi_bh = NULL;
323 oinfo->dqi_gqi_count = 0;
324 oinfo->dqi_gqinode = gqinode;
325 status = ocfs2_lock_global_qf(oinfo, 0);
326 if (status < 0) {
327 mlog_errno(status);
328 goto out_err;
329 }
330 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
331 sizeof(struct ocfs2_global_disk_dqinfo),
332 OCFS2_GLOBAL_INFO_OFF);
333 ocfs2_unlock_global_qf(oinfo, 0);
334 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
335 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
336 status);
337 if (status >= 0)
338 status = -EIO;
339 mlog_errno(status);
340 goto out_err;
341 }
342 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
343 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
344 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
345 oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
346 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
347 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
348 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
349 oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
350 oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
351 OCFS2_QBLK_RESERVED_SPACE;
352 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
353 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
354 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
355 oinfo->dqi_syncjiff);
356
357out_err:
358 mlog_exit(status);
359 return status;
360}
361
362/* Write information to global quota file. Expects exlusive lock on quota
363 * file inode and quota info */
364static int __ocfs2_global_write_info(struct super_block *sb, int type)
365{
366 struct mem_dqinfo *info = sb_dqinfo(sb, type);
367 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
368 struct ocfs2_global_disk_dqinfo dinfo;
369 ssize_t size;
370
371 spin_lock(&dq_data_lock);
372 info->dqi_flags &= ~DQF_INFO_DIRTY;
373 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
374 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
375 spin_unlock(&dq_data_lock);
376 dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
377 dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
378 dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
379 dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
380 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
381 sizeof(struct ocfs2_global_disk_dqinfo),
382 OCFS2_GLOBAL_INFO_OFF);
383 if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
384 mlog(ML_ERROR, "Cannot write global quota info structure\n");
385 if (size >= 0)
386 size = -EIO;
387 return size;
388 }
389 return 0;
390}
391
392int ocfs2_global_write_info(struct super_block *sb, int type)
393{
394 int err;
395 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
396
397 err = ocfs2_qinfo_lock(info, 1);
398 if (err < 0)
399 return err;
400 err = __ocfs2_global_write_info(sb, type);
401 ocfs2_qinfo_unlock(info, 1);
402 return err;
403}
404
405/* Read in information from global quota file and acquire a reference to it.
406 * dquot_acquire() has already started the transaction and locked quota file */
407int ocfs2_global_read_dquot(struct dquot *dquot)
408{
409 int err, err2, ex = 0;
410 struct ocfs2_mem_dqinfo *info =
411 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
412
413 err = ocfs2_qinfo_lock(info, 0);
414 if (err < 0)
415 goto out;
416 err = qtree_read_dquot(&info->dqi_gi, dquot);
417 if (err < 0)
418 goto out_qlock;
419 OCFS2_DQUOT(dquot)->dq_use_count++;
420 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
421 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
422 if (!dquot->dq_off) { /* No real quota entry? */
423 /* Upgrade to exclusive lock for allocation */
424 err = ocfs2_qinfo_lock(info, 1);
425 if (err < 0)
426 goto out_qlock;
427 ex = 1;
428 }
429 err = qtree_write_dquot(&info->dqi_gi, dquot);
430 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
431 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
432 if (!err)
433 err = err2;
434 }
435out_qlock:
436 if (ex)
437 ocfs2_qinfo_unlock(info, 1);
438 ocfs2_qinfo_unlock(info, 0);
439out:
440 if (err < 0)
441 mlog_errno(err);
442 return err;
443}
444
445/* Sync local information about quota modifications with global quota file.
446 * Caller must have started the transaction and obtained exclusive lock for
447 * global quota file inode */
448int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
449{
450 int err, err2;
451 struct super_block *sb = dquot->dq_sb;
452 int type = dquot->dq_type;
453 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
454 struct ocfs2_global_disk_dqblk dqblk;
455 s64 spacechange, inodechange;
456 time_t olditime, oldbtime;
457
458 err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
459 sizeof(struct ocfs2_global_disk_dqblk),
460 dquot->dq_off);
461 if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
462 if (err >= 0) {
463 mlog(ML_ERROR, "Short read from global quota file "
464 "(%u read)\n", err);
465 err = -EIO;
466 }
467 goto out;
468 }
469
470 /* Update space and inode usage. Get also other information from
471 * global quota file so that we don't overwrite any changes there.
472 * We are */
473 spin_lock(&dq_data_lock);
474 spacechange = dquot->dq_dqb.dqb_curspace -
475 OCFS2_DQUOT(dquot)->dq_origspace;
476 inodechange = dquot->dq_dqb.dqb_curinodes -
477 OCFS2_DQUOT(dquot)->dq_originodes;
478 olditime = dquot->dq_dqb.dqb_itime;
479 oldbtime = dquot->dq_dqb.dqb_btime;
480 ocfs2_global_disk2memdqb(dquot, &dqblk);
481 mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
482 dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
483 dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
484 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
485 dquot->dq_dqb.dqb_curspace += spacechange;
486 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
487 dquot->dq_dqb.dqb_curinodes += inodechange;
488 /* Set properly space grace time... */
489 if (dquot->dq_dqb.dqb_bsoftlimit &&
490 dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
491 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
492 oldbtime > 0) {
493 if (dquot->dq_dqb.dqb_btime > 0)
494 dquot->dq_dqb.dqb_btime =
495 min(dquot->dq_dqb.dqb_btime, oldbtime);
496 else
497 dquot->dq_dqb.dqb_btime = oldbtime;
498 }
499 } else {
500 dquot->dq_dqb.dqb_btime = 0;
501 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
502 }
503 /* Set properly inode grace time... */
504 if (dquot->dq_dqb.dqb_isoftlimit &&
505 dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
506 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
507 olditime > 0) {
508 if (dquot->dq_dqb.dqb_itime > 0)
509 dquot->dq_dqb.dqb_itime =
510 min(dquot->dq_dqb.dqb_itime, olditime);
511 else
512 dquot->dq_dqb.dqb_itime = olditime;
513 }
514 } else {
515 dquot->dq_dqb.dqb_itime = 0;
516 clear_bit(DQ_INODES_B, &dquot->dq_flags);
517 }
518 /* All information is properly updated, clear the flags */
519 __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
520 __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
521 __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
522 __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
523 __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
524 __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
525 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
526 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
527 spin_unlock(&dq_data_lock);
528 err = ocfs2_qinfo_lock(info, freeing);
529 if (err < 0) {
530 mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
531 " (type=%d, id=%u)\n", dquot->dq_type,
532 (unsigned)dquot->dq_id);
533 goto out;
534 }
535 if (freeing)
536 OCFS2_DQUOT(dquot)->dq_use_count--;
537 err = qtree_write_dquot(&info->dqi_gi, dquot);
538 if (err < 0)
539 goto out_qlock;
540 if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
541 err = qtree_release_dquot(&info->dqi_gi, dquot);
542 if (info_dirty(sb_dqinfo(sb, type))) {
543 err2 = __ocfs2_global_write_info(sb, type);
544 if (!err)
545 err = err2;
546 }
547 }
548out_qlock:
549 ocfs2_qinfo_unlock(info, freeing);
550out:
551 if (err < 0)
552 mlog_errno(err);
553 return err;
554}
555
556/*
557 * Functions for periodic syncing of dquots with global file
558 */
559static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
560{
561 handle_t *handle;
562 struct super_block *sb = dquot->dq_sb;
563 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
564 struct ocfs2_super *osb = OCFS2_SB(sb);
565 int status = 0;
566
567 mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
568 dquot->dq_type, type, sb->s_id);
569 if (type != dquot->dq_type)
570 goto out;
571 status = ocfs2_lock_global_qf(oinfo, 1);
572 if (status < 0)
573 goto out;
574
575 handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
576 if (IS_ERR(handle)) {
577 status = PTR_ERR(handle);
578 mlog_errno(status);
579 goto out_ilock;
580 }
581 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
582 status = ocfs2_sync_dquot(dquot);
583 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
584 if (status < 0)
585 mlog_errno(status);
586 /* We have to write local structure as well... */
587 dquot_mark_dquot_dirty(dquot);
588 status = dquot_commit(dquot);
589 if (status < 0)
590 mlog_errno(status);
591 ocfs2_commit_trans(osb, handle);
592out_ilock:
593 ocfs2_unlock_global_qf(oinfo, 1);
594out:
595 mlog_exit(status);
596 return status;
597}
598
599static void qsync_work_fn(struct work_struct *work)
600{
601 struct ocfs2_mem_dqinfo *oinfo = container_of(work,
602 struct ocfs2_mem_dqinfo,
603 dqi_sync_work.work);
604 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
605
606 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
607 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
608 oinfo->dqi_syncjiff);
609}
610
611/*
612 * Wrappers for generic quota functions
613 */
614
615static int ocfs2_write_dquot(struct dquot *dquot)
616{
617 handle_t *handle;
618 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
619 int status = 0;
620
621 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
622
623 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
624 if (IS_ERR(handle)) {
625 status = PTR_ERR(handle);
626 mlog_errno(status);
627 goto out;
628 }
629 status = dquot_commit(dquot);
630 ocfs2_commit_trans(osb, handle);
631out:
632 mlog_exit(status);
633 return status;
634}
635
636int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
637{
638 struct ocfs2_mem_dqinfo *oinfo;
639 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
640 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
641
642 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
643 return 0;
644
645 oinfo = sb_dqinfo(sb, type)->dqi_priv;
646 /* We modify tree, leaf block, global info, local chunk header,
647 * global and local inode */
648 return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
649 2 * OCFS2_INODE_UPDATE_CREDITS;
650}
651
652static int ocfs2_release_dquot(struct dquot *dquot)
653{
654 handle_t *handle;
655 struct ocfs2_mem_dqinfo *oinfo =
656 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
657 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
658 int status = 0;
659
660 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
661
662 status = ocfs2_lock_global_qf(oinfo, 1);
663 if (status < 0)
664 goto out;
665 handle = ocfs2_start_trans(osb,
666 ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
667 if (IS_ERR(handle)) {
668 status = PTR_ERR(handle);
669 mlog_errno(status);
670 goto out_ilock;
671 }
672 status = dquot_release(dquot);
673 ocfs2_commit_trans(osb, handle);
674out_ilock:
675 ocfs2_unlock_global_qf(oinfo, 1);
676out:
677 mlog_exit(status);
678 return status;
679}
680
681int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
682{
683 struct ocfs2_mem_dqinfo *oinfo;
684 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
686 struct ocfs2_dinode *lfe, *gfe;
687
688 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
689 return 0;
690
691 oinfo = sb_dqinfo(sb, type)->dqi_priv;
692 gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
693 lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
694 /* We can extend local file + global file. In local file we
695 * can modify info, chunk header block and dquot block. In
696 * global file we can modify info, tree and leaf block */
697 return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
698 ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
699 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
700}
701
/*
 * Read a dquot from the global quota file into memory (creating it
 * there if needed) under the exclusive global quota file lock and a
 * journal transaction.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_acquire_dquot(struct dquot *dquot)
{
	handle_t *handle;
	struct ocfs2_mem_dqinfo *oinfo =
			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
	int status = 0;

	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
	/* We need an exclusive lock, because we're going to update use count
	 * and instantiate possibly new dquot structure */
	status = ocfs2_lock_global_qf(oinfo, 1);
	if (status < 0)
		goto out;
	handle = ocfs2_start_trans(osb,
		ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_ilock;
	}
	/* The generic VFS helper performs the actual on-disk work. */
	status = dquot_acquire(dquot);
	ocfs2_commit_trans(osb, handle);
out_ilock:
	ocfs2_unlock_global_qf(oinfo, 1);
out:
	mlog_exit(status);
	return status;
}
731
/*
 * Mark a dquot dirty and decide how urgently its data must reach disk.
 * If any of the "last set by user" bits below is on (limits/usage were
 * explicitly changed via quotactl), we sync the dquot to the global
 * quota file immediately so the change propagates to other cluster
 * nodes quickly; otherwise writing the local quota file suffices.
 */
static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
{
	/* Bits corresponding to fields settable through Q_SETQUOTA. */
	unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
			     (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
			     (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
			     (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
			     (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
			     (1 << (DQ_LASTSET_B + QIF_ITIME_B));
	int sync = 0;
	int status;
	struct super_block *sb = dquot->dq_sb;
	int type = dquot->dq_type;
	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(sb);

	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
	dquot_mark_dquot_dirty(dquot);

	/* In case user set some limits, sync dquot immediately to global
	 * quota file so that information propagates quicker */
	spin_lock(&dq_data_lock);
	if (dquot->dq_flags & mask)
		sync = 1;
	spin_unlock(&dq_data_lock);
	if (!sync) {
		/* Fast path: only the local quota file needs updating. */
		status = ocfs2_write_dquot(dquot);
		goto out;
	}
	status = ocfs2_lock_global_qf(oinfo, 1);
	if (status < 0)
		goto out;
	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_ilock;
	}
	/* Push our changes into the global quota file... */
	status = ocfs2_sync_dquot(dquot);
	if (status < 0) {
		mlog_errno(status);
		goto out_trans;
	}
	/* Now write updated local dquot structure */
	status = dquot_commit(dquot);
out_trans:
	ocfs2_commit_trans(osb, handle);
out_ilock:
	ocfs2_unlock_global_qf(oinfo, 1);
out:
	mlog_exit(status);
	return status;
}
785
/* Write the information header of the global quota file under the
 * exclusive global quota file lock and a journal transaction.
 * This should happen only after set_dqinfo(). */
static int ocfs2_write_info(struct super_block *sb, int type)
{
	handle_t *handle;
	int status = 0;
	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;

	mlog_entry_void();

	status = ocfs2_lock_global_qf(oinfo, 1);
	if (status < 0)
		goto out;
	handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_ilock;
	}
	/* Generic helper writes the mem_dqinfo through our quota format. */
	status = dquot_commit_info(sb, type);
	ocfs2_commit_trans(OCFS2_SB(sb), handle);
out_ilock:
	ocfs2_unlock_global_qf(oinfo, 1);
out:
	mlog_exit(status);
	return status;
}
812
/* This is difficult. We have to lock quota inode and start transaction
 * in this function but we don't want to take the penalty of exclusive
 * quota file lock when we are just going to use cached structures. So
 * we just take read lock check whether we have dquot cached and if so,
 * we don't have to take the write lock... */
static int ocfs2_dquot_initialize(struct inode *inode, int type)
{
	handle_t *handle = NULL;
	int status = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_mem_dqinfo *oinfo;
	int exclusive = 0;	/* Did we escalate to the write lock? */
	int cnt;
	qid_t id;

	mlog_entry_void();

	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		/* type == -1 means "initialize all active quota types". */
		if (type != -1 && cnt != type)
			continue;
		if (!sb_has_quota_active(sb, cnt))
			continue;
		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
		status = ocfs2_lock_global_qf(oinfo, 0);
		if (status < 0)
			goto out;
		/* This is just a performance optimization not a reliable test.
		 * Since we hold an inode lock, noone can actually release
		 * the structure until we are finished with initialization. */
		if (inode->i_dquot[cnt] != NODQUOT) {
			ocfs2_unlock_global_qf(oinfo, 0);
			continue;
		}
		/* When we have inode lock, we know that no dquot_release() can
		 * run and thus we can safely check whether we need to
		 * read+modify global file to get quota information or whether
		 * our node already has it. */
		if (cnt == USRQUOTA)
			id = inode->i_uid;
		else if (cnt == GRPQUOTA)
			id = inode->i_gid;
		else
			BUG();
		/* Obtain exclusion from quota off... */
		down_write(&sb_dqopt(sb)->dqptr_sem);
		exclusive = !dquot_is_cached(sb, id, cnt);
		up_write(&sb_dqopt(sb)->dqptr_sem);
		if (exclusive) {
			/* Not cached: we will read/create the dquot in the
			 * global file, which needs the write lock and a
			 * transaction. */
			status = ocfs2_lock_global_qf(oinfo, 1);
			if (status < 0) {
				/* Avoid double-unlock at out_ilock. */
				exclusive = 0;
				mlog_errno(status);
				goto out_ilock;
			}
			handle = ocfs2_start_trans(OCFS2_SB(sb),
					ocfs2_calc_qinit_credits(sb, cnt));
			if (IS_ERR(handle)) {
				status = PTR_ERR(handle);
				mlog_errno(status);
				goto out_ilock;
			}
		}
		dquot_initialize(inode, cnt);
		if (exclusive) {
			ocfs2_commit_trans(OCFS2_SB(sb), handle);
			ocfs2_unlock_global_qf(oinfo, 1);
		}
		ocfs2_unlock_global_qf(oinfo, 0);
	}
	mlog_exit(0);
	return 0;
out_ilock:
	if (exclusive)
		ocfs2_unlock_global_qf(oinfo, 1);
	ocfs2_unlock_global_qf(oinfo, 0);
out:
	mlog_exit(status);
	return status;
}
892
/* Drop an inode's dquot references the expensive way: take the write
 * lock on every active global quota file and a transaction sized for
 * updates of both global files, because releasing the last reference
 * may have to write the global file. */
static int ocfs2_dquot_drop_slow(struct inode *inode)
{
	int status = 0;
	int cnt;
	int got_lock[MAXQUOTAS] = {0, 0};	/* which locks we acquired */
	handle_t *handle;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_mem_dqinfo *oinfo;

	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		if (!sb_has_quota_active(sb, cnt))
			continue;
		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
		status = ocfs2_lock_global_qf(oinfo, 1);
		if (status < 0)
			goto out;
		got_lock[cnt] = 1;
	}
	handle = ocfs2_start_trans(OCFS2_SB(sb),
			ocfs2_calc_qinit_credits(sb, USRQUOTA) +
			ocfs2_calc_qinit_credits(sb, GRPQUOTA));
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}
	dquot_drop(inode);
	ocfs2_commit_trans(OCFS2_SB(sb), handle);
out:
	/* Release exactly the locks we managed to take above. */
	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
		if (got_lock[cnt]) {
			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
			ocfs2_unlock_global_qf(oinfo, 1);
		}
	return status;
}
929
/* See the comment before ocfs2_dquot_initialize. */
static int ocfs2_dquot_drop(struct inode *inode)
{
	int status = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_mem_dqinfo *oinfo;
	int exclusive = 0;	/* set => retry via the slow (write-locked) path */
	int cnt;
	int got_lock[MAXQUOTAS] = {0, 0};

	mlog_entry_void();
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		if (!sb_has_quota_active(sb, cnt))
			continue;
		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
		status = ocfs2_lock_global_qf(oinfo, 0);
		if (status < 0)
			goto out;
		got_lock[cnt] = 1;
	}
	/* Lock against anyone releasing references so that when we check
	 * we know we are not going to be last ones to release dquot */
	down_write(&sb_dqopt(sb)->dqptr_sem);
	/* Urgh, this is a terrible hack :( */
	/* NOTE(review): exclusive is set when dq_count > 1; confirm this
	 * matches the intended "may need global file update" condition. */
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		if (inode->i_dquot[cnt] != NODQUOT &&
		    atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
			exclusive = 1;
			break;
		}
	}
	if (!exclusive)
		dquot_drop_locked(inode);
	up_write(&sb_dqopt(sb)->dqptr_sem);
out:
	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
		if (got_lock[cnt]) {
			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
			ocfs2_unlock_global_qf(oinfo, 0);
		}
	/* In case we bailed out because we had to do expensive locking
	 * do it now... */
	if (exclusive)
		status = ocfs2_dquot_drop_slow(inode);
	mlog_exit(status);
	return status;
}
977
978static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
979{
980 struct ocfs2_dquot *dquot =
981 kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
982
983 if (!dquot)
984 return NULL;
985 return &dquot->dq_dquot;
986}
987
/* Free a dquot obtained from ocfs2_alloc_dquot().
 * NOTE(review): frees through the generic dquot pointer, which assumes
 * dq_dquot is the first member of struct ocfs2_dquot — confirm in the
 * header declaring struct ocfs2_dquot. */
static void ocfs2_destroy_dquot(struct dquot *dquot)
{
	kmem_cache_free(ocfs2_dquot_cachep, dquot);
}
992
/*
 * ocfs2 quota operations: cluster-aware wrappers (ocfs2_*) around the
 * generic dquot machinery, plus plain generic helpers where no cluster
 * locking is needed.
 */
struct dquot_operations ocfs2_quota_operations = {
	.initialize	= ocfs2_dquot_initialize,
	.drop		= ocfs2_dquot_drop,
	.alloc_space	= dquot_alloc_space,
	.alloc_inode	= dquot_alloc_inode,
	.free_space	= dquot_free_space,
	.free_inode	= dquot_free_inode,
	.transfer	= dquot_transfer,
	.write_dquot	= ocfs2_write_dquot,
	.acquire_dquot	= ocfs2_acquire_dquot,
	.release_dquot	= ocfs2_release_dquot,
	.mark_dirty	= ocfs2_mark_dquot_dirty,
	.write_info	= ocfs2_write_info,
	.alloc_dquot	= ocfs2_alloc_dquot,
	.destroy_dquot	= ocfs2_destroy_dquot,
};
1009
1010int ocfs2_quota_setup(void)
1011{
1012 ocfs2_quota_wq = create_workqueue("o2quot");
1013 if (!ocfs2_quota_wq)
1014 return -ENOMEM;
1015 return 0;
1016}
1017
/* Tear down the quota workqueue created by ocfs2_quota_setup().
 * Safe to call when setup failed (ocfs2_quota_wq == NULL). */
void ocfs2_quota_shutdown(void)
{
	if (ocfs2_quota_wq) {
		/* NOTE(review): destroy_workqueue() flushes pending work
		 * itself; the explicit flush here looks redundant — confirm
		 * against the workqueue API of this kernel version. */
		flush_workqueue(ocfs2_quota_wq);
		destroy_workqueue(ocfs2_quota_wq);
		ocfs2_quota_wq = NULL;
	}
}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
1/*
2 * Implementation of operations over local quota file
3 */
4
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/module.h>
9
10#define MLOG_MASK_PREFIX ML_QUOTA
11#include <cluster/masklog.h>
12
13#include "ocfs2_fs.h"
14#include "ocfs2.h"
15#include "inode.h"
16#include "alloc.h"
17#include "file.h"
18#include "buffer_head_io.h"
19#include "journal.h"
20#include "sysfile.h"
21#include "dlmglue.h"
22#include "quota.h"
23
24/* Number of local quota structures per block */
25static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
26{
27 return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
28 sizeof(struct ocfs2_local_disk_dqblk));
29}
30
31/* Number of blocks with entries in one chunk */
32static inline unsigned int ol_chunk_blocks(struct super_block *sb)
33{
34 return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
35 OCFS2_QBLK_RESERVED_SPACE) << 3) /
36 ol_quota_entries_per_block(sb);
37}
38
/* Number of entries in a chunk bitmap */
static unsigned int ol_chunk_entries(struct super_block *sb)
{
	return ol_quota_entries_per_block(sb) * ol_chunk_blocks(sb);
}
44
/* Offset of the chunk in quota file */
static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
{
	/* Block 0 holds the local quota info header; each chunk then
	 * occupies one chunk-header block plus its data blocks. */
	return 1 + c * (ol_chunk_blocks(sb) + 1);
}
51
/* File block holding the entry with index @off within chunk @c. */
static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
{
	unsigned int entries = ol_quota_entries_per_block(sb);

	/* +1 skips the chunk header block. */
	return ol_quota_chunk_block(sb, c) + 1 + off / entries;
}
58
59static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
60{
61 int epb = ol_quota_entries_per_block(sb);
62
63 return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
64}
65
66/* Offset of the dquot structure in the quota file */
67static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
68{
69 return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
70 ol_dqblk_block_off(sb, c, off);
71}
72
73/* Compute block number from given offset */
74static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
75{
76 return off >> sb->s_blocksize_bits;
77}
78
79static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
80{
81 return off & ((1 << sb->s_blocksize_bits) - 1);
82}
83
/* Compute offset in the chunk of a structure with the given offset */
static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
{
	int epb = ol_quota_entries_per_block(sb);

	/* Full data blocks into the chunk (skipping the chunk header)
	 * times entries per block, plus the entry index inside its block. */
	return ((off >> sb->s_blocksize_bits) -
			ol_quota_chunk_block(sb, c) - 1) * epb
	       + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
	       sizeof(struct ocfs2_local_disk_dqblk);
}
94
95/* Write bufferhead into the fs */
96static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
97 void (*modify)(struct buffer_head *, void *), void *private)
98{
99 struct super_block *sb = inode->i_sb;
100 handle_t *handle;
101 int status;
102
103 handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
104 if (IS_ERR(handle)) {
105 status = PTR_ERR(handle);
106 mlog_errno(status);
107 return status;
108 }
109 status = ocfs2_journal_access_dq(handle, inode, bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (status < 0) {
112 mlog_errno(status);
113 ocfs2_commit_trans(OCFS2_SB(sb), handle);
114 return status;
115 }
116 lock_buffer(bh);
117 modify(bh, private);
118 unlock_buffer(bh);
119 status = ocfs2_journal_dirty(handle, bh);
120 if (status < 0) {
121 mlog_errno(status);
122 ocfs2_commit_trans(OCFS2_SB(sb), handle);
123 return status;
124 }
125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 if (status < 0) {
127 mlog_errno(status);
128 return status;
129 }
130 return 0;
131}
132
/* Check whether we understand format of quota files.
 * Verifies magic and version of both the local quota file (already open
 * in sb_dqopt) and the corresponding global quota file.
 * Returns 1 when both formats are recognized, 0 otherwise (note: not a
 * negative errno — this follows the quota_format check convention). */
static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
{
	unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
	unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
	unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
	unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
					GROUP_QUOTA_SYSTEM_INODE };
	struct buffer_head *bh = NULL;
	struct inode *linode = sb_dqopt(sb)->files[type];
	struct inode *ginode = NULL;
	struct ocfs2_disk_dqheader *dqhead;
	int status, ret = 0;

	/* First check whether we understand local quota file */
	status = ocfs2_read_quota_block(linode, 0, &bh);
	if (status) {
		mlog_errno(status);
		mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
			type);
		goto out_err;
	}
	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
	if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
		mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
			" type=%d\n", le32_to_cpu(dqhead->dqh_magic),
			lmagics[type], type);
		goto out_err;
	}
	if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
		mlog(ML_ERROR, "quota file version does not match (%u != %u),"
			" type=%d\n", le32_to_cpu(dqhead->dqh_version),
			lversions[type], type);
		goto out_err;
	}
	brelse(bh);
	bh = NULL;

	/* Next check whether we understand global quota file */
	ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
						OCFS2_INVALID_SLOT);
	if (!ginode) {
		mlog(ML_ERROR, "cannot get global quota file inode "
				"(type=%d)\n", type);
		goto out_err;
	}
	/* Since the header is read only, we don't care about locking */
	status = ocfs2_read_quota_block(ginode, 0, &bh);
	if (status) {
		mlog_errno(status);
		mlog(ML_ERROR, "failed to read global quota file header "
				"(type=%d)\n", type);
		goto out_err;
	}
	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
	if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
		mlog(ML_ERROR, "global quota file magic does not match "
			"(%u != %u), type=%d\n",
			le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
		goto out_err;
	}
	if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
		mlog(ML_ERROR, "global quota file version does not match "
			"(%u != %u), type=%d\n",
			le32_to_cpu(dqhead->dqh_version), gversions[type],
			type);
		goto out_err;
	}

	ret = 1;
out_err:
	brelse(bh);
	iput(ginode);
	return ret;
}
209
210/* Release given list of quota file chunks */
211static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
212{
213 struct ocfs2_quota_chunk *pos, *next;
214
215 list_for_each_entry_safe(pos, next, head, qc_chunk) {
216 list_del(&pos->qc_chunk);
217 brelse(pos->qc_headerbh);
218 kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
219 }
220}
221
/* Load quota bitmaps into memory.
 * Reads the chunk-header block of every chunk recorded in @ldinfo and
 * links them on @head.  On any failure the list is fully released and
 * a negative error code is returned; on success returns 0. */
static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
					  struct ocfs2_local_disk_dqinfo *ldinfo,
					  struct list_head *head)
{
	struct ocfs2_quota_chunk *newchunk;
	int i, status;

	INIT_LIST_HEAD(head);
	for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
		newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
		if (!newchunk) {
			ocfs2_release_local_quota_bitmaps(head);
			return -ENOMEM;
		}
		newchunk->qc_num = i;
		newchunk->qc_headerbh = NULL;
		status = ocfs2_read_quota_block(inode,
						ol_quota_chunk_block(inode->i_sb, i),
						&newchunk->qc_headerbh);
		if (status) {
			mlog_errno(status);
			/* Chunk not yet on the list — free it by hand. */
			kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
			ocfs2_release_local_quota_bitmaps(head);
			return status;
		}
		list_add_tail(&newchunk->qc_chunk, head);
	}
	return 0;
}
252
/* Callback for ocfs2_modify_bh(): copy the in-memory quota info
 * (@private is a struct mem_dqinfo) into the local quota file info
 * header stored in @bh.  dq_data_lock guards the mem_dqinfo fields. */
static void olq_update_info(struct buffer_head *bh, void *private)
{
	struct mem_dqinfo *info = private;
	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
	struct ocfs2_local_disk_dqinfo *ldinfo;

	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
						OCFS2_LOCAL_INFO_OFF);
	spin_lock(&dq_data_lock);
	/* Only persistent flag bits (DQF_MASK) go to disk. */
	ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
	ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
	ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
	spin_unlock(&dq_data_lock);
}
267
268static int ocfs2_add_recovery_chunk(struct super_block *sb,
269 struct ocfs2_local_disk_chunk *dchunk,
270 int chunk,
271 struct list_head *head)
272{
273 struct ocfs2_recovery_chunk *rc;
274
275 rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
276 if (!rc)
277 return -ENOMEM;
278 rc->rc_chunk = chunk;
279 rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
280 if (!rc->rc_bitmap) {
281 kfree(rc);
282 return -ENOMEM;
283 }
284 memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
285 (ol_chunk_entries(sb) + 7) >> 3);
286 list_add_tail(&rc->rc_list, head);
287 return 0;
288}
289
290static void free_recovery_list(struct list_head *head)
291{
292 struct ocfs2_recovery_chunk *next;
293 struct ocfs2_recovery_chunk *rchunk;
294
295 list_for_each_entry_safe(rchunk, next, head, rc_list) {
296 list_del(&rchunk->rc_list);
297 kfree(rchunk->rc_bitmap);
298 kfree(rchunk);
299 }
300}
301
302void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
303{
304 int type;
305
306 for (type = 0; type < MAXQUOTAS; type++)
307 free_recovery_list(&(rec->r_list[type]));
308 kfree(rec);
309}
310
/* Load entries in our quota file we have to recover.
 * Scans every chunk header; any chunk with at least one used entry
 * (dqc_free below the full entry count) is snapshotted onto @head via
 * ocfs2_add_recovery_chunk().  On failure the list is freed and a
 * negative error code returned. */
static int ocfs2_recovery_load_quota(struct inode *lqinode,
				     struct ocfs2_local_disk_dqinfo *ldinfo,
				     int type,
				     struct list_head *head)
{
	struct super_block *sb = lqinode->i_sb;
	struct buffer_head *hbh;
	struct ocfs2_local_disk_chunk *dchunk;
	int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
	int status = 0;

	for (i = 0; i < chunks; i++) {
		hbh = NULL;
		status = ocfs2_read_quota_block(lqinode,
						ol_quota_chunk_block(sb, i),
						&hbh);
		if (status) {
			mlog_errno(status);
			break;
		}
		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
		/* Any used entries in this chunk? Then remember it. */
		if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
			status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
		brelse(hbh);
		if (status < 0)
			break;
	}
	if (status < 0)
		free_recovery_list(head);
	return status;
}
343
344static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
345{
346 int type;
347 struct ocfs2_quota_recovery *rec;
348
349 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
350 if (!rec)
351 return NULL;
352 for (type = 0; type < MAXQUOTAS; type++)
353 INIT_LIST_HEAD(&(rec->r_list[type]));
354 return rec;
355}
356
/* Load information we need for quota recovery into memory.
 * For each enabled quota type, locks the local quota file of @slot_num,
 * reads its header and records all chunks with used entries in the
 * returned recovery context.  Returns the context or an ERR_PTR. */
struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
						struct ocfs2_super *osb,
						int slot_num)
{
	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
					    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
	struct super_block *sb = osb->sb;
	struct ocfs2_local_disk_dqinfo *ldinfo;
	struct inode *lqinode;
	struct buffer_head *bh;
	int type;
	int status = 0;
	struct ocfs2_quota_recovery *rec;

	mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
	rec = ocfs2_alloc_quota_recovery();
	if (!rec)
		return ERR_PTR(-ENOMEM);
	/* First init... */

	for (type = 0; type < MAXQUOTAS; type++) {
		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
			continue;
		/* At this point, journal of the slot is already replayed so
		 * we can trust metadata and data of the quota file */
		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
		if (!lqinode) {
			status = -ENOENT;
			goto out;
		}
		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
					       OCFS2_META_LOCK_RECOVERY);
		if (status < 0) {
			mlog_errno(status);
			goto out_put;
		}
		/* Now read local header */
		bh = NULL;
		status = ocfs2_read_quota_block(lqinode, 0, &bh);
		if (status) {
			mlog_errno(status);
			mlog(ML_ERROR, "failed to read quota file info header "
				"(slot=%d type=%d)\n", slot_num, type);
			goto out_lock;
		}
		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
							OCFS2_LOCAL_INFO_OFF);
		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
						   &rec->r_list[type]);
		brelse(bh);
out_lock:
		ocfs2_inode_unlock(lqinode, 1);
out_put:
		iput(lqinode);
		if (status < 0)
			break;
	}
out:
	if (status < 0) {
		ocfs2_free_quota_recovery(rec);
		rec = ERR_PTR(status);
	}
	return rec;
}
424
/* Sync changes in local quota file into global quota file and
 * reinitialize local quota file.
 * For each recorded chunk, every used entry is folded into the matching
 * global dquot (usage deltas added, crashed node's reference dropped)
 * and then freed in the local chunk bitmap.
 * The function expects local quota file to be already locked and
 * dqonoff_mutex locked. */
static int ocfs2_recover_local_quota_file(struct inode *lqinode,
					  int type,
					  struct ocfs2_quota_recovery *rec)
{
	struct super_block *sb = lqinode->i_sb;
	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
	struct ocfs2_local_disk_chunk *dchunk;
	struct ocfs2_local_disk_dqblk *dqblk;
	struct dquot *dquot;
	handle_t *handle;
	struct buffer_head *hbh = NULL, *qbh = NULL;
	int status = 0;
	int bit, chunk;
	struct ocfs2_recovery_chunk *rchunk, *next;
	qsize_t spacechange, inodechange;

	mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);

	status = ocfs2_lock_global_qf(oinfo, 1);
	if (status < 0)
		goto out;

	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
		chunk = rchunk->rc_chunk;
		hbh = NULL;
		status = ocfs2_read_quota_block(lqinode,
						ol_quota_chunk_block(sb, chunk),
						&hbh);
		if (status) {
			mlog_errno(status);
			break;
		}
		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
		/* Walk the snapshot bitmap: each set bit is a used entry. */
		for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
			qbh = NULL;
			status = ocfs2_read_quota_block(lqinode,
						ol_dqblk_block(sb, chunk, bit),
						&qbh);
			if (status) {
				mlog_errno(status);
				break;
			}
			dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
				ol_dqblk_block_off(sb, chunk, bit));
			dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
			if (!dquot) {
				status = -EIO;
				mlog(ML_ERROR, "Failed to get quota structure "
				     "for id %u, type %d. Cannot finish quota "
				     "file recovery.\n",
				     (unsigned)le64_to_cpu(dqblk->dqb_id),
				     type);
				goto out_put_bh;
			}
			handle = ocfs2_start_trans(OCFS2_SB(sb),
						   OCFS2_QSYNC_CREDITS);
			if (IS_ERR(handle)) {
				status = PTR_ERR(handle);
				mlog_errno(status);
				goto out_put_dquot;
			}
			mutex_lock(&sb_dqopt(sb)->dqio_mutex);
			spin_lock(&dq_data_lock);
			/* Add usage from quota entry into quota changes
			 * of our node. Auxiliary variables are important
			 * due to signedness */
			spacechange = le64_to_cpu(dqblk->dqb_spacemod);
			inodechange = le64_to_cpu(dqblk->dqb_inodemod);
			dquot->dq_dqb.dqb_curspace += spacechange;
			dquot->dq_dqb.dqb_curinodes += inodechange;
			spin_unlock(&dq_data_lock);
			/* We want to drop reference held by the crashed
			 * node. Since we have our own reference we know
			 * global structure actually won't be freed. */
			status = ocfs2_global_release_dquot(dquot);
			if (status < 0) {
				mlog_errno(status);
				goto out_commit;
			}
			/* Release local quota file entry */
			status = ocfs2_journal_access_dq(handle, lqinode,
					qbh, OCFS2_JOURNAL_ACCESS_WRITE);
			if (status < 0) {
				mlog_errno(status);
				goto out_commit;
			}
			lock_buffer(qbh);
			WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
			ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
			le32_add_cpu(&dchunk->dqc_free, 1);
			unlock_buffer(qbh);
			status = ocfs2_journal_dirty(handle, qbh);
			if (status < 0)
				mlog_errno(status);
out_commit:
			mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
			ocfs2_commit_trans(OCFS2_SB(sb), handle);
out_put_dquot:
			dqput(dquot);
out_put_bh:
			brelse(qbh);
			if (status < 0)
				break;
		}
		brelse(hbh);
		/* Chunk processed (or failed) — drop it from the list. */
		list_del(&rchunk->rc_list);
		kfree(rchunk->rc_bitmap);
		kfree(rchunk);
		if (status < 0)
			break;
	}
	ocfs2_unlock_global_qf(oinfo, 1);
out:
	if (status < 0)
		free_recovery_list(&(rec->r_list[type]));
	mlog_exit(status);
	return status;
}
547
/* Recover local quota files for given node different from us.
 * For each quota type with pending recovery chunks, takes the slot's
 * local quota file lock (non-blocking: another node already recovering
 * it means we can skip), replays unsynced entries into the global file
 * and marks the file clean unless it belongs to our own slot.
 * Consumes @rec (freed before returning). */
int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
				struct ocfs2_quota_recovery *rec,
				int slot_num)
{
	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
	struct super_block *sb = osb->sb;
	struct ocfs2_local_disk_dqinfo *ldinfo;
	struct buffer_head *bh;
	handle_t *handle;
	int type;
	int status = 0;
	struct inode *lqinode;
	unsigned int flags;

	mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
	/* Serializes against quota on/off while we touch quota files. */
	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
	for (type = 0; type < MAXQUOTAS; type++) {
		if (list_empty(&(rec->r_list[type])))
			continue;
		mlog(0, "Recovering quota in slot %d\n", slot_num);
		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
		if (!lqinode) {
			status = -ENOENT;
			goto out;
		}
		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
					       OCFS2_META_LOCK_NOQUEUE);
		/* Someone else is holding the lock? Then he must be
		 * doing the recovery. Just skip the file... */
		if (status == -EAGAIN) {
			mlog(ML_NOTICE, "skipping quota recovery for slot %d "
			     "because quota file is locked.\n", slot_num);
			status = 0;
			goto out_put;
		} else if (status < 0) {
			mlog_errno(status);
			goto out_put;
		}
		/* Now read local header */
		bh = NULL;
		status = ocfs2_read_quota_block(lqinode, 0, &bh);
		if (status) {
			mlog_errno(status);
			mlog(ML_ERROR, "failed to read quota file info header "
				"(slot=%d type=%d)\n", slot_num, type);
			goto out_lock;
		}
		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
							OCFS2_LOCAL_INFO_OFF);
		/* Is recovery still needed? */
		flags = le32_to_cpu(ldinfo->dqi_flags);
		if (!(flags & OLQF_CLEAN))
			status = ocfs2_recover_local_quota_file(lqinode,
								type,
								rec);
		/* We don't want to mark file as clean when it is actually
		 * active */
		if (slot_num == osb->slot_num)
			goto out_bh;
		/* Mark quota file as clean if we are recovering quota file of
		 * some other node. */
		handle = ocfs2_start_trans(osb, 1);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto out_bh;
		}
		status = ocfs2_journal_access_dq(handle, lqinode, bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto out_trans;
		}
		lock_buffer(bh);
		ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
		unlock_buffer(bh);
		status = ocfs2_journal_dirty(handle, bh);
		if (status < 0)
			mlog_errno(status);
out_trans:
		ocfs2_commit_trans(osb, handle);
out_bh:
		brelse(bh);
out_lock:
		ocfs2_inode_unlock(lqinode, 1);
out_put:
		iput(lqinode);
		if (status < 0)
			break;
	}
out:
	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
	/* Chunk lists were consumed above; only the container remains. */
	kfree(rec);
	return status;
}
645
/* Read information header from quota file.
 * Allocates and fills the ocfs2 private quota info, reads the local
 * file header, queues crash recovery of our own file if it was left
 * unclean, loads chunk bitmaps and marks the file in use.
 * Returns 0 on success, -1 on failure (quota_format convention). */
static int ocfs2_local_read_info(struct super_block *sb, int type)
{
	struct ocfs2_local_disk_dqinfo *ldinfo;
	struct mem_dqinfo *info = sb_dqinfo(sb, type);
	struct ocfs2_mem_dqinfo *oinfo;
	struct inode *lqinode = sb_dqopt(sb)->files[type];
	int status;
	struct buffer_head *bh = NULL;
	struct ocfs2_quota_recovery *rec;
	int locked = 0;

	info->dqi_maxblimit = 0x7fffffffffffffffLL;
	info->dqi_maxilimit = 0x7fffffffffffffffLL;
	oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
	if (!oinfo) {
		mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
			       " info.");
		goto out_err;
	}
	info->dqi_priv = oinfo;
	oinfo->dqi_type = type;
	INIT_LIST_HEAD(&oinfo->dqi_chunk);
	oinfo->dqi_rec = NULL;
	oinfo->dqi_lqi_bh = NULL;
	oinfo->dqi_ibh = NULL;

	status = ocfs2_global_read_info(sb, type);
	if (status < 0)
		goto out_err;
	/* NOTE(review): if ocfs2_global_read_info() fails, the out_err path
	 * below still touches oinfo->dqi_gqinode / dqi_gqlock, which are
	 * presumably only initialized by that function — confirm they are
	 * safe to release here. */

	status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_err;
	}
	locked = 1;

	/* Now read local header */
	status = ocfs2_read_quota_block(lqinode, 0, &bh);
	if (status) {
		mlog_errno(status);
		mlog(ML_ERROR, "failed to read quota file info header "
			"(type=%d)\n", type);
		goto out_err;
	}
	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
						OCFS2_LOCAL_INFO_OFF);
	info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
	oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
	oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
	oinfo->dqi_ibh = bh;

	/* We crashed when using local quota file? */
	if (!(info->dqi_flags & OLQF_CLEAN)) {
		rec = OCFS2_SB(sb)->quota_rec;
		if (!rec) {
			rec = ocfs2_alloc_quota_recovery();
			if (!rec) {
				status = -ENOMEM;
				mlog_errno(status);
				goto out_err;
			}
			OCFS2_SB(sb)->quota_rec = rec;
		}

		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
						   &rec->r_list[type]);
		if (status < 0) {
			mlog_errno(status);
			goto out_err;
		}
	}

	status = ocfs2_load_local_quota_bitmaps(lqinode,
						ldinfo,
						&oinfo->dqi_chunk);
	if (status < 0) {
		mlog_errno(status);
		goto out_err;
	}

	/* Now mark quota file as used */
	info->dqi_flags &= ~OLQF_CLEAN;
	status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
	if (status < 0) {
		mlog_errno(status);
		goto out_err;
	}

	return 0;
out_err:
	/* NOTE(review): -1 is returned here rather than the negative errno
	 * in status; the quota_format read_file_info convention appears to
	 * allow this — confirm callers do not need the real errno. */
	if (oinfo) {
		iput(oinfo->dqi_gqinode);
		ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
		ocfs2_lock_res_free(&oinfo->dqi_gqlock);
		brelse(oinfo->dqi_lqi_bh);
		if (locked)
			ocfs2_inode_unlock(lqinode, 1);
		ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
		kfree(oinfo);
	}
	brelse(bh);
	return -1;
}
751
752/* Write local info to quota file */
753static int ocfs2_local_write_info(struct super_block *sb, int type)
754{
755 struct mem_dqinfo *info = sb_dqinfo(sb, type);
756 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
757 ->dqi_ibh;
758 int status;
759
760 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
761 info);
762 if (status < 0) {
763 mlog_errno(status);
764 return -1;
765 }
766
767 return 0;
768}
769
/* Release info from memory.
 * Tears down the per-type quota state: stops the periodic sync work,
 * drops the global quota file resources, sanity-checks that all local
 * entries are free, and marks the local file clean unless entries are
 * still used or a recovery for it is pending. */
static int ocfs2_local_free_info(struct super_block *sb, int type)
{
	struct mem_dqinfo *info = sb_dqinfo(sb, type);
	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
	struct ocfs2_quota_chunk *chunk;
	struct ocfs2_local_disk_chunk *dchunk;
	int mark_clean = 1, len;
	int status;

	/* At this point we know there are no more dquots and thus
	 * even if there's some sync in the pdflush queue, it won't
	 * find any dquots and return without doing anything */
	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
	iput(oinfo->dqi_gqinode);
	ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
	ocfs2_lock_res_free(&oinfo->dqi_gqlock);
	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
		dchunk = (struct ocfs2_local_disk_chunk *)
					(chunk->qc_headerbh->b_data);
		/* The last chunk may be only partially populated. */
		if (chunk->qc_num < oinfo->dqi_chunks - 1) {
			len = ol_chunk_entries(sb);
		} else {
			len = (oinfo->dqi_blocks -
			       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
			      * ol_quota_entries_per_block(sb);
		}
		/* Not all entries free? Bug! */
		if (le32_to_cpu(dchunk->dqc_free) != len) {
			mlog(ML_ERROR, "releasing quota file with used "
					"entries (type=%d)\n", type);
			mark_clean = 0;
		}
	}
	ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);

	/* dqonoff_mutex protects us against racing with recovery thread... */
	if (oinfo->dqi_rec) {
		ocfs2_free_quota_recovery(oinfo->dqi_rec);
		mark_clean = 0;
	}

	if (!mark_clean)
		goto out;

	/* Mark local file as clean */
	info->dqi_flags |= OLQF_CLEAN;
	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
				 oinfo->dqi_ibh,
				 olq_update_info,
				 info);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

out:
	ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
	brelse(oinfo->dqi_ibh);
	brelse(oinfo->dqi_lqi_bh);
	kfree(oinfo);
	return 0;
}
833
834static void olq_set_dquot(struct buffer_head *bh, void *private)
835{
836 struct ocfs2_dquot *od = private;
837 struct ocfs2_local_disk_dqblk *dqblk;
838 struct super_block *sb = od->dq_dquot.dq_sb;
839
840 dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
841 + ol_dqblk_block_offset(sb, od->dq_local_off));
842
843 dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
844 spin_lock(&dq_data_lock);
845 dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
846 od->dq_origspace);
847 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
848 od->dq_originodes);
849 spin_unlock(&dq_data_lock);
850 mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
851 od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
852 (long long)le64_to_cpu(dqblk->dqb_inodemod));
853}
854
855/* Write dquot to local quota file */
856static int ocfs2_local_write_dquot(struct dquot *dquot)
857{
858 struct super_block *sb = dquot->dq_sb;
859 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
860 struct buffer_head *bh = NULL;
861 int status;
862
863 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
864 ol_dqblk_file_block(sb, od->dq_local_off),
865 &bh);
866 if (status) {
867 mlog_errno(status);
868 goto out;
869 }
870 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
871 olq_set_dquot, od);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out;
875 }
876out:
877 brelse(bh);
878 return status;
879}
880
881/* Find free entry in local quota file */
882static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
883 int type,
884 int *offset)
885{
886 struct mem_dqinfo *info = sb_dqinfo(sb, type);
887 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
888 struct ocfs2_quota_chunk *chunk;
889 struct ocfs2_local_disk_chunk *dchunk;
890 int found = 0, len;
891
892 list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
893 dchunk = (struct ocfs2_local_disk_chunk *)
894 chunk->qc_headerbh->b_data;
895 if (le32_to_cpu(dchunk->dqc_free) > 0) {
896 found = 1;
897 break;
898 }
899 }
900 if (!found)
901 return NULL;
902
903 if (chunk->qc_num < oinfo->dqi_chunks - 1) {
904 len = ol_chunk_entries(sb);
905 } else {
906 len = (oinfo->dqi_blocks -
907 ol_quota_chunk_block(sb, chunk->qc_num) - 1)
908 * ol_quota_entries_per_block(sb);
909 }
910
911 found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
912 /* We failed? */
913 if (found == len) {
914 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
915 " entries free (type=%d)\n", chunk->qc_num,
916 le32_to_cpu(dchunk->dqc_free), type);
917 return ERR_PTR(-EIO);
918 }
919 *offset = found;
920 return chunk;
921}
922
923/* Add new chunk to the local quota file */
924static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
925 struct super_block *sb,
926 int type,
927 int *offset)
928{
929 struct mem_dqinfo *info = sb_dqinfo(sb, type);
930 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
931 struct inode *lqinode = sb_dqopt(sb)->files[type];
932 struct ocfs2_quota_chunk *chunk = NULL;
933 struct ocfs2_local_disk_chunk *dchunk;
934 int status;
935 handle_t *handle;
936 struct buffer_head *bh = NULL;
937 u64 p_blkno;
938
939 /* We are protected by dqio_sem so no locking needed */
940 status = ocfs2_extend_no_holes(lqinode,
941 lqinode->i_size + 2 * sb->s_blocksize,
942 lqinode->i_size);
943 if (status < 0) {
944 mlog_errno(status);
945 goto out;
946 }
947 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
948 lqinode->i_size + 2 * sb->s_blocksize);
949 if (status < 0) {
950 mlog_errno(status);
951 goto out;
952 }
953
954 chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
955 if (!chunk) {
956 status = -ENOMEM;
957 mlog_errno(status);
958 goto out;
959 }
960
961 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
962 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
963 &p_blkno, NULL, NULL);
964 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
965 if (status < 0) {
966 mlog_errno(status);
967 goto out;
968 }
969 bh = sb_getblk(sb, p_blkno);
970 if (!bh) {
971 status = -ENOMEM;
972 mlog_errno(status);
973 goto out;
974 }
975 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
976
977 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
978 if (IS_ERR(handle)) {
979 status = PTR_ERR(handle);
980 mlog_errno(status);
981 goto out;
982 }
983
984 status = ocfs2_journal_access_dq(handle, lqinode, bh,
985 OCFS2_JOURNAL_ACCESS_WRITE);
986 if (status < 0) {
987 mlog_errno(status);
988 goto out_trans;
989 }
990 lock_buffer(bh);
991 dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
992 memset(dchunk->dqc_bitmap, 0,
993 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
994 OCFS2_QBLK_RESERVED_SPACE);
995 set_buffer_uptodate(bh);
996 unlock_buffer(bh);
997 status = ocfs2_journal_dirty(handle, bh);
998 if (status < 0) {
999 mlog_errno(status);
1000 goto out_trans;
1001 }
1002
1003 oinfo->dqi_blocks += 2;
1004 oinfo->dqi_chunks++;
1005 status = ocfs2_local_write_info(sb, type);
1006 if (status < 0) {
1007 mlog_errno(status);
1008 goto out_trans;
1009 }
1010 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
1011 if (status < 0) {
1012 mlog_errno(status);
1013 goto out;
1014 }
1015
1016 list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
1017 chunk->qc_num = list_entry(chunk->qc_chunk.prev,
1018 struct ocfs2_quota_chunk,
1019 qc_chunk)->qc_num + 1;
1020 chunk->qc_headerbh = bh;
1021 *offset = 0;
1022 return chunk;
1023out_trans:
1024 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1025out:
1026 brelse(bh);
1027 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
1028 return ERR_PTR(status);
1029}
1030
/* Make room for a new dquot in the local quota file: grow the last
 * chunk by one data block when it still has block slots left, otherwise
 * (or when no chunk exists yet) add a whole new chunk. Returns the
 * chunk with free space, setting *offset to its first new entry. */
static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
							struct super_block *sb,
							int type,
							int *offset)
{
	struct mem_dqinfo *info = sb_dqinfo(sb, type);
	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
	struct ocfs2_quota_chunk *chunk;
	struct inode *lqinode = sb_dqopt(sb)->files[type];
	struct ocfs2_local_disk_chunk *dchunk;
	int epb = ol_quota_entries_per_block(sb);
	unsigned int chunk_blocks;
	int status;
	handle_t *handle;

	if (list_empty(&oinfo->dqi_chunk))
		return ocfs2_local_quota_add_chunk(sb, type, offset);
	/* Is the last chunk full? */
	chunk = list_entry(oinfo->dqi_chunk.prev,
			   struct ocfs2_quota_chunk, qc_chunk);
	chunk_blocks = oinfo->dqi_blocks -
			ol_quota_chunk_block(sb, chunk->qc_num) - 1;
	if (ol_chunk_blocks(sb) == chunk_blocks)
		return ocfs2_local_quota_add_chunk(sb, type, offset);

	/* We are protected by dqio_sem so no locking needed */
	/* Extend the file by one data block for the last chunk */
	status = ocfs2_extend_no_holes(lqinode,
				       lqinode->i_size + sb->s_blocksize,
				       lqinode->i_size);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}
	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
					  lqinode->i_size + sb->s_blocksize);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}
	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}
	status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
				 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_trans;
	}

	/* Credit the chunk header with one block's worth of new free
	 * entries; the buffer lock keeps readers from seeing a torn
	 * update */
	dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
	lock_buffer(chunk->qc_headerbh);
	le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
	unlock_buffer(chunk->qc_headerbh);
	status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
	if (status < 0) {
		mlog_errno(status);
		goto out_trans;
	}
	oinfo->dqi_blocks++;
	status = ocfs2_local_write_info(sb, type);
	if (status < 0) {
		mlog_errno(status);
		goto out_trans;
	}

	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}
	/* First free entry is at the start of the block we just added */
	*offset = chunk_blocks * epb;
	return chunk;
out_trans:
	ocfs2_commit_trans(OCFS2_SB(sb), handle);
out:
	return ERR_PTR(status);
}
1112
1113static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1114{
1115 int *offset = private;
1116 struct ocfs2_local_disk_chunk *dchunk;
1117
1118 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
1119 ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
1120 le32_add_cpu(&dchunk->dqc_free, -1);
1121}
1122
1123/* Create dquot in the local file for given id */
1124static int ocfs2_create_local_dquot(struct dquot *dquot)
1125{
1126 struct super_block *sb = dquot->dq_sb;
1127 int type = dquot->dq_type;
1128 struct inode *lqinode = sb_dqopt(sb)->files[type];
1129 struct ocfs2_quota_chunk *chunk;
1130 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1131 int offset;
1132 int status;
1133
1134 chunk = ocfs2_find_free_entry(sb, type, &offset);
1135 if (!chunk) {
1136 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1137 if (IS_ERR(chunk))
1138 return PTR_ERR(chunk);
1139 } else if (IS_ERR(chunk)) {
1140 return PTR_ERR(chunk);
1141 }
1142 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1143 od->dq_chunk = chunk;
1144
1145 /* Initialize dquot structure on disk */
1146 status = ocfs2_local_write_dquot(dquot);
1147 if (status < 0) {
1148 mlog_errno(status);
1149 goto out;
1150 }
1151
1152 /* Mark structure as allocated */
1153 status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
1154 &offset);
1155 if (status < 0) {
1156 mlog_errno(status);
1157 goto out;
1158 }
1159out:
1160 return status;
1161}
1162
1163/* Create entry in local file for dquot, load data from the global file */
1164static int ocfs2_local_read_dquot(struct dquot *dquot)
1165{
1166 int status;
1167
1168 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1169
1170 status = ocfs2_global_read_dquot(dquot);
1171 if (status < 0) {
1172 mlog_errno(status);
1173 goto out_err;
1174 }
1175
1176 /* Now create entry in the local quota file */
1177 status = ocfs2_create_local_dquot(dquot);
1178 if (status < 0) {
1179 mlog_errno(status);
1180 goto out_err;
1181 }
1182 mlog_exit(0);
1183 return 0;
1184out_err:
1185 mlog_exit(status);
1186 return status;
1187}
1188
/* Release dquot structure from local quota file. ocfs2_release_dquot() has
 * already started a transaction and obtained exclusive lock for global
 * quota file. */
static int ocfs2_local_release_dquot(struct dquot *dquot)
{
	int status;
	int type = dquot->dq_type;
	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
	struct super_block *sb = dquot->dq_sb;
	struct ocfs2_local_disk_chunk *dchunk;
	int offset;
	handle_t *handle = journal_current_handle();

	/* Per the comment above, the caller must already run a transaction */
	BUG_ON(!handle);
	/* First write all local changes to global file */
	status = ocfs2_global_release_dquot(dquot);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/* Get journal access to the chunk header so we can clear this
	 * dquot's allocation bit */
	status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
			od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}
	offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
				    od->dq_local_off);
	dchunk = (struct ocfs2_local_disk_chunk *)
			(od->dq_chunk->qc_headerbh->b_data);
	/* Mark structure as freed */
	lock_buffer(od->dq_chunk->qc_headerbh);
	ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
	le32_add_cpu(&dchunk->dqc_free, 1);
	unlock_buffer(od->dq_chunk->qc_headerbh);
	status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}
	status = 0;
out:
	/* Clear the read bit so that next time someone uses this
	 * dquot he reads fresh info from disk and allocates local
	 * dquot structure */
	clear_bit(DQ_READ_B, &dquot->dq_flags);
	return status;
}
1238
/* Quota-format operations for the OCFS2 local quota file; global info
 * writes go through the global-file helper, everything else is local */
static struct quota_format_ops ocfs2_format_ops = {
	.check_quota_file = ocfs2_local_check_quota_file,
	.read_file_info = ocfs2_local_read_info,
	.write_file_info = ocfs2_global_write_info,
	.free_file_info = ocfs2_local_free_info,
	.read_dqblk = ocfs2_local_read_dquot,
	.commit_dqblk = ocfs2_local_write_dquot,
	.release_dqblk = ocfs2_local_release_dquot,
};
1248
/* Format descriptor registered with the VFS quota subsystem */
struct quota_format_type ocfs2_quota_format = {
	.qf_fmt_id = QFMT_OCFS2,
	.qf_ops = &ocfs2_format_ops,
	.qf_owner = THIS_MODULE
};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", 106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster); 107 new_clusters, first_new_cluster);
108 108
109 ret = ocfs2_journal_access(handle, bm_inode, group_bh, 109 ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
110 OCFS2_JOURNAL_ACCESS_WRITE); 110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) { 111 if (ret < 0) {
112 mlog_errno(ret); 112 mlog_errno(ret);
113 goto out; 113 goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
141 } 141 }
142 142
143 /* update the inode accordingly. */ 143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access(handle, bm_inode, bm_bh, 144 ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE); 145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) { 146 if (ret < 0) {
147 mlog_errno(ret); 147 mlog_errno(ret);
148 goto out_rollback; 148 goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
314 314
315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
316 316
317 /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
318 * so any corruption is a code bug. */
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320
317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
318 ocfs2_group_bitmap_size(osb->sb) * 8) { 322 ocfs2_group_bitmap_size(osb->sb) * 8) {
319 mlog(ML_ERROR, "The disk is too old and small. " 323 mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
322 goto out_unlock; 326 goto out_unlock;
323 } 327 }
324 328
325 if (!OCFS2_IS_VALID_DINODE(fe)) {
326 OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
327 ret = -EIO;
328 goto out_unlock;
329 }
330
331 first_new_cluster = le32_to_cpu(fe->i_clusters); 329 first_new_cluster = le32_to_cpu(fe->i_clusters);
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, 330 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1); 331 first_new_cluster - 1);
334 332
335 ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); 333 ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
334 &group_bh);
336 if (ret < 0) { 335 if (ret < 0) {
337 mlog_errno(ret); 336 mlog_errno(ret);
338 goto out_unlock; 337 goto out_unlock;
339 } 338 }
340
341 group = (struct ocfs2_group_desc *)group_bh->b_data; 339 group = (struct ocfs2_group_desc *)group_bh->b_data;
342 340
343 ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
344 if (ret) {
345 mlog_errno(ret);
346 goto out_unlock;
347 }
348
349 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); 341 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
350 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > 342 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
351 le16_to_cpu(fe->id2.i_chain.cl_cpg)) { 343 le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
398 struct buffer_head *group_bh) 390 struct buffer_head *group_bh)
399{ 391{
400 int ret; 392 int ret;
401 struct ocfs2_group_desc *gd; 393 struct ocfs2_group_desc *gd =
394 (struct ocfs2_group_desc *)group_bh->b_data;
402 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); 395 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
403 unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
404 le16_to_cpu(di->id2.i_chain.cl_bpc);
405
406 396
407 gd = (struct ocfs2_group_desc *)group_bh->b_data; 397 ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
398 if (ret)
399 goto out;
408 400
409 ret = -EIO; 401 ret = -EINVAL;
410 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) 402 if (le16_to_cpu(gd->bg_chain) != input->chain)
411 mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
412 (unsigned long long)le64_to_cpu(gd->bg_blkno));
413 else if (di->i_blkno != gd->bg_parent_dinode)
414 mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
415 "pointer (%llu, expected %llu)\n",
416 (unsigned long long)le64_to_cpu(gd->bg_blkno),
417 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
418 (unsigned long long)le64_to_cpu(di->i_blkno));
419 else if (le16_to_cpu(gd->bg_bits) > max_bits)
420 mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
421 (unsigned long long)le64_to_cpu(gd->bg_blkno),
422 le16_to_cpu(gd->bg_bits));
423 else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
424 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
425 "claims that %u are free\n",
426 (unsigned long long)le64_to_cpu(gd->bg_blkno),
427 le16_to_cpu(gd->bg_bits),
428 le16_to_cpu(gd->bg_free_bits_count));
429 else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
430 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
431 "max bitmap bits of %u\n",
432 (unsigned long long)le64_to_cpu(gd->bg_blkno),
433 le16_to_cpu(gd->bg_bits),
434 8 * le16_to_cpu(gd->bg_size));
435 else if (le16_to_cpu(gd->bg_chain) != input->chain)
436 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " 403 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
437 "while input has %u set.\n", 404 "while input has %u set.\n",
438 (unsigned long long)le64_to_cpu(gd->bg_blkno), 405 (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
451 else 418 else
452 ret = 0; 419 ret = 0;
453 420
421out:
454 return ret; 422 return ret;
455} 423}
456 424
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
568 cl = &fe->id2.i_chain; 536 cl = &fe->id2.i_chain;
569 cr = &cl->cl_recs[input->chain]; 537 cr = &cl->cl_recs[input->chain];
570 538
571 ret = ocfs2_journal_access(handle, main_bm_inode, group_bh, 539 ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 540 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (ret < 0) { 541 if (ret < 0) {
574 mlog_errno(ret); 542 mlog_errno(ret);
575 goto out_commit; 543 goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
584 goto out_commit; 552 goto out_commit;
585 } 553 }
586 554
587 ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh, 555 ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
588 OCFS2_JOURNAL_ACCESS_WRITE); 556 OCFS2_JOURNAL_ACCESS_WRITE);
589 if (ret < 0) { 557 if (ret < 0) {
590 mlog_errno(ret); 558 mlog_errno(ret);
591 goto out_commit; 559 goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
151 * this is not true, the read of -1 (UINT64_MAX) will fail. 151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */ 152 */
153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, 153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
154 OCFS2_BH_IGNORE_CACHE); 154 OCFS2_BH_IGNORE_CACHE, NULL);
155 if (ret == 0) { 155 if (ret == 0) {
156 spin_lock(&osb->osb_lock); 156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si); 157 ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
405 405
406 bh = NULL; /* Acquire a fresh bh */ 406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, 407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
408 OCFS2_BH_IGNORE_CACHE); 408 OCFS2_BH_IGNORE_CACHE, NULL);
409 if (status < 0) { 409 if (status < 0) {
410 mlog_errno(status); 410 mlog_errno(status);
411 goto bail; 411 goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "blockcheck.h"
38#include "dlmglue.h" 39#include "dlmglue.h"
39#include "inode.h" 40#include "inode.h"
40#include "journal.h" 41#include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
145 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); 146 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
146} 147}
147 148
148/* somewhat more expensive than our other checks, so use sparingly. */ 149#define do_error(fmt, ...) \
149int ocfs2_check_group_descriptor(struct super_block *sb, 150 do{ \
150 struct ocfs2_dinode *di, 151 if (clean_error) \
151 struct ocfs2_group_desc *gd) 152 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
153 else \
154 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
155 } while (0)
156
157static int ocfs2_validate_gd_self(struct super_block *sb,
158 struct buffer_head *bh,
159 int clean_error)
152{ 160{
153 unsigned int max_bits; 161 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
154 162
155 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 163 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
156 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); 164 do_error("Group descriptor #%llu has bad signature %.*s",
157 return -EIO; 165 (unsigned long long)bh->b_blocknr, 7,
166 gd->bg_signature);
167 return -EINVAL;
158 } 168 }
159 169
170 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
171 do_error("Group descriptor #%llu has an invalid bg_blkno "
172 "of %llu",
173 (unsigned long long)bh->b_blocknr,
174 (unsigned long long)le64_to_cpu(gd->bg_blkno));
175 return -EINVAL;
176 }
177
178 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
179 do_error("Group descriptor #%llu has an invalid "
180 "fs_generation of #%u",
181 (unsigned long long)bh->b_blocknr,
182 le32_to_cpu(gd->bg_generation));
183 return -EINVAL;
184 }
185
186 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
187 do_error("Group descriptor #%llu has bit count %u but "
188 "claims that %u are free",
189 (unsigned long long)bh->b_blocknr,
190 le16_to_cpu(gd->bg_bits),
191 le16_to_cpu(gd->bg_free_bits_count));
192 return -EINVAL;
193 }
194
195 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
196 do_error("Group descriptor #%llu has bit count %u but "
197 "max bitmap bits of %u",
198 (unsigned long long)bh->b_blocknr,
199 le16_to_cpu(gd->bg_bits),
200 8 * le16_to_cpu(gd->bg_size));
201 return -EINVAL;
202 }
203
204 return 0;
205}
206
207static int ocfs2_validate_gd_parent(struct super_block *sb,
208 struct ocfs2_dinode *di,
209 struct buffer_head *bh,
210 int clean_error)
211{
212 unsigned int max_bits;
213 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
214
160 if (di->i_blkno != gd->bg_parent_dinode) { 215 if (di->i_blkno != gd->bg_parent_dinode) {
161 ocfs2_error(sb, "Group descriptor # %llu has bad parent " 216 do_error("Group descriptor #%llu has bad parent "
162 "pointer (%llu, expected %llu)", 217 "pointer (%llu, expected %llu)",
163 (unsigned long long)le64_to_cpu(gd->bg_blkno), 218 (unsigned long long)bh->b_blocknr,
164 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), 219 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
165 (unsigned long long)le64_to_cpu(di->i_blkno)); 220 (unsigned long long)le64_to_cpu(di->i_blkno));
166 return -EIO; 221 return -EINVAL;
167 } 222 }
168 223
169 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); 224 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
170 if (le16_to_cpu(gd->bg_bits) > max_bits) { 225 if (le16_to_cpu(gd->bg_bits) > max_bits) {
171 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", 226 do_error("Group descriptor #%llu has bit count of %u",
172 (unsigned long long)le64_to_cpu(gd->bg_blkno), 227 (unsigned long long)bh->b_blocknr,
173 le16_to_cpu(gd->bg_bits)); 228 le16_to_cpu(gd->bg_bits));
174 return -EIO; 229 return -EINVAL;
175 } 230 }
176 231
177 if (le16_to_cpu(gd->bg_chain) >= 232 if (le16_to_cpu(gd->bg_chain) >=
178 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
179 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
180 (unsigned long long)le64_to_cpu(gd->bg_blkno), 235 (unsigned long long)bh->b_blocknr,
181 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
182 return -EIO; 237 return -EINVAL;
183 } 238 }
184 239
185 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { 240 return 0;
186 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " 241}
187 "claims that %u are free",
188 (unsigned long long)le64_to_cpu(gd->bg_blkno),
189 le16_to_cpu(gd->bg_bits),
190 le16_to_cpu(gd->bg_free_bits_count));
191 return -EIO;
192 }
193 242
194 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { 243#undef do_error
195 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " 244
196 "max bitmap bits of %u", 245/*
197 (unsigned long long)le64_to_cpu(gd->bg_blkno), 246 * This version only prints errors. It does not fail the filesystem, and
198 le16_to_cpu(gd->bg_bits), 247 * exists only for resize.
199 8 * le16_to_cpu(gd->bg_size)); 248 */
200 return -EIO; 249int ocfs2_check_group_descriptor(struct super_block *sb,
250 struct ocfs2_dinode *di,
251 struct buffer_head *bh)
252{
253 int rc;
254 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
255
256 BUG_ON(!buffer_uptodate(bh));
257
258 /*
259 * If the ecc fails, we return the error but otherwise
260 * leave the filesystem running. We know any error is
261 * local to this block.
262 */
263 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
264 if (rc) {
265 mlog(ML_ERROR,
266 "Checksum failed for group descriptor %llu\n",
267 (unsigned long long)bh->b_blocknr);
268 } else
269 rc = ocfs2_validate_gd_self(sb, bh, 1);
270 if (!rc)
271 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
272
273 return rc;
274}
275
276static int ocfs2_validate_group_descriptor(struct super_block *sb,
277 struct buffer_head *bh)
278{
279 int rc;
280 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
281
282 mlog(0, "Validating group descriptor %llu\n",
283 (unsigned long long)bh->b_blocknr);
284
285 BUG_ON(!buffer_uptodate(bh));
286
287 /*
288 * If the ecc fails, we return the error but otherwise
289 * leave the filesystem running. We know any error is
290 * local to this block.
291 */
292 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
293 if (rc)
294 return rc;
295
296 /*
297 * Errors after here are fatal.
298 */
299
300 return ocfs2_validate_gd_self(sb, bh, 0);
301}
302
303int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
304 u64 gd_blkno, struct buffer_head **bh)
305{
306 int rc;
307 struct buffer_head *tmp = *bh;
308
309 rc = ocfs2_read_block(inode, gd_blkno, &tmp,
310 ocfs2_validate_group_descriptor);
311 if (rc)
312 goto out;
313
314 rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
315 if (rc) {
316 brelse(tmp);
317 goto out;
201 } 318 }
202 319
203 return 0; 320 /* If ocfs2_read_block() got us a new bh, pass it up. */
321 if (!*bh)
322 *bh = tmp;
323
324out:
325 return rc;
204} 326}
205 327
206static int ocfs2_block_group_fill(handle_t *handle, 328static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
225 goto bail; 347 goto bail;
226 } 348 }
227 349
228 status = ocfs2_journal_access(handle, 350 status = ocfs2_journal_access_gd(handle,
229 alloc_inode, 351 alloc_inode,
230 bg_bh, 352 bg_bh,
231 OCFS2_JOURNAL_ACCESS_CREATE); 353 OCFS2_JOURNAL_ACCESS_CREATE);
232 if (status < 0) { 354 if (status < 0) {
233 mlog_errno(status); 355 mlog_errno(status);
234 goto bail; 356 goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
358 480
359 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 481 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
360 482
361 status = ocfs2_journal_access(handle, alloc_inode, 483 status = ocfs2_journal_access_di(handle, alloc_inode,
362 bh, OCFS2_JOURNAL_ACCESS_WRITE); 484 bh, OCFS2_JOURNAL_ACCESS_WRITE);
363 if (status < 0) { 485 if (status < 0) {
364 mlog_errno(status); 486 mlog_errno(status);
365 goto bail; 487 goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
441 ac->ac_alloc_slot = slot; 563 ac->ac_alloc_slot = slot;
442 564
443 fe = (struct ocfs2_dinode *) bh->b_data; 565 fe = (struct ocfs2_dinode *) bh->b_data;
444 if (!OCFS2_IS_VALID_DINODE(fe)) { 566
445 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 567 /* The bh was validated by the inode read inside
446 status = -EIO; 568 * ocfs2_inode_lock(). Any corruption is a code bug. */
447 goto bail; 569 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
448 } 570
449 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { 571 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
450 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", 572 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
451 (unsigned long long)le64_to_cpu(fe->i_blkno)); 573 (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
790 int offset, start, found, status = 0; 912 int offset, start, found, status = 0;
791 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 913 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
792 914
793 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 915 /* Callers got this descriptor from
794 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); 916 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
795 return -EIO; 917 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
796 }
797 918
798 found = start = best_offset = best_size = 0; 919 found = start = best_offset = best_size = 0;
799 bitmap = bg->bg_bitmap; 920 bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
858 979
859 mlog_entry_void(); 980 mlog_entry_void();
860 981
861 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 982 /* All callers get the descriptor via
862 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 983 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
863 status = -EIO; 984 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
864 goto bail;
865 }
866 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 985 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
867 986
868 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 987 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
871 if (ocfs2_is_cluster_bitmap(alloc_inode)) 990 if (ocfs2_is_cluster_bitmap(alloc_inode))
872 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 991 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
873 992
874 status = ocfs2_journal_access(handle, 993 status = ocfs2_journal_access_gd(handle,
875 alloc_inode, 994 alloc_inode,
876 group_bh, 995 group_bh,
877 journal_type); 996 journal_type);
878 if (status < 0) { 997 if (status < 0) {
879 mlog_errno(status); 998 mlog_errno(status);
880 goto bail; 999 goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
931 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1050 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
932 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 1051 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
933 1052
934 if (!OCFS2_IS_VALID_DINODE(fe)) { 1053 /* The caller got these descriptors from
935 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 1054 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
936 status = -EIO; 1055 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
937 goto out; 1056 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
938 }
939 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
940 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
941 status = -EIO;
942 goto out;
943 }
944 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
945 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
946 status = -EIO;
947 goto out;
948 }
949 1057
950 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 1058 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
951 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1059 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
956 bg_ptr = le64_to_cpu(bg->bg_next_group); 1064 bg_ptr = le64_to_cpu(bg->bg_next_group);
957 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1065 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
958 1066
959 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, 1067 status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
960 OCFS2_JOURNAL_ACCESS_WRITE); 1068 OCFS2_JOURNAL_ACCESS_WRITE);
961 if (status < 0) { 1069 if (status < 0) {
962 mlog_errno(status); 1070 mlog_errno(status);
963 goto out_rollback; 1071 goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
971 goto out_rollback; 1079 goto out_rollback;
972 } 1080 }
973 1081
974 status = ocfs2_journal_access(handle, alloc_inode, bg_bh, 1082 status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
975 OCFS2_JOURNAL_ACCESS_WRITE); 1083 OCFS2_JOURNAL_ACCESS_WRITE);
976 if (status < 0) { 1084 if (status < 0) {
977 mlog_errno(status); 1085 mlog_errno(status);
978 goto out_rollback; 1086 goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
986 goto out_rollback; 1094 goto out_rollback;
987 } 1095 }
988 1096
989 status = ocfs2_journal_access(handle, alloc_inode, fe_bh, 1097 status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
990 OCFS2_JOURNAL_ACCESS_WRITE); 1098 OCFS2_JOURNAL_ACCESS_WRITE);
991 if (status < 0) { 1099 if (status < 0) {
992 mlog_errno(status); 1100 mlog_errno(status);
993 goto out_rollback; 1101 goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
1008 bg->bg_next_group = cpu_to_le64(bg_ptr); 1116 bg->bg_next_group = cpu_to_le64(bg_ptr);
1009 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1117 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1010 } 1118 }
1011out: 1119
1012 mlog_exit(status); 1120 mlog_exit(status);
1013 return status; 1121 return status;
1014} 1122}
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1138 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1246 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1139 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1247 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1140 1248
1141 ret = ocfs2_journal_access(handle, inode, di_bh, 1249 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1142 OCFS2_JOURNAL_ACCESS_WRITE); 1250 OCFS2_JOURNAL_ACCESS_WRITE);
1143 if (ret < 0) { 1251 if (ret < 0) {
1144 mlog_errno(ret); 1252 mlog_errno(ret);
1145 goto out; 1253 goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1170 u16 found; 1278 u16 found;
1171 struct buffer_head *group_bh = NULL; 1279 struct buffer_head *group_bh = NULL;
1172 struct ocfs2_group_desc *gd; 1280 struct ocfs2_group_desc *gd;
1281 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1173 struct inode *alloc_inode = ac->ac_inode; 1282 struct inode *alloc_inode = ac->ac_inode;
1174 1283
1175 ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); 1284 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1285 &group_bh);
1176 if (ret < 0) { 1286 if (ret < 0) {
1177 mlog_errno(ret); 1287 mlog_errno(ret);
1178 return ret; 1288 return ret;
1179 } 1289 }
1180 1290
1181 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1291 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1182 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1183 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1184 ret = -EIO;
1185 goto out;
1186 }
1187
1188 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1292 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1189 ac->ac_max_block, bit_off, &found); 1293 ac->ac_max_block, bit_off, &found);
1190 if (ret < 0) { 1294 if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1241 bits_wanted, chain, 1345 bits_wanted, chain,
1242 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1346 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1243 1347
1244 status = ocfs2_read_block(alloc_inode, 1348 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1245 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1349 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1246 &group_bh); 1350 &group_bh);
1247 if (status < 0) { 1351 if (status < 0) {
1248 mlog_errno(status); 1352 mlog_errno(status);
1249 goto bail; 1353 goto bail;
1250 } 1354 }
1251 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1355 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1252 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1253 if (status) {
1254 mlog_errno(status);
1255 goto bail;
1256 }
1257 1356
1258 status = -ENOSPC; 1357 status = -ENOSPC;
1259 /* for now, the chain search is a bit simplistic. We just use 1358 /* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1271 next_group = le64_to_cpu(bg->bg_next_group); 1370 next_group = le64_to_cpu(bg->bg_next_group);
1272 prev_group_bh = group_bh; 1371 prev_group_bh = group_bh;
1273 group_bh = NULL; 1372 group_bh = NULL;
1274 status = ocfs2_read_block(alloc_inode, 1373 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1275 next_group, &group_bh); 1374 next_group, &group_bh);
1276 if (status < 0) { 1375 if (status < 0) {
1277 mlog_errno(status); 1376 mlog_errno(status);
1278 goto bail; 1377 goto bail;
1279 } 1378 }
1280 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1379 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1281 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1282 if (status) {
1283 mlog_errno(status);
1284 goto bail;
1285 }
1286 } 1380 }
1287 if (status < 0) { 1381 if (status < 0) {
1288 if (status != -ENOSPC) 1382 if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1324 1418
1325 /* Ok, claim our bits now: set the info on dinode, chainlist 1419 /* Ok, claim our bits now: set the info on dinode, chainlist
1326 * and then the group */ 1420 * and then the group */
1327 status = ocfs2_journal_access(handle, 1421 status = ocfs2_journal_access_di(handle,
1328 alloc_inode, 1422 alloc_inode,
1329 ac->ac_bh, 1423 ac->ac_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1424 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (status < 0) { 1425 if (status < 0) {
1332 mlog_errno(status); 1426 mlog_errno(status);
1333 goto bail; 1427 goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1392 BUG_ON(!ac->ac_bh); 1486 BUG_ON(!ac->ac_bh);
1393 1487
1394 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1488 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1395 if (!OCFS2_IS_VALID_DINODE(fe)) { 1489
1396 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); 1490 /* The bh was validated by the inode read during
1397 status = -EIO; 1491 * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
1398 goto bail; 1492 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1399 } 1493
1400 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1494 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1401 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1495 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1402 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1496 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1725 1819
1726 mlog_entry_void(); 1820 mlog_entry_void();
1727 1821
1728 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 1822 /* The caller got this descriptor from
1729 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 1823 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1730 status = -EIO; 1824 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1731 goto bail;
1732 }
1733 1825
1734 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1826 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1735 1827
1736 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1828 if (ocfs2_is_cluster_bitmap(alloc_inode))
1737 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1829 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1738 1830
1739 status = ocfs2_journal_access(handle, alloc_inode, group_bh, 1831 status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
1740 journal_type); 1832 journal_type);
1741 if (status < 0) { 1833 if (status < 0) {
1742 mlog_errno(status); 1834 mlog_errno(status);
1743 goto bail; 1835 goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1782 1874
1783 mlog_entry_void(); 1875 mlog_entry_void();
1784 1876
1785 if (!OCFS2_IS_VALID_DINODE(fe)) { 1877 /* The alloc_bh comes from ocfs2_free_dinode() or
1786 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 1878 * ocfs2_free_clusters(). The callers have all locked the
1787 status = -EIO; 1879 * allocator and gotten alloc_bh from the lock call. This
1788 goto bail; 1880 * validates the dinode buffer. Any corruption that has happended
1789 } 1881 * is a code bug. */
1882 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1790 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 1883 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1791 1884
1792 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", 1885 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1793 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 1886 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1794 (unsigned long long)bg_blkno, start_bit); 1887 (unsigned long long)bg_blkno, start_bit);
1795 1888
1796 status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); 1889 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
1890 &group_bh);
1797 if (status < 0) { 1891 if (status < 0) {
1798 mlog_errno(status); 1892 mlog_errno(status);
1799 goto bail; 1893 goto bail;
1800 } 1894 }
1801
1802 group = (struct ocfs2_group_desc *) group_bh->b_data; 1895 group = (struct ocfs2_group_desc *) group_bh->b_data;
1803 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); 1896
1804 if (status) {
1805 mlog_errno(status);
1806 goto bail;
1807 }
1808 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 1897 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1809 1898
1810 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 1899 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1815 goto bail; 1904 goto bail;
1816 } 1905 }
1817 1906
1818 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, 1907 status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
1819 OCFS2_JOURNAL_ACCESS_WRITE); 1908 OCFS2_JOURNAL_ACCESS_WRITE);
1820 if (status < 0) { 1909 if (status < 0) {
1821 mlog_errno(status); 1910 mlog_errno(status);
1822 goto bail; 1911 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
164 * and return that block offset. */ 164 * and return that block offset. */
165u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); 165u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
166 166
167/* somewhat more expensive than our other checks, so use sparingly. */ 167/*
168 * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
169 * finds a problem. A caller that wants to check a group descriptor
170 * without going readonly should read the block with ocfs2_read_block[s]()
171 * and then checking it with this function. This is only resize, really.
172 * Everyone else should be using ocfs2_read_group_descriptor().
173 */
168int ocfs2_check_group_descriptor(struct super_block *sb, 174int ocfs2_check_group_descriptor(struct super_block *sb,
169 struct ocfs2_dinode *di, 175 struct ocfs2_dinode *di,
170 struct ocfs2_group_desc *gd); 176 struct buffer_head *bh);
177/*
178 * Read a group descriptor block into *bh. If *bh is NULL, a bh will be
179 * allocated. This is a cached read. The descriptor will be validated with
180 * ocfs2_validate_group_descriptor().
181 */
182int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
183 u64 gd_blkno, struct buffer_head **bh);
184
171int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, 185int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
172 u32 clusters_to_add, u32 extents_to_split, 186 u32 clusters_to_add, u32 extents_to_split,
173 struct ocfs2_alloc_context **data_ac, 187 struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..43ed11345b59 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/quotaops.h>
44 45
45#define MLOG_MASK_PREFIX ML_SUPER 46#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
51#include "ocfs1_fs_compat.h" 52#include "ocfs1_fs_compat.h"
52 53
53#include "alloc.h" 54#include "alloc.h"
55#include "blockcheck.h"
54#include "dlmglue.h" 56#include "dlmglue.h"
55#include "export.h" 57#include "export.h"
56#include "extent_map.h" 58#include "extent_map.h"
@@ -65,10 +67,13 @@
65#include "uptodate.h" 67#include "uptodate.h"
66#include "ver.h" 68#include "ver.h"
67#include "xattr.h" 69#include "xattr.h"
70#include "quota.h"
68 71
69#include "buffer_head_io.h" 72#include "buffer_head_io.h"
70 73
71static struct kmem_cache *ocfs2_inode_cachep = NULL; 74static struct kmem_cache *ocfs2_inode_cachep = NULL;
75struct kmem_cache *ocfs2_dquot_cachep;
76struct kmem_cache *ocfs2_qf_chunk_cachep;
72 77
73/* OCFS2 needs to schedule several differnt types of work which 78/* OCFS2 needs to schedule several differnt types of work which
74 * require cluster locking, disk I/O, recovery waits, etc. Since these 79 * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
124static void ocfs2_write_super(struct super_block *sb); 129static void ocfs2_write_super(struct super_block *sb);
125static struct inode *ocfs2_alloc_inode(struct super_block *sb); 130static struct inode *ocfs2_alloc_inode(struct super_block *sb);
126static void ocfs2_destroy_inode(struct inode *inode); 131static void ocfs2_destroy_inode(struct inode *inode);
132static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
133static int ocfs2_enable_quotas(struct ocfs2_super *osb);
134static void ocfs2_disable_quotas(struct ocfs2_super *osb);
127 135
128static const struct super_operations ocfs2_sops = { 136static const struct super_operations ocfs2_sops = {
129 .statfs = ocfs2_statfs, 137 .statfs = ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
137 .put_super = ocfs2_put_super, 145 .put_super = ocfs2_put_super,
138 .remount_fs = ocfs2_remount, 146 .remount_fs = ocfs2_remount,
139 .show_options = ocfs2_show_options, 147 .show_options = ocfs2_show_options,
148 .quota_read = ocfs2_quota_read,
149 .quota_write = ocfs2_quota_write,
140}; 150};
141 151
142enum { 152enum {
@@ -158,6 +168,10 @@ enum {
158 Opt_user_xattr, 168 Opt_user_xattr,
159 Opt_nouser_xattr, 169 Opt_nouser_xattr,
160 Opt_inode64, 170 Opt_inode64,
171 Opt_acl,
172 Opt_noacl,
173 Opt_usrquota,
174 Opt_grpquota,
161 Opt_err, 175 Opt_err,
162}; 176};
163 177
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
180 {Opt_user_xattr, "user_xattr"}, 194 {Opt_user_xattr, "user_xattr"},
181 {Opt_nouser_xattr, "nouser_xattr"}, 195 {Opt_nouser_xattr, "nouser_xattr"},
182 {Opt_inode64, "inode64"}, 196 {Opt_inode64, "inode64"},
197 {Opt_acl, "acl"},
198 {Opt_noacl, "noacl"},
199 {Opt_usrquota, "usrquota"},
200 {Opt_grpquota, "grpquota"},
183 {Opt_err, NULL} 201 {Opt_err, NULL}
184}; 202};
185 203
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
221 return 0; 239 return 0;
222} 240}
223 241
242static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
243{
244 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
245 && (ino == USER_QUOTA_SYSTEM_INODE
246 || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
247 return 0;
248 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
249 && (ino == GROUP_QUOTA_SYSTEM_INODE
250 || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
251 return 0;
252 return 1;
253}
254
224static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) 255static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
225{ 256{
226 struct inode *new = NULL; 257 struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
247 278
248 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; 279 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
249 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { 280 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
281 if (!ocfs2_need_system_inode(osb, i))
282 continue;
250 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 283 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
251 if (!new) { 284 if (!new) {
252 ocfs2_release_system_inodes(osb); 285 ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
277 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; 310 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
278 i < NUM_SYSTEM_INODES; 311 i < NUM_SYSTEM_INODES;
279 i++) { 312 i++) {
313 if (!ocfs2_need_system_inode(osb, i))
314 continue;
280 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 315 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
281 if (!new) { 316 if (!new) {
282 ocfs2_release_system_inodes(osb); 317 ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
426 461
427 /* We're going to/from readonly mode. */ 462 /* We're going to/from readonly mode. */
428 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 463 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
464 /* Disable quota accounting before remounting RO */
465 if (*flags & MS_RDONLY) {
466 ret = ocfs2_susp_quotas(osb, 0);
467 if (ret < 0)
468 goto out;
469 }
429 /* Lock here so the check of HARD_RO and the potential 470 /* Lock here so the check of HARD_RO and the potential
430 * setting of SOFT_RO is atomic. */ 471 * setting of SOFT_RO is atomic. */
431 spin_lock(&osb->osb_lock); 472 spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
461 } 502 }
462unlock_osb: 503unlock_osb:
463 spin_unlock(&osb->osb_lock); 504 spin_unlock(&osb->osb_lock);
505 /* Enable quota accounting after remounting RW */
506 if (!ret && !(*flags & MS_RDONLY)) {
507 if (sb_any_quota_suspended(sb))
508 ret = ocfs2_susp_quotas(osb, 1);
509 else
510 ret = ocfs2_enable_quotas(osb);
511 if (ret < 0) {
512 /* Return back changes... */
513 spin_lock(&osb->osb_lock);
514 sb->s_flags |= MS_RDONLY;
515 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
516 spin_unlock(&osb->osb_lock);
517 goto out;
518 }
519 }
464 } 520 }
465 521
466 if (!ret) { 522 if (!ret) {
467 /* Only save off the new mount options in case of a successful 523 /* Only save off the new mount options in case of a successful
468 * remount. */ 524 * remount. */
525 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
526 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
469 osb->s_mount_opt = parsed_options.mount_opt; 527 osb->s_mount_opt = parsed_options.mount_opt;
470 osb->s_atime_quantum = parsed_options.atime_quantum; 528 osb->s_atime_quantum = parsed_options.atime_quantum;
471 osb->preferred_slot = parsed_options.slot; 529 osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
619 return 0; 677 return 0;
620} 678}
621 679
680static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
681{
682 int type;
683 struct super_block *sb = osb->sb;
684 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
686 int status = 0;
687
688 for (type = 0; type < MAXQUOTAS; type++) {
689 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
690 continue;
691 if (unsuspend)
692 status = vfs_quota_enable(
693 sb_dqopt(sb)->files[type],
694 type, QFMT_OCFS2,
695 DQUOT_SUSPENDED);
696 else
697 status = vfs_quota_disable(sb, type,
698 DQUOT_SUSPENDED);
699 if (status < 0)
700 break;
701 }
702 if (status < 0)
703 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
704 "remount (error = %d).\n", status);
705 return status;
706}
707
708static int ocfs2_enable_quotas(struct ocfs2_super *osb)
709{
710 struct inode *inode[MAXQUOTAS] = { NULL, NULL };
711 struct super_block *sb = osb->sb;
712 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
713 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
714 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
715 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
716 int status;
717 int type;
718
719 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
720 for (type = 0; type < MAXQUOTAS; type++) {
721 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
722 continue;
723 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
724 osb->slot_num);
725 if (!inode[type]) {
726 status = -ENOENT;
727 goto out_quota_off;
728 }
729 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
730 DQUOT_USAGE_ENABLED);
731 if (status < 0)
732 goto out_quota_off;
733 }
734
735 for (type = 0; type < MAXQUOTAS; type++)
736 iput(inode[type]);
737 return 0;
738out_quota_off:
739 ocfs2_disable_quotas(osb);
740 for (type = 0; type < MAXQUOTAS; type++)
741 iput(inode[type]);
742 mlog_errno(status);
743 return status;
744}
745
746static void ocfs2_disable_quotas(struct ocfs2_super *osb)
747{
748 int type;
749 struct inode *inode;
750 struct super_block *sb = osb->sb;
751
752 /* We mostly ignore errors in this function because there's not much
753 * we can do when we see them */
754 for (type = 0; type < MAXQUOTAS; type++) {
755 if (!sb_has_quota_loaded(sb, type))
756 continue;
757 inode = igrab(sb->s_dquot.files[type]);
758 /* Turn off quotas. This will remove all dquot structures from
759 * memory and so they will be automatically synced to global
760 * quota files */
761 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
762 DQUOT_LIMITS_ENABLED);
763 if (!inode)
764 continue;
765 iput(inode);
766 }
767}
768
769/* Handle quota on quotactl */
770static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
771 char *path, int remount)
772{
773 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
774 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
775
776 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
777 return -EINVAL;
778
779 if (remount)
780 return 0; /* Just ignore it has been handled in
781 * ocfs2_remount() */
782 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
783 format_id, DQUOT_LIMITS_ENABLED);
784}
785
786/* Handle quota off quotactl */
787static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
788{
789 if (remount)
790 return 0; /* Ignore now and handle later in
791 * ocfs2_remount() */
792 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
793}
794
795static struct quotactl_ops ocfs2_quotactl_ops = {
796 .quota_on = ocfs2_quota_on,
797 .quota_off = ocfs2_quota_off,
798 .quota_sync = vfs_quota_sync,
799 .get_info = vfs_get_dqinfo,
800 .set_info = vfs_set_dqinfo,
801 .get_dqblk = vfs_get_dqblk,
802 .set_dqblk = vfs_set_dqblk,
803};
804
622static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 805static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
623{ 806{
624 struct dentry *root; 807 struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
651 } 834 }
652 brelse(bh); 835 brelse(bh);
653 bh = NULL; 836 bh = NULL;
837
838 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
839 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
840
654 osb->s_mount_opt = parsed_options.mount_opt; 841 osb->s_mount_opt = parsed_options.mount_opt;
655 osb->s_atime_quantum = parsed_options.atime_quantum; 842 osb->s_atime_quantum = parsed_options.atime_quantum;
656 osb->preferred_slot = parsed_options.slot; 843 osb->preferred_slot = parsed_options.slot;
657 osb->osb_commit_interval = parsed_options.commit_interval; 844 osb->osb_commit_interval = parsed_options.commit_interval;
658 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 845 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
659 osb->local_alloc_bits = osb->local_alloc_default_bits; 846 osb->local_alloc_bits = osb->local_alloc_default_bits;
847 if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
848 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
849 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
850 status = -EINVAL;
851 mlog(ML_ERROR, "User quotas were requested, but this "
852 "filesystem does not have the feature enabled.\n");
853 goto read_super_error;
854 }
855 if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
856 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
857 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
858 status = -EINVAL;
859 mlog(ML_ERROR, "Group quotas were requested, but this "
860 "filesystem does not have the feature enabled.\n");
861 goto read_super_error;
862 }
660 863
661 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 864 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
662 if (status) 865 if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
664 867
665 sb->s_magic = OCFS2_SUPER_MAGIC; 868 sb->s_magic = OCFS2_SUPER_MAGIC;
666 869
870 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
871 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
872
667 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 873 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
668 * heartbeat=none */ 874 * heartbeat=none */
669 if (bdev_read_only(sb->s_bdev)) { 875 if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
758 atomic_set(&osb->vol_state, VOLUME_MOUNTED); 964 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
759 wake_up(&osb->osb_mount_event); 965 wake_up(&osb->osb_mount_event);
760 966
967 /* Now we can initialize quotas because we can afford to wait
968 * for cluster locks recovery now. That also means that truncation
969 * log recovery can happen but that waits for proper quota setup */
970 if (!(sb->s_flags & MS_RDONLY)) {
971 status = ocfs2_enable_quotas(osb);
972 if (status < 0) {
973 /* We have to err-out specially here because
974 * s_root is already set */
975 mlog_errno(status);
976 atomic_set(&osb->vol_state, VOLUME_DISABLED);
977 wake_up(&osb->osb_mount_event);
978 mlog_exit(status);
979 return status;
980 }
981 }
982
983 ocfs2_complete_quota_recovery(osb);
984
985 /* Now we wake up again for processes waiting for quotas */
986 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
987 wake_up(&osb->osb_mount_event);
988
761 mlog_exit(status); 989 mlog_exit(status);
762 return status; 990 return status;
763 991
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
945 case Opt_inode64: 1173 case Opt_inode64:
946 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1174 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
947 break; 1175 break;
1176 case Opt_usrquota:
1177 /* We check only on remount, otherwise features
1178 * aren't yet initialized. */
1179 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1180 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1181 mlog(ML_ERROR, "User quota requested but "
1182 "filesystem feature is not set\n");
1183 status = 0;
1184 goto bail;
1185 }
1186 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1187 break;
1188 case Opt_grpquota:
1189 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1190 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1191 mlog(ML_ERROR, "Group quota requested but "
1192 "filesystem feature is not set\n");
1193 status = 0;
1194 goto bail;
1195 }
1196 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1197 break;
1198#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1199 case Opt_acl:
1200 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1201 break;
1202 case Opt_noacl:
1203 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1204 break;
1205#else
1206 case Opt_acl:
1207 case Opt_noacl:
1208 printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
1209 break;
1210#endif
948 default: 1211 default:
949 mlog(ML_ERROR, 1212 mlog(ML_ERROR,
950 "Unrecognized mount option \"%s\" " 1213 "Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1008 if (osb->osb_cluster_stack[0]) 1271 if (osb->osb_cluster_stack[0])
1009 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1272 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
1010 osb->osb_cluster_stack); 1273 osb->osb_cluster_stack);
1274 if (opts & OCFS2_MOUNT_USRQUOTA)
1275 seq_printf(s, ",usrquota");
1276 if (opts & OCFS2_MOUNT_GRPQUOTA)
1277 seq_printf(s, ",grpquota");
1011 1278
1012 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1279 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1013 seq_printf(s, ",nouser_xattr"); 1280 seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1017 if (opts & OCFS2_MOUNT_INODE64) 1284 if (opts & OCFS2_MOUNT_INODE64)
1018 seq_printf(s, ",inode64"); 1285 seq_printf(s, ",inode64");
1019 1286
1287#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1288 if (opts & OCFS2_MOUNT_POSIX_ACL)
1289 seq_printf(s, ",acl");
1290 else
1291 seq_printf(s, ",noacl");
1292#endif
1293
1020 return 0; 1294 return 0;
1021} 1295}
1022 1296
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
1052 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1326 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1053 } 1327 }
1054 1328
1329 status = ocfs2_quota_setup();
1330 if (status)
1331 goto leave;
1332
1055 ocfs2_set_locking_protocol(); 1333 ocfs2_set_locking_protocol();
1056 1334
1335 status = register_quota_format(&ocfs2_quota_format);
1057leave: 1336leave:
1058 if (status < 0) { 1337 if (status < 0) {
1338 ocfs2_quota_shutdown();
1059 ocfs2_free_mem_caches(); 1339 ocfs2_free_mem_caches();
1060 exit_ocfs2_uptodate_cache(); 1340 exit_ocfs2_uptodate_cache();
1061 } 1341 }
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
1072{ 1352{
1073 mlog_entry_void(); 1353 mlog_entry_void();
1074 1354
1355 ocfs2_quota_shutdown();
1356
1075 if (ocfs2_wq) { 1357 if (ocfs2_wq) {
1076 flush_workqueue(ocfs2_wq); 1358 flush_workqueue(ocfs2_wq);
1077 destroy_workqueue(ocfs2_wq); 1359 destroy_workqueue(ocfs2_wq);
1078 } 1360 }
1079 1361
1362 unregister_quota_format(&ocfs2_quota_format);
1363
1080 debugfs_remove(ocfs2_debugfs_root); 1364 debugfs_remove(ocfs2_debugfs_root);
1081 1365
1082 ocfs2_free_mem_caches(); 1366 ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
1192 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1476 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1193 SLAB_MEM_SPREAD), 1477 SLAB_MEM_SPREAD),
1194 ocfs2_inode_init_once); 1478 ocfs2_inode_init_once);
1195 if (!ocfs2_inode_cachep) 1479 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
1480 sizeof(struct ocfs2_dquot),
1481 0,
1482 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1483 SLAB_MEM_SPREAD),
1484 NULL);
1485 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
1486 sizeof(struct ocfs2_quota_chunk),
1487 0,
1488 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
1489 NULL);
1490 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
1491 !ocfs2_qf_chunk_cachep) {
1492 if (ocfs2_inode_cachep)
1493 kmem_cache_destroy(ocfs2_inode_cachep);
1494 if (ocfs2_dquot_cachep)
1495 kmem_cache_destroy(ocfs2_dquot_cachep);
1496 if (ocfs2_qf_chunk_cachep)
1497 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1196 return -ENOMEM; 1498 return -ENOMEM;
1499 }
1197 1500
1198 return 0; 1501 return 0;
1199} 1502}
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
1202{ 1505{
1203 if (ocfs2_inode_cachep) 1506 if (ocfs2_inode_cachep)
1204 kmem_cache_destroy(ocfs2_inode_cachep); 1507 kmem_cache_destroy(ocfs2_inode_cachep);
1205
1206 ocfs2_inode_cachep = NULL; 1508 ocfs2_inode_cachep = NULL;
1509
1510 if (ocfs2_dquot_cachep)
1511 kmem_cache_destroy(ocfs2_dquot_cachep);
1512 ocfs2_dquot_cachep = NULL;
1513
1514 if (ocfs2_qf_chunk_cachep)
1515 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1516 ocfs2_qf_chunk_cachep = NULL;
1207} 1517}
1208 1518
1209static int ocfs2_get_sector(struct super_block *sb, 1519static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1303 osb = OCFS2_SB(sb); 1613 osb = OCFS2_SB(sb);
1304 BUG_ON(!osb); 1614 BUG_ON(!osb);
1305 1615
1616 ocfs2_disable_quotas(osb);
1617
1306 ocfs2_shutdown_local_alloc(osb); 1618 ocfs2_shutdown_local_alloc(osb);
1307 1619
1308 ocfs2_truncate_log_shutdown(osb); 1620 ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
1413 sb->s_fs_info = osb; 1725 sb->s_fs_info = osb;
1414 sb->s_op = &ocfs2_sops; 1726 sb->s_op = &ocfs2_sops;
1415 sb->s_export_op = &ocfs2_export_ops; 1727 sb->s_export_op = &ocfs2_export_ops;
1728 sb->s_qcop = &ocfs2_quotactl_ops;
1729 sb->dq_op = &ocfs2_quota_operations;
1416 sb->s_xattr = ocfs2_xattr_handlers; 1730 sb->s_xattr = ocfs2_xattr_handlers;
1417 sb->s_time_gran = 1; 1731 sb->s_time_gran = 1;
1418 sb->s_flags |= MS_NOATIME; 1732 sb->s_flags |= MS_NOATIME;
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1676 1990
1677 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 1991 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1678 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 1992 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1993 /* We have to do a raw check of the feature here */
1994 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
1995 OCFS2_FEATURE_INCOMPAT_META_ECC) {
1996 status = ocfs2_block_check_validate(bh->b_data,
1997 bh->b_size,
1998 &di->i_check);
1999 if (status)
2000 goto out;
2001 }
1679 status = -EINVAL; 2002 status = -EINVAL;
1680 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { 2003 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1681 mlog(ML_ERROR, "found superblock with incorrect block " 2004 mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1717 } 2040 }
1718 } 2041 }
1719 2042
2043out:
1720 mlog_exit(status); 2044 mlog_exit(status);
1721 return status; 2045 return status;
1722} 2046}
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
84 84
85 mlog_entry_void(); 85 mlog_entry_void();
86 86
87 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); 87 status = ocfs2_read_inode_block(inode, bh);
88 if (status < 0) { 88 if (status < 0) {
89 mlog_errno(status); 89 mlog_errno(status);
90 link = ERR_PTR(status); 90 link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..e1d638af6ac3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/security.h>
38 39
39#define MLOG_MASK_PREFIX ML_XATTR 40#define MLOG_MASK_PREFIX ML_XATTR
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
41 42
42#include "ocfs2.h" 43#include "ocfs2.h"
43#include "alloc.h" 44#include "alloc.h"
45#include "blockcheck.h"
44#include "dlmglue.h" 46#include "dlmglue.h"
45#include "file.h" 47#include "file.h"
46#include "symlink.h" 48#include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
61}; 63};
62 64
63struct ocfs2_xattr_bucket { 65struct ocfs2_xattr_bucket {
64 struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; 66 /* The inode these xattrs are associated with */
65 struct ocfs2_xattr_header *xh; 67 struct inode *bu_inode;
68
69 /* The actual buffers that make up the bucket */
70 struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
71
72 /* How many blocks make up one bucket for this filesystem */
73 int bu_blocks;
74};
75
76struct ocfs2_xattr_set_ctxt {
77 handle_t *handle;
78 struct ocfs2_alloc_context *meta_ac;
79 struct ocfs2_alloc_context *data_ac;
80 struct ocfs2_cached_dealloc_ctxt dealloc;
66}; 81};
67 82
68#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
69#define OCFS2_XATTR_INLINE_SIZE 80 84#define OCFS2_XATTR_INLINE_SIZE 80
85#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
86 - sizeof(struct ocfs2_xattr_header) \
87 - sizeof(__u32))
88#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
89 - sizeof(struct ocfs2_xattr_block) \
90 - sizeof(struct ocfs2_xattr_header) \
91 - sizeof(__u32))
70 92
71static struct ocfs2_xattr_def_value_root def_xv = { 93static struct ocfs2_xattr_def_value_root def_xv = {
72 .xv.xr_list.l_count = cpu_to_le16(1), 94 .xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
74 96
75struct xattr_handler *ocfs2_xattr_handlers[] = { 97struct xattr_handler *ocfs2_xattr_handlers[] = {
76 &ocfs2_xattr_user_handler, 98 &ocfs2_xattr_user_handler,
99#ifdef CONFIG_OCFS2_FS_POSIX_ACL
100 &ocfs2_xattr_acl_access_handler,
101 &ocfs2_xattr_acl_default_handler,
102#endif
77 &ocfs2_xattr_trusted_handler, 103 &ocfs2_xattr_trusted_handler,
104 &ocfs2_xattr_security_handler,
78 NULL 105 NULL
79}; 106};
80 107
81static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
82 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110#ifdef CONFIG_OCFS2_FS_POSIX_ACL
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler,
113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
114 = &ocfs2_xattr_acl_default_handler,
115#endif
83 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 116 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
117 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
84}; 118};
85 119
86struct ocfs2_xattr_info { 120struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
98 */ 132 */
99 struct buffer_head *xattr_bh; 133 struct buffer_head *xattr_bh;
100 struct ocfs2_xattr_header *header; 134 struct ocfs2_xattr_header *header;
101 struct ocfs2_xattr_bucket bucket; 135 struct ocfs2_xattr_bucket *bucket;
102 void *base; 136 void *base;
103 void *end; 137 void *end;
104 struct ocfs2_xattr_entry *here; 138 struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
127 size_t buffer_size); 161 size_t buffer_size);
128 162
129static int ocfs2_xattr_create_index_block(struct inode *inode, 163static int ocfs2_xattr_create_index_block(struct inode *inode,
130 struct ocfs2_xattr_search *xs); 164 struct ocfs2_xattr_search *xs,
165 struct ocfs2_xattr_set_ctxt *ctxt);
131 166
132static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 167static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
133 struct ocfs2_xattr_info *xi, 168 struct ocfs2_xattr_info *xi,
134 struct ocfs2_xattr_search *xs); 169 struct ocfs2_xattr_search *xs,
170 struct ocfs2_xattr_set_ctxt *ctxt);
135 171
136static int ocfs2_delete_xattr_index_block(struct inode *inode, 172static int ocfs2_delete_xattr_index_block(struct inode *inode,
137 struct buffer_head *xb_bh); 173 struct buffer_head *xb_bh);
174static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
175 u64 src_blk, u64 last_blk, u64 to_blk,
176 unsigned int start_bucket,
177 u32 *first_hash);
138 178
139static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 179static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
140{ 180{
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
154 return len / sizeof(struct ocfs2_xattr_entry); 194 return len / sizeof(struct ocfs2_xattr_entry);
155} 195}
156 196
197#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
198#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
199#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
200
201static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
202{
203 struct ocfs2_xattr_bucket *bucket;
204 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
205
206 BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
207
208 bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
209 if (bucket) {
210 bucket->bu_inode = inode;
211 bucket->bu_blocks = blks;
212 }
213
214 return bucket;
215}
216
217static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
218{
219 int i;
220
221 for (i = 0; i < bucket->bu_blocks; i++) {
222 brelse(bucket->bu_bhs[i]);
223 bucket->bu_bhs[i] = NULL;
224 }
225}
226
227static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
228{
229 if (bucket) {
230 ocfs2_xattr_bucket_relse(bucket);
231 bucket->bu_inode = NULL;
232 kfree(bucket);
233 }
234}
235
236/*
237 * A bucket that has never been written to disk doesn't need to be
238 * read. We just need the buffer_heads. Don't call this for
239 * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes
240 * them fully.
241 */
242static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
243 u64 xb_blkno)
244{
245 int i, rc = 0;
246
247 for (i = 0; i < bucket->bu_blocks; i++) {
248 bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
249 xb_blkno + i);
250 if (!bucket->bu_bhs[i]) {
251 rc = -EIO;
252 mlog_errno(rc);
253 break;
254 }
255
256 if (!ocfs2_buffer_uptodate(bucket->bu_inode,
257 bucket->bu_bhs[i]))
258 ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
259 bucket->bu_bhs[i]);
260 }
261
262 if (rc)
263 ocfs2_xattr_bucket_relse(bucket);
264 return rc;
265}
266
267/* Read the xattr bucket at xb_blkno */
268static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
269 u64 xb_blkno)
270{
271 int rc;
272
273 rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
274 bucket->bu_blocks, bucket->bu_bhs, 0,
275 NULL);
276 if (!rc) {
277 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
278 bucket->bu_bhs,
279 bucket->bu_blocks,
280 &bucket_xh(bucket)->xh_check);
281 if (rc)
282 mlog_errno(rc);
283 }
284
285 if (rc)
286 ocfs2_xattr_bucket_relse(bucket);
287 return rc;
288}
289
290static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
291 struct ocfs2_xattr_bucket *bucket,
292 int type)
293{
294 int i, rc = 0;
295
296 for (i = 0; i < bucket->bu_blocks; i++) {
297 rc = ocfs2_journal_access(handle, bucket->bu_inode,
298 bucket->bu_bhs[i], type);
299 if (rc) {
300 mlog_errno(rc);
301 break;
302 }
303 }
304
305 return rc;
306}
307
308static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
309 struct ocfs2_xattr_bucket *bucket)
310{
311 int i;
312
313 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
314 bucket->bu_bhs, bucket->bu_blocks,
315 &bucket_xh(bucket)->xh_check);
316
317 for (i = 0; i < bucket->bu_blocks; i++)
318 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
319}
320
321static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
322 struct ocfs2_xattr_bucket *src)
323{
324 int i;
325 int blocksize = src->bu_inode->i_sb->s_blocksize;
326
327 BUG_ON(dest->bu_blocks != src->bu_blocks);
328 BUG_ON(dest->bu_inode != src->bu_inode);
329
330 for (i = 0; i < src->bu_blocks; i++) {
331 memcpy(bucket_block(dest, i), bucket_block(src, i),
332 blocksize);
333 }
334}
335
336static int ocfs2_validate_xattr_block(struct super_block *sb,
337 struct buffer_head *bh)
338{
339 int rc;
340 struct ocfs2_xattr_block *xb =
341 (struct ocfs2_xattr_block *)bh->b_data;
342
343 mlog(0, "Validating xattr block %llu\n",
344 (unsigned long long)bh->b_blocknr);
345
346 BUG_ON(!buffer_uptodate(bh));
347
348 /*
349 * If the ecc fails, we return the error but otherwise
350 * leave the filesystem running. We know any error is
351 * local to this block.
352 */
353 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
354 if (rc)
355 return rc;
356
357 /*
358 * Errors after here are fatal
359 */
360
361 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
362 ocfs2_error(sb,
363 "Extended attribute block #%llu has bad "
364 "signature %.*s",
365 (unsigned long long)bh->b_blocknr, 7,
366 xb->xb_signature);
367 return -EINVAL;
368 }
369
370 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
371 ocfs2_error(sb,
372 "Extended attribute block #%llu has an "
373 "invalid xb_blkno of %llu",
374 (unsigned long long)bh->b_blocknr,
375 (unsigned long long)le64_to_cpu(xb->xb_blkno));
376 return -EINVAL;
377 }
378
379 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
380 ocfs2_error(sb,
381 "Extended attribute block #%llu has an invalid "
382 "xb_fs_generation of #%u",
383 (unsigned long long)bh->b_blocknr,
384 le32_to_cpu(xb->xb_fs_generation));
385 return -EINVAL;
386 }
387
388 return 0;
389}
390
391static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
392 struct buffer_head **bh)
393{
394 int rc;
395 struct buffer_head *tmp = *bh;
396
397 rc = ocfs2_read_block(inode, xb_blkno, &tmp,
398 ocfs2_validate_xattr_block);
399
400 /* If ocfs2_read_block() got us a new bh, pass it up. */
401 if (!rc && !*bh)
402 *bh = tmp;
403
404 return rc;
405}
406
157static inline const char *ocfs2_xattr_prefix(int name_index) 407static inline const char *ocfs2_xattr_prefix(int name_index)
158{ 408{
159 struct xattr_handler *handler = NULL; 409 struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
200 return; 450 return;
201} 451}
202 452
453static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
454{
455 int size = 0;
456
457 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
458 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
459 else
460 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
461 size += sizeof(struct ocfs2_xattr_entry);
462
463 return size;
464}
465
466int ocfs2_calc_security_init(struct inode *dir,
467 struct ocfs2_security_xattr_info *si,
468 int *want_clusters,
469 int *xattr_credits,
470 struct ocfs2_alloc_context **xattr_ac)
471{
472 int ret = 0;
473 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
474 int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
475 si->value_len);
476
477 /*
478 * The max space of security xattr taken inline is
479 * 256(name) + 80(value) + 16(entry) = 352 bytes,
480 * So reserve one metadata block for it is ok.
481 */
482 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
483 s_size > OCFS2_XATTR_FREE_IN_IBODY) {
484 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
485 if (ret) {
486 mlog_errno(ret);
487 return ret;
488 }
489 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
490 }
491
492 /* reserve clusters for xattr value which will be set in B tree*/
493 if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
494 int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
495 si->value_len);
496
497 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
498 new_clusters);
499 *want_clusters += new_clusters;
500 }
501 return ret;
502}
503
504int ocfs2_calc_xattr_init(struct inode *dir,
505 struct buffer_head *dir_bh,
506 int mode,
507 struct ocfs2_security_xattr_info *si,
508 int *want_clusters,
509 int *xattr_credits,
510 struct ocfs2_alloc_context **xattr_ac)
511{
512 int ret = 0;
513 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
514 int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
515
516 if (si->enable)
517 s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
518 si->value_len);
519
520 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
521 acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
522 OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
523 "", NULL, 0);
524 if (acl_len > 0) {
525 a_size = ocfs2_xattr_entry_real_size(0, acl_len);
526 if (S_ISDIR(mode))
527 a_size <<= 1;
528 } else if (acl_len != 0 && acl_len != -ENODATA) {
529 mlog_errno(ret);
530 return ret;
531 }
532 }
533
534 if (!(s_size + a_size))
535 return ret;
536
537 /*
538 * The max space of security xattr taken inline is
539 * 256(name) + 80(value) + 16(entry) = 352 bytes,
540 * The max space of acl xattr taken inline is
541 * 80(value) + 16(entry) * 2(if directory) = 192 bytes,
542 * when blocksize = 512, may reserve one more cluser for
543 * xattr bucket, otherwise reserve one metadata block
544 * for them is ok.
545 */
546 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
547 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
548 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
549 if (ret) {
550 mlog_errno(ret);
551 return ret;
552 }
553 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
554 }
555
556 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
557 (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
558 *want_clusters += 1;
559 *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
560 }
561
562 /*
563 * reserve credits and clusters for xattrs which has large value
564 * and have to be set outside
565 */
566 if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
567 new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
568 si->value_len);
569 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
570 new_clusters);
571 *want_clusters += new_clusters;
572 }
573 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
574 acl_len > OCFS2_XATTR_INLINE_SIZE) {
575 /* for directory, it has DEFAULT and ACCESS two types of acls */
576 new_clusters = (S_ISDIR(mode) ? 2 : 1) *
577 ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
578 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
579 new_clusters);
580 *want_clusters += new_clusters;
581 }
582
583 return ret;
584}
585
203static int ocfs2_xattr_extend_allocation(struct inode *inode, 586static int ocfs2_xattr_extend_allocation(struct inode *inode,
204 u32 clusters_to_add, 587 u32 clusters_to_add,
205 struct buffer_head *xattr_bh, 588 struct ocfs2_xattr_value_buf *vb,
206 struct ocfs2_xattr_value_root *xv) 589 struct ocfs2_xattr_set_ctxt *ctxt)
207{ 590{
208 int status = 0; 591 int status = 0;
209 int restart_func = 0; 592 handle_t *handle = ctxt->handle;
210 int credits = 0;
211 handle_t *handle = NULL;
212 struct ocfs2_alloc_context *data_ac = NULL;
213 struct ocfs2_alloc_context *meta_ac = NULL;
214 enum ocfs2_alloc_restarted why; 593 enum ocfs2_alloc_restarted why;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 594 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216 u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); 595 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
217 struct ocfs2_extent_tree et; 596 struct ocfs2_extent_tree et;
218 597
219 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); 598 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
220 599
221 ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); 600 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
222
223restart_all:
224
225 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
226 &data_ac, &meta_ac);
227 if (status) {
228 mlog_errno(status);
229 goto leave;
230 }
231
232 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
233 clusters_to_add);
234 handle = ocfs2_start_trans(osb, credits);
235 if (IS_ERR(handle)) {
236 status = PTR_ERR(handle);
237 handle = NULL;
238 mlog_errno(status);
239 goto leave;
240 }
241 601
242restarted_transaction: 602 status = vb->vb_access(handle, inode, vb->vb_bh,
243 status = ocfs2_journal_access(handle, inode, xattr_bh, 603 OCFS2_JOURNAL_ACCESS_WRITE);
244 OCFS2_JOURNAL_ACCESS_WRITE);
245 if (status < 0) { 604 if (status < 0) {
246 mlog_errno(status); 605 mlog_errno(status);
247 goto leave; 606 goto leave;
248 } 607 }
249 608
250 prev_clusters = le32_to_cpu(xv->xr_clusters); 609 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
251 status = ocfs2_add_clusters_in_btree(osb, 610 status = ocfs2_add_clusters_in_btree(osb,
252 inode, 611 inode,
253 &logical_start, 612 &logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
255 0, 614 0,
256 &et, 615 &et,
257 handle, 616 handle,
258 data_ac, 617 ctxt->data_ac,
259 meta_ac, 618 ctxt->meta_ac,
260 &why); 619 &why);
261 if ((status < 0) && (status != -EAGAIN)) { 620 if (status < 0) {
262 if (status != -ENOSPC) 621 mlog_errno(status);
263 mlog_errno(status);
264 goto leave; 622 goto leave;
265 } 623 }
266 624
267 status = ocfs2_journal_dirty(handle, xattr_bh); 625 status = ocfs2_journal_dirty(handle, vb->vb_bh);
268 if (status < 0) { 626 if (status < 0) {
269 mlog_errno(status); 627 mlog_errno(status);
270 goto leave; 628 goto leave;
271 } 629 }
272 630
273 clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; 631 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
274 632
275 if (why != RESTART_NONE && clusters_to_add) { 633 /*
276 if (why == RESTART_META) { 634 * We should have already allocated enough space before the transaction,
277 mlog(0, "restarting function.\n"); 635 * so no need to restart.
278 restart_func = 1; 636 */
279 } else { 637 BUG_ON(why != RESTART_NONE || clusters_to_add);
280 BUG_ON(why != RESTART_TRANS);
281
282 mlog(0, "restarting transaction.\n");
283 /* TODO: This can be more intelligent. */
284 credits = ocfs2_calc_extend_credits(osb->sb,
285 et.et_root_el,
286 clusters_to_add);
287 status = ocfs2_extend_trans(handle, credits);
288 if (status < 0) {
289 /* handle still has to be committed at
290 * this point. */
291 status = -ENOMEM;
292 mlog_errno(status);
293 goto leave;
294 }
295 goto restarted_transaction;
296 }
297 }
298 638
299leave: 639leave:
300 if (handle) {
301 ocfs2_commit_trans(osb, handle);
302 handle = NULL;
303 }
304 if (data_ac) {
305 ocfs2_free_alloc_context(data_ac);
306 data_ac = NULL;
307 }
308 if (meta_ac) {
309 ocfs2_free_alloc_context(meta_ac);
310 meta_ac = NULL;
311 }
312 if ((!status) && restart_func) {
313 restart_func = 0;
314 goto restart_all;
315 }
316 640
317 return status; 641 return status;
318} 642}
319 643
320static int __ocfs2_remove_xattr_range(struct inode *inode, 644static int __ocfs2_remove_xattr_range(struct inode *inode,
321 struct buffer_head *root_bh, 645 struct ocfs2_xattr_value_buf *vb,
322 struct ocfs2_xattr_value_root *xv,
323 u32 cpos, u32 phys_cpos, u32 len, 646 u32 cpos, u32 phys_cpos, u32 len,
324 struct ocfs2_cached_dealloc_ctxt *dealloc) 647 struct ocfs2_xattr_set_ctxt *ctxt)
325{ 648{
326 int ret; 649 int ret;
327 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
328 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 651 handle_t *handle = ctxt->handle;
329 struct inode *tl_inode = osb->osb_tl_inode;
330 handle_t *handle;
331 struct ocfs2_alloc_context *meta_ac = NULL;
332 struct ocfs2_extent_tree et; 652 struct ocfs2_extent_tree et;
333 653
334 ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); 654 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
335 655
336 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); 656 ret = vb->vb_access(handle, inode, vb->vb_bh,
657 OCFS2_JOURNAL_ACCESS_WRITE);
337 if (ret) { 658 if (ret) {
338 mlog_errno(ret); 659 mlog_errno(ret);
339 return ret;
340 }
341
342 mutex_lock(&tl_inode->i_mutex);
343
344 if (ocfs2_truncate_log_needs_flush(osb)) {
345 ret = __ocfs2_flush_truncate_log(osb);
346 if (ret < 0) {
347 mlog_errno(ret);
348 goto out;
349 }
350 }
351
352 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
353 if (IS_ERR(handle)) {
354 ret = PTR_ERR(handle);
355 mlog_errno(ret);
356 goto out; 660 goto out;
357 } 661 }
358 662
359 ret = ocfs2_journal_access(handle, inode, root_bh, 663 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
360 OCFS2_JOURNAL_ACCESS_WRITE); 664 &ctxt->dealloc);
361 if (ret) {
362 mlog_errno(ret);
363 goto out_commit;
364 }
365
366 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
367 dealloc);
368 if (ret) { 665 if (ret) {
369 mlog_errno(ret); 666 mlog_errno(ret);
370 goto out_commit; 667 goto out;
371 } 668 }
372 669
373 le32_add_cpu(&xv->xr_clusters, -len); 670 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
374 671
375 ret = ocfs2_journal_dirty(handle, root_bh); 672 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
376 if (ret) { 673 if (ret) {
377 mlog_errno(ret); 674 mlog_errno(ret);
378 goto out_commit; 675 goto out;
379 } 676 }
380 677
381 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 678 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
382 if (ret) 679 if (ret)
383 mlog_errno(ret); 680 mlog_errno(ret);
384 681
385out_commit:
386 ocfs2_commit_trans(osb, handle);
387out: 682out:
388 mutex_unlock(&tl_inode->i_mutex);
389
390 if (meta_ac)
391 ocfs2_free_alloc_context(meta_ac);
392
393 return ret; 683 return ret;
394} 684}
395 685
396static int ocfs2_xattr_shrink_size(struct inode *inode, 686static int ocfs2_xattr_shrink_size(struct inode *inode,
397 u32 old_clusters, 687 u32 old_clusters,
398 u32 new_clusters, 688 u32 new_clusters,
399 struct buffer_head *root_bh, 689 struct ocfs2_xattr_value_buf *vb,
400 struct ocfs2_xattr_value_root *xv) 690 struct ocfs2_xattr_set_ctxt *ctxt)
401{ 691{
402 int ret = 0; 692 int ret = 0;
403 u32 trunc_len, cpos, phys_cpos, alloc_size; 693 u32 trunc_len, cpos, phys_cpos, alloc_size;
404 u64 block; 694 u64 block;
405 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
406 struct ocfs2_cached_dealloc_ctxt dealloc;
407
408 ocfs2_init_dealloc_ctxt(&dealloc);
409 695
410 if (old_clusters <= new_clusters) 696 if (old_clusters <= new_clusters)
411 return 0; 697 return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
414 trunc_len = old_clusters - new_clusters; 700 trunc_len = old_clusters - new_clusters;
415 while (trunc_len) { 701 while (trunc_len) {
416 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, 702 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
417 &alloc_size, &xv->xr_list); 703 &alloc_size,
704 &vb->vb_xv->xr_list);
418 if (ret) { 705 if (ret) {
419 mlog_errno(ret); 706 mlog_errno(ret);
420 goto out; 707 goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
423 if (alloc_size > trunc_len) 710 if (alloc_size > trunc_len)
424 alloc_size = trunc_len; 711 alloc_size = trunc_len;
425 712
426 ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, 713 ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
427 phys_cpos, alloc_size, 714 phys_cpos, alloc_size,
428 &dealloc); 715 ctxt);
429 if (ret) { 716 if (ret) {
430 mlog_errno(ret); 717 mlog_errno(ret);
431 goto out; 718 goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
439 } 726 }
440 727
441out: 728out:
442 ocfs2_schedule_truncate_log_flush(osb, 1);
443 ocfs2_run_deallocs(osb, &dealloc);
444
445 return ret; 729 return ret;
446} 730}
447 731
448static int ocfs2_xattr_value_truncate(struct inode *inode, 732static int ocfs2_xattr_value_truncate(struct inode *inode,
449 struct buffer_head *root_bh, 733 struct ocfs2_xattr_value_buf *vb,
450 struct ocfs2_xattr_value_root *xv, 734 int len,
451 int len) 735 struct ocfs2_xattr_set_ctxt *ctxt)
452{ 736{
453 int ret; 737 int ret;
454 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); 738 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
455 u32 old_clusters = le32_to_cpu(xv->xr_clusters); 739 u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
456 740
457 if (new_clusters == old_clusters) 741 if (new_clusters == old_clusters)
458 return 0; 742 return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
460 if (new_clusters > old_clusters) 744 if (new_clusters > old_clusters)
461 ret = ocfs2_xattr_extend_allocation(inode, 745 ret = ocfs2_xattr_extend_allocation(inode,
462 new_clusters - old_clusters, 746 new_clusters - old_clusters,
463 root_bh, xv); 747 vb, ctxt);
464 else 748 else
465 ret = ocfs2_xattr_shrink_size(inode, 749 ret = ocfs2_xattr_shrink_size(inode,
466 old_clusters, new_clusters, 750 old_clusters, new_clusters,
467 root_bh, xv); 751 vb, ctxt);
468 752
469 return ret; 753 return ret;
470} 754}
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
554 if (!di->i_xattr_loc) 838 if (!di->i_xattr_loc)
555 return ret; 839 return ret;
556 840
557 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 841 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
842 &blk_bh);
558 if (ret < 0) { 843 if (ret < 0) {
559 mlog_errno(ret); 844 mlog_errno(ret);
560 return ret; 845 return ret;
561 } 846 }
562 847
563 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 848 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
564 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
565 ret = -EIO;
566 goto cleanup;
567 }
568
569 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 849 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
570 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; 850 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
571 ret = ocfs2_xattr_list_entries(inode, header, 851 ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
575 ret = ocfs2_xattr_tree_list_index_block(inode, xt, 855 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
576 buffer, buffer_size); 856 buffer, buffer_size);
577 } 857 }
578cleanup: 858
579 brelse(blk_bh); 859 brelse(blk_bh);
580 860
581 return ret; 861 return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
685 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 965 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
686 /* Copy ocfs2_xattr_value */ 966 /* Copy ocfs2_xattr_value */
687 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 967 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
688 ret = ocfs2_read_block(inode, blkno, &bh); 968 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
689 if (ret) { 969 if (ret) {
690 mlog_errno(ret); 970 mlog_errno(ret);
691 goto out; 971 goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
769 size_t size; 1049 size_t size;
770 int ret = -ENODATA, name_offset, name_len, block_off, i; 1050 int ret = -ENODATA, name_offset, name_len, block_off, i;
771 1051
772 memset(&xs->bucket, 0, sizeof(xs->bucket)); 1052 xs->bucket = ocfs2_xattr_bucket_new(inode);
1053 if (!xs->bucket) {
1054 ret = -ENOMEM;
1055 mlog_errno(ret);
1056 goto cleanup;
1057 }
773 1058
774 ret = ocfs2_xattr_block_find(inode, name_index, name, xs); 1059 ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
775 if (ret) { 1060 if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
795 1080
796 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 1081 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
797 ret = ocfs2_xattr_bucket_get_name_value(inode, 1082 ret = ocfs2_xattr_bucket_get_name_value(inode,
798 xs->bucket.xh, 1083 bucket_xh(xs->bucket),
799 i, 1084 i,
800 &block_off, 1085 &block_off,
801 &name_offset); 1086 &name_offset);
802 xs->base = xs->bucket.bhs[block_off]->b_data; 1087 xs->base = bucket_block(xs->bucket, block_off);
803 } 1088 }
804 if (ocfs2_xattr_is_local(xs->here)) { 1089 if (ocfs2_xattr_is_local(xs->here)) {
805 memcpy(buffer, (void *)xs->base + 1090 memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
817 } 1102 }
818 ret = size; 1103 ret = size;
819cleanup: 1104cleanup:
820 for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) 1105 ocfs2_xattr_bucket_free(xs->bucket);
821 brelse(xs->bucket.bhs[i]);
822 memset(&xs->bucket, 0, sizeof(xs->bucket));
823 1106
824 brelse(xs->xattr_bh); 1107 brelse(xs->xattr_bh);
825 xs->xattr_bh = NULL; 1108 xs->xattr_bh = NULL;
826 return ret; 1109 return ret;
827} 1110}
828 1111
829/* ocfs2_xattr_get() 1112int ocfs2_xattr_get_nolock(struct inode *inode,
830 * 1113 struct buffer_head *di_bh,
831 * Copy an extended attribute into the buffer provided.
832 * Buffer is NULL to compute the size of buffer required.
833 */
834static int ocfs2_xattr_get(struct inode *inode,
835 int name_index, 1114 int name_index,
836 const char *name, 1115 const char *name,
837 void *buffer, 1116 void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
839{ 1118{
840 int ret; 1119 int ret;
841 struct ocfs2_dinode *di = NULL; 1120 struct ocfs2_dinode *di = NULL;
842 struct buffer_head *di_bh = NULL;
843 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1121 struct ocfs2_inode_info *oi = OCFS2_I(inode);
844 struct ocfs2_xattr_search xis = { 1122 struct ocfs2_xattr_search xis = {
845 .not_found = -ENODATA, 1123 .not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
854 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 1132 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
855 ret = -ENODATA; 1133 ret = -ENODATA;
856 1134
857 ret = ocfs2_inode_lock(inode, &di_bh, 0);
858 if (ret < 0) {
859 mlog_errno(ret);
860 return ret;
861 }
862 xis.inode_bh = xbs.inode_bh = di_bh; 1135 xis.inode_bh = xbs.inode_bh = di_bh;
863 di = (struct ocfs2_dinode *)di_bh->b_data; 1136 di = (struct ocfs2_dinode *)di_bh->b_data;
864 1137
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
869 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1142 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
870 buffer_size, &xbs); 1143 buffer_size, &xbs);
871 up_read(&oi->ip_xattr_sem); 1144 up_read(&oi->ip_xattr_sem);
1145
1146 return ret;
1147}
1148
1149/* ocfs2_xattr_get()
1150 *
1151 * Copy an extended attribute into the buffer provided.
1152 * Buffer is NULL to compute the size of buffer required.
1153 */
1154static int ocfs2_xattr_get(struct inode *inode,
1155 int name_index,
1156 const char *name,
1157 void *buffer,
1158 size_t buffer_size)
1159{
1160 int ret;
1161 struct buffer_head *di_bh = NULL;
1162
1163 ret = ocfs2_inode_lock(inode, &di_bh, 0);
1164 if (ret < 0) {
1165 mlog_errno(ret);
1166 return ret;
1167 }
1168 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1169 name, buffer, buffer_size);
1170
872 ocfs2_inode_unlock(inode, 0); 1171 ocfs2_inode_unlock(inode, 0);
873 1172
874 brelse(di_bh); 1173 brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
877} 1176}
878 1177
879static int __ocfs2_xattr_set_value_outside(struct inode *inode, 1178static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1179 handle_t *handle,
880 struct ocfs2_xattr_value_root *xv, 1180 struct ocfs2_xattr_value_root *xv,
881 const void *value, 1181 const void *value,
882 int value_len) 1182 int value_len)
883{ 1183{
884 int ret = 0, i, cp_len, credits; 1184 int ret = 0, i, cp_len;
885 u16 blocksize = inode->i_sb->s_blocksize; 1185 u16 blocksize = inode->i_sb->s_blocksize;
886 u32 p_cluster, num_clusters; 1186 u32 p_cluster, num_clusters;
887 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 1187 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
888 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); 1188 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
889 u64 blkno; 1189 u64 blkno;
890 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
891 handle_t *handle;
892 1191
893 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); 1192 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
894 1193
895 credits = clusters * bpc;
896 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
897 if (IS_ERR(handle)) {
898 ret = PTR_ERR(handle);
899 mlog_errno(ret);
900 goto out;
901 }
902
903 while (cpos < clusters) { 1194 while (cpos < clusters) {
904 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1195 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
905 &num_clusters, &xv->xr_list); 1196 &num_clusters, &xv->xr_list);
906 if (ret) { 1197 if (ret) {
907 mlog_errno(ret); 1198 mlog_errno(ret);
908 goto out_commit; 1199 goto out;
909 } 1200 }
910 1201
911 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1202 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
912 1203
913 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1204 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
914 ret = ocfs2_read_block(inode, blkno, &bh); 1205 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
915 if (ret) { 1206 if (ret) {
916 mlog_errno(ret); 1207 mlog_errno(ret);
917 goto out_commit; 1208 goto out;
918 } 1209 }
919 1210
920 ret = ocfs2_journal_access(handle, 1211 ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
923 OCFS2_JOURNAL_ACCESS_WRITE); 1214 OCFS2_JOURNAL_ACCESS_WRITE);
924 if (ret < 0) { 1215 if (ret < 0) {
925 mlog_errno(ret); 1216 mlog_errno(ret);
926 goto out_commit; 1217 goto out;
927 } 1218 }
928 1219
929 cp_len = value_len > blocksize ? blocksize : value_len; 1220 cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
937 ret = ocfs2_journal_dirty(handle, bh); 1228 ret = ocfs2_journal_dirty(handle, bh);
938 if (ret < 0) { 1229 if (ret < 0) {
939 mlog_errno(ret); 1230 mlog_errno(ret);
940 goto out_commit; 1231 goto out;
941 } 1232 }
942 brelse(bh); 1233 brelse(bh);
943 bh = NULL; 1234 bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
951 } 1242 }
952 cpos += num_clusters; 1243 cpos += num_clusters;
953 } 1244 }
954out_commit:
955 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
956out: 1245out:
957 brelse(bh); 1246 brelse(bh);
958 1247
@@ -960,28 +1249,22 @@ out:
960} 1249}
961 1250
962static int ocfs2_xattr_cleanup(struct inode *inode, 1251static int ocfs2_xattr_cleanup(struct inode *inode,
1252 handle_t *handle,
963 struct ocfs2_xattr_info *xi, 1253 struct ocfs2_xattr_info *xi,
964 struct ocfs2_xattr_search *xs, 1254 struct ocfs2_xattr_search *xs,
1255 struct ocfs2_xattr_value_buf *vb,
965 size_t offs) 1256 size_t offs)
966{ 1257{
967 handle_t *handle = NULL;
968 int ret = 0; 1258 int ret = 0;
969 size_t name_len = strlen(xi->name); 1259 size_t name_len = strlen(xi->name);
970 void *val = xs->base + offs; 1260 void *val = xs->base + offs;
971 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 1261 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
972 1262
973 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1263 ret = vb->vb_access(handle, inode, vb->vb_bh,
974 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1264 OCFS2_JOURNAL_ACCESS_WRITE);
975 if (IS_ERR(handle)) {
976 ret = PTR_ERR(handle);
977 mlog_errno(ret);
978 goto out;
979 }
980 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
981 OCFS2_JOURNAL_ACCESS_WRITE);
982 if (ret) { 1265 if (ret) {
983 mlog_errno(ret); 1266 mlog_errno(ret);
984 goto out_commit; 1267 goto out;
985 } 1268 }
986 /* Decrease xattr count */ 1269 /* Decrease xattr count */
987 le16_add_cpu(&xs->header->xh_count, -1); 1270 le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
989 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); 1272 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
990 memset(val, 0, size); 1273 memset(val, 0, size);
991 1274
992 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1275 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
993 if (ret < 0) 1276 if (ret < 0)
994 mlog_errno(ret); 1277 mlog_errno(ret);
995out_commit:
996 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
997out: 1278out:
998 return ret; 1279 return ret;
999} 1280}
1000 1281
1001static int ocfs2_xattr_update_entry(struct inode *inode, 1282static int ocfs2_xattr_update_entry(struct inode *inode,
1283 handle_t *handle,
1002 struct ocfs2_xattr_info *xi, 1284 struct ocfs2_xattr_info *xi,
1003 struct ocfs2_xattr_search *xs, 1285 struct ocfs2_xattr_search *xs,
1286 struct ocfs2_xattr_value_buf *vb,
1004 size_t offs) 1287 size_t offs)
1005{ 1288{
1006 handle_t *handle = NULL; 1289 int ret;
1007 int ret = 0;
1008 1290
1009 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1291 ret = vb->vb_access(handle, inode, vb->vb_bh,
1010 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1292 OCFS2_JOURNAL_ACCESS_WRITE);
1011 if (IS_ERR(handle)) {
1012 ret = PTR_ERR(handle);
1013 mlog_errno(ret);
1014 goto out;
1015 }
1016 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1017 OCFS2_JOURNAL_ACCESS_WRITE);
1018 if (ret) { 1293 if (ret) {
1019 mlog_errno(ret); 1294 mlog_errno(ret);
1020 goto out_commit; 1295 goto out;
1021 } 1296 }
1022 1297
1023 xs->here->xe_name_offset = cpu_to_le16(offs); 1298 xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
1028 ocfs2_xattr_set_local(xs->here, 0); 1303 ocfs2_xattr_set_local(xs->here, 0);
1029 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1304 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1030 1305
1031 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1306 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
1032 if (ret < 0) 1307 if (ret < 0)
1033 mlog_errno(ret); 1308 mlog_errno(ret);
1034out_commit:
1035 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1036out: 1309out:
1037 return ret; 1310 return ret;
1038} 1311}
@@ -1045,6 +1318,8 @@ out:
1045static int ocfs2_xattr_set_value_outside(struct inode *inode, 1318static int ocfs2_xattr_set_value_outside(struct inode *inode,
1046 struct ocfs2_xattr_info *xi, 1319 struct ocfs2_xattr_info *xi,
1047 struct ocfs2_xattr_search *xs, 1320 struct ocfs2_xattr_search *xs,
1321 struct ocfs2_xattr_set_ctxt *ctxt,
1322 struct ocfs2_xattr_value_buf *vb,
1048 size_t offs) 1323 size_t offs)
1049{ 1324{
1050 size_t name_len = strlen(xi->name); 1325 size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
1062 xv->xr_list.l_tree_depth = 0; 1337 xv->xr_list.l_tree_depth = 0;
1063 xv->xr_list.l_count = cpu_to_le16(1); 1338 xv->xr_list.l_count = cpu_to_le16(1);
1064 xv->xr_list.l_next_free_rec = 0; 1339 xv->xr_list.l_next_free_rec = 0;
1340 vb->vb_xv = xv;
1065 1341
1066 ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, 1342 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
1067 xi->value_len);
1068 if (ret < 0) { 1343 if (ret < 0) {
1069 mlog_errno(ret); 1344 mlog_errno(ret);
1070 return ret; 1345 return ret;
1071 } 1346 }
1072 ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, 1347 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
1073 xi->value_len);
1074 if (ret < 0) { 1348 if (ret < 0) {
1075 mlog_errno(ret); 1349 mlog_errno(ret);
1076 return ret; 1350 return ret;
1077 } 1351 }
1078 ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); 1352 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
1353 xi->value, xi->value_len);
1079 if (ret < 0) 1354 if (ret < 0)
1080 mlog_errno(ret); 1355 mlog_errno(ret);
1081 1356
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
1195static int ocfs2_xattr_set_entry(struct inode *inode, 1470static int ocfs2_xattr_set_entry(struct inode *inode,
1196 struct ocfs2_xattr_info *xi, 1471 struct ocfs2_xattr_info *xi,
1197 struct ocfs2_xattr_search *xs, 1472 struct ocfs2_xattr_search *xs,
1473 struct ocfs2_xattr_set_ctxt *ctxt,
1198 int flag) 1474 int flag)
1199{ 1475{
1200 struct ocfs2_xattr_entry *last; 1476 struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1478 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1203 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1479 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
1204 size_t size_l = 0; 1480 size_t size_l = 0;
1205 handle_t *handle = NULL; 1481 handle_t *handle = ctxt->handle;
1206 int free, i, ret; 1482 int free, i, ret;
1207 struct ocfs2_xattr_info xi_l = { 1483 struct ocfs2_xattr_info xi_l = {
1208 .name_index = xi->name_index, 1484 .name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1210 .value = xi->value, 1486 .value = xi->value,
1211 .value_len = xi->value_len, 1487 .value_len = xi->value_len,
1212 }; 1488 };
1489 struct ocfs2_xattr_value_buf vb = {
1490 .vb_bh = xs->xattr_bh,
1491 .vb_access = ocfs2_journal_access_di,
1492 };
1493
1494 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1495 BUG_ON(xs->xattr_bh == xs->inode_bh);
1496 vb.vb_access = ocfs2_journal_access_xb;
1497 } else
1498 BUG_ON(xs->xattr_bh != xs->inode_bh);
1213 1499
1214 /* Compute min_offs, last and free space. */ 1500 /* Compute min_offs, last and free space. */
1215 last = xs->header->xh_entries; 1501 last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1265 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 1551 if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
1266 /* Replace existing local xattr with tree root */ 1552 /* Replace existing local xattr with tree root */
1267 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 1553 ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
1268 offs); 1554 ctxt, &vb, offs);
1269 if (ret < 0) 1555 if (ret < 0)
1270 mlog_errno(ret); 1556 mlog_errno(ret);
1271 goto out; 1557 goto out;
1272 } else if (!ocfs2_xattr_is_local(xs->here)) { 1558 } else if (!ocfs2_xattr_is_local(xs->here)) {
1273 /* For existing xattr which has value outside */ 1559 /* For existing xattr which has value outside */
1274 struct ocfs2_xattr_value_root *xv = NULL; 1560 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1275 xv = (struct ocfs2_xattr_value_root *)(val + 1561 (val + OCFS2_XATTR_SIZE(name_len));
1276 OCFS2_XATTR_SIZE(name_len));
1277 1562
1278 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1563 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1279 /* 1564 /*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1282 * then set new value with set_value_outside(). 1567 * then set new value with set_value_outside().
1283 */ 1568 */
1284 ret = ocfs2_xattr_value_truncate(inode, 1569 ret = ocfs2_xattr_value_truncate(inode,
1285 xs->xattr_bh, 1570 &vb,
1286 xv, 1571 xi->value_len,
1287 xi->value_len); 1572 ctxt);
1288 if (ret < 0) { 1573 if (ret < 0) {
1289 mlog_errno(ret); 1574 mlog_errno(ret);
1290 goto out; 1575 goto out;
1291 } 1576 }
1292 1577
1293 ret = __ocfs2_xattr_set_value_outside(inode, 1578 ret = ocfs2_xattr_update_entry(inode,
1294 xv, 1579 handle,
1295 xi->value, 1580 xi,
1296 xi->value_len); 1581 xs,
1582 &vb,
1583 offs);
1297 if (ret < 0) { 1584 if (ret < 0) {
1298 mlog_errno(ret); 1585 mlog_errno(ret);
1299 goto out; 1586 goto out;
1300 } 1587 }
1301 1588
1302 ret = ocfs2_xattr_update_entry(inode, 1589 ret = __ocfs2_xattr_set_value_outside(inode,
1303 xi, 1590 handle,
1304 xs, 1591 vb.vb_xv,
1305 offs); 1592 xi->value,
1593 xi->value_len);
1306 if (ret < 0) 1594 if (ret < 0)
1307 mlog_errno(ret); 1595 mlog_errno(ret);
1308 goto out; 1596 goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1312 * just trucate old value to zero. 1600 * just trucate old value to zero.
1313 */ 1601 */
1314 ret = ocfs2_xattr_value_truncate(inode, 1602 ret = ocfs2_xattr_value_truncate(inode,
1315 xs->xattr_bh, 1603 &vb,
1316 xv, 1604 0,
1317 0); 1605 ctxt);
1318 if (ret < 0) 1606 if (ret < 0)
1319 mlog_errno(ret); 1607 mlog_errno(ret);
1320 } 1608 }
1321 } 1609 }
1322 } 1610 }
1323 1611
1324 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1612 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1325 OCFS2_INODE_UPDATE_CREDITS); 1613 OCFS2_JOURNAL_ACCESS_WRITE);
1326 if (IS_ERR(handle)) {
1327 ret = PTR_ERR(handle);
1328 mlog_errno(ret);
1329 goto out;
1330 }
1331
1332 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1333 OCFS2_JOURNAL_ACCESS_WRITE);
1334 if (ret) { 1614 if (ret) {
1335 mlog_errno(ret); 1615 mlog_errno(ret);
1336 goto out_commit; 1616 goto out;
1337 } 1617 }
1338 1618
1339 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 1619 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1340 /* set extended attribute in external block. */ 1620 ret = vb.vb_access(handle, inode, vb.vb_bh,
1341 ret = ocfs2_extend_trans(handle, 1621 OCFS2_JOURNAL_ACCESS_WRITE);
1342 OCFS2_INODE_UPDATE_CREDITS +
1343 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1344 if (ret) {
1345 mlog_errno(ret);
1346 goto out_commit;
1347 }
1348 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1349 OCFS2_JOURNAL_ACCESS_WRITE);
1350 if (ret) { 1622 if (ret) {
1351 mlog_errno(ret); 1623 mlog_errno(ret);
1352 goto out_commit; 1624 goto out;
1353 } 1625 }
1354 } 1626 }
1355 1627
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1363 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1635 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1364 if (ret < 0) { 1636 if (ret < 0) {
1365 mlog_errno(ret); 1637 mlog_errno(ret);
1366 goto out_commit; 1638 goto out;
1367 } 1639 }
1368 } 1640 }
1369 1641
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1391 oi->ip_dyn_features |= flag; 1663 oi->ip_dyn_features |= flag;
1392 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 1664 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1393 spin_unlock(&oi->ip_lock); 1665 spin_unlock(&oi->ip_lock);
1394 /* Update inode ctime */
1395 inode->i_ctime = CURRENT_TIME;
1396 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1397 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1398 1666
1399 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 1667 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1400 if (ret < 0) 1668 if (ret < 0)
1401 mlog_errno(ret); 1669 mlog_errno(ret);
1402 1670
1403out_commit:
1404 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1405
1406 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1671 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1407 /* 1672 /*
1408 * Set value outside in B tree. 1673 * Set value outside in B tree.
1409 * This is the second step for value size > INLINE_SIZE. 1674 * This is the second step for value size > INLINE_SIZE.
1410 */ 1675 */
1411 size_t offs = le16_to_cpu(xs->here->xe_name_offset); 1676 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1412 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); 1677 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1678 &vb, offs);
1413 if (ret < 0) { 1679 if (ret < 0) {
1414 int ret2; 1680 int ret2;
1415 1681
@@ -1418,41 +1684,56 @@ out_commit:
1418 * If set value outside failed, we have to clean 1684 * If set value outside failed, we have to clean
1419 * the junk tree root we have already set in local. 1685 * the junk tree root we have already set in local.
1420 */ 1686 */
1421 ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); 1687 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1688 xi, xs, &vb, offs);
1422 if (ret2 < 0) 1689 if (ret2 < 0)
1423 mlog_errno(ret2); 1690 mlog_errno(ret2);
1424 } 1691 }
1425 } 1692 }
1426out: 1693out:
1427 return ret; 1694 return ret;
1428
1429} 1695}
1430 1696
1431static int ocfs2_remove_value_outside(struct inode*inode, 1697static int ocfs2_remove_value_outside(struct inode*inode,
1432 struct buffer_head *bh, 1698 struct ocfs2_xattr_value_buf *vb,
1433 struct ocfs2_xattr_header *header) 1699 struct ocfs2_xattr_header *header)
1434{ 1700{
1435 int ret = 0, i; 1701 int ret = 0, i;
1702 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1703 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1704
1705 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
1706
1707 ctxt.handle = ocfs2_start_trans(osb,
1708 ocfs2_remove_extent_credits(osb->sb));
1709 if (IS_ERR(ctxt.handle)) {
1710 ret = PTR_ERR(ctxt.handle);
1711 mlog_errno(ret);
1712 goto out;
1713 }
1436 1714
1437 for (i = 0; i < le16_to_cpu(header->xh_count); i++) { 1715 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1438 struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; 1716 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1439 1717
1440 if (!ocfs2_xattr_is_local(entry)) { 1718 if (!ocfs2_xattr_is_local(entry)) {
1441 struct ocfs2_xattr_value_root *xv;
1442 void *val; 1719 void *val;
1443 1720
1444 val = (void *)header + 1721 val = (void *)header +
1445 le16_to_cpu(entry->xe_name_offset); 1722 le16_to_cpu(entry->xe_name_offset);
1446 xv = (struct ocfs2_xattr_value_root *) 1723 vb->vb_xv = (struct ocfs2_xattr_value_root *)
1447 (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); 1724 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1448 ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); 1725 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
1449 if (ret < 0) { 1726 if (ret < 0) {
1450 mlog_errno(ret); 1727 mlog_errno(ret);
1451 return ret; 1728 break;
1452 } 1729 }
1453 } 1730 }
1454 } 1731 }
1455 1732
1733 ocfs2_commit_trans(osb, ctxt.handle);
1734 ocfs2_schedule_truncate_log_flush(osb, 1);
1735 ocfs2_run_deallocs(osb, &ctxt.dealloc);
1736out:
1456 return ret; 1737 return ret;
1457} 1738}
1458 1739
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
1463 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1744 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1464 struct ocfs2_xattr_header *header; 1745 struct ocfs2_xattr_header *header;
1465 int ret; 1746 int ret;
1747 struct ocfs2_xattr_value_buf vb = {
1748 .vb_bh = di_bh,
1749 .vb_access = ocfs2_journal_access_di,
1750 };
1466 1751
1467 header = (struct ocfs2_xattr_header *) 1752 header = (struct ocfs2_xattr_header *)
1468 ((void *)di + inode->i_sb->s_blocksize - 1753 ((void *)di + inode->i_sb->s_blocksize -
1469 le16_to_cpu(di->i_xattr_inline_size)); 1754 le16_to_cpu(di->i_xattr_inline_size));
1470 1755
1471 ret = ocfs2_remove_value_outside(inode, di_bh, header); 1756 ret = ocfs2_remove_value_outside(inode, &vb, header);
1472 1757
1473 return ret; 1758 return ret;
1474} 1759}
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
1478{ 1763{
1479 struct ocfs2_xattr_block *xb; 1764 struct ocfs2_xattr_block *xb;
1480 int ret = 0; 1765 int ret = 0;
1766 struct ocfs2_xattr_value_buf vb = {
1767 .vb_bh = blk_bh,
1768 .vb_access = ocfs2_journal_access_xb,
1769 };
1481 1770
1482 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 1771 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1483 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 1772 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1484 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); 1773 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1485 ret = ocfs2_remove_value_outside(inode, blk_bh, header); 1774 ret = ocfs2_remove_value_outside(inode, &vb, header);
1486 } else 1775 } else
1487 ret = ocfs2_delete_xattr_index_block(inode, blk_bh); 1776 ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
1488 1777
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
1502 u64 blk, bg_blkno; 1791 u64 blk, bg_blkno;
1503 u16 bit; 1792 u16 bit;
1504 1793
1505 ret = ocfs2_read_block(inode, block, &blk_bh); 1794 ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
1506 if (ret < 0) { 1795 if (ret < 0) {
1507 mlog_errno(ret); 1796 mlog_errno(ret);
1508 goto out; 1797 goto out;
1509 } 1798 }
1510 1799
1511 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1512 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1513 ret = -EIO;
1514 goto out;
1515 }
1516
1517 ret = ocfs2_xattr_block_remove(inode, blk_bh); 1800 ret = ocfs2_xattr_block_remove(inode, blk_bh);
1518 if (ret < 0) { 1801 if (ret < 0) {
1519 mlog_errno(ret); 1802 mlog_errno(ret);
1520 goto out; 1803 goto out;
1521 } 1804 }
1522 1805
1806 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1523 blk = le64_to_cpu(xb->xb_blkno); 1807 blk = le64_to_cpu(xb->xb_blkno);
1524 bit = le16_to_cpu(xb->xb_suballoc_bit); 1808 bit = le16_to_cpu(xb->xb_suballoc_bit);
1525 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 1809 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1606 mlog_errno(ret); 1890 mlog_errno(ret);
1607 goto out; 1891 goto out;
1608 } 1892 }
1609 ret = ocfs2_journal_access(handle, inode, di_bh, 1893 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1610 OCFS2_JOURNAL_ACCESS_WRITE); 1894 OCFS2_JOURNAL_ACCESS_WRITE);
1611 if (ret) { 1895 if (ret) {
1612 mlog_errno(ret); 1896 mlog_errno(ret);
1613 goto out_commit; 1897 goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
1714 */ 1998 */
1715static int ocfs2_xattr_ibody_set(struct inode *inode, 1999static int ocfs2_xattr_ibody_set(struct inode *inode,
1716 struct ocfs2_xattr_info *xi, 2000 struct ocfs2_xattr_info *xi,
1717 struct ocfs2_xattr_search *xs) 2001 struct ocfs2_xattr_search *xs,
2002 struct ocfs2_xattr_set_ctxt *ctxt)
1718{ 2003{
1719 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2004 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1720 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2005 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
1731 } 2016 }
1732 } 2017 }
1733 2018
1734 ret = ocfs2_xattr_set_entry(inode, xi, xs, 2019 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
1735 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2020 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
1736out: 2021out:
1737 up_write(&oi->ip_alloc_sem); 2022 up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
1758 if (!di->i_xattr_loc) 2043 if (!di->i_xattr_loc)
1759 return ret; 2044 return ret;
1760 2045
1761 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 2046 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
2047 &blk_bh);
1762 if (ret < 0) { 2048 if (ret < 0) {
1763 mlog_errno(ret); 2049 mlog_errno(ret);
1764 return ret; 2050 return ret;
1765 } 2051 }
1766 2052
1767 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1768 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1769 ret = -EIO;
1770 goto cleanup;
1771 }
1772
1773 xs->xattr_bh = blk_bh; 2053 xs->xattr_bh = blk_bh;
2054 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1774 2055
1775 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 2056 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1776 xs->header = &xb->xb_attrs.xb_header; 2057 xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
1804 */ 2085 */
1805static int ocfs2_xattr_block_set(struct inode *inode, 2086static int ocfs2_xattr_block_set(struct inode *inode,
1806 struct ocfs2_xattr_info *xi, 2087 struct ocfs2_xattr_info *xi,
1807 struct ocfs2_xattr_search *xs) 2088 struct ocfs2_xattr_search *xs,
2089 struct ocfs2_xattr_set_ctxt *ctxt)
1808{ 2090{
1809 struct buffer_head *new_bh = NULL; 2091 struct buffer_head *new_bh = NULL;
1810 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2092 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1811 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2093 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1812 struct ocfs2_alloc_context *meta_ac = NULL; 2094 handle_t *handle = ctxt->handle;
1813 handle_t *handle = NULL;
1814 struct ocfs2_xattr_block *xblk = NULL; 2095 struct ocfs2_xattr_block *xblk = NULL;
1815 u16 suballoc_bit_start; 2096 u16 suballoc_bit_start;
1816 u32 num_got; 2097 u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1818 int ret; 2099 int ret;
1819 2100
1820 if (!xs->xattr_bh) { 2101 if (!xs->xattr_bh) {
1821 /* 2102 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1822 * Alloc one external block for extended attribute 2103 OCFS2_JOURNAL_ACCESS_CREATE);
1823 * outside of inode.
1824 */
1825 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
1826 if (ret < 0) { 2104 if (ret < 0) {
1827 mlog_errno(ret); 2105 mlog_errno(ret);
1828 goto out; 2106 goto end;
1829 }
1830 handle = ocfs2_start_trans(osb,
1831 OCFS2_XATTR_BLOCK_CREATE_CREDITS);
1832 if (IS_ERR(handle)) {
1833 ret = PTR_ERR(handle);
1834 mlog_errno(ret);
1835 goto out;
1836 }
1837 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1838 OCFS2_JOURNAL_ACCESS_CREATE);
1839 if (ret < 0) {
1840 mlog_errno(ret);
1841 goto out_commit;
1842 } 2107 }
1843 2108
1844 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2109 ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
1845 &suballoc_bit_start, &num_got, 2110 &suballoc_bit_start, &num_got,
1846 &first_blkno); 2111 &first_blkno);
1847 if (ret < 0) { 2112 if (ret < 0) {
1848 mlog_errno(ret); 2113 mlog_errno(ret);
1849 goto out_commit; 2114 goto end;
1850 } 2115 }
1851 2116
1852 new_bh = sb_getblk(inode->i_sb, first_blkno); 2117 new_bh = sb_getblk(inode->i_sb, first_blkno);
1853 ocfs2_set_new_buffer_uptodate(inode, new_bh); 2118 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1854 2119
1855 ret = ocfs2_journal_access(handle, inode, new_bh, 2120 ret = ocfs2_journal_access_xb(handle, inode, new_bh,
1856 OCFS2_JOURNAL_ACCESS_CREATE); 2121 OCFS2_JOURNAL_ACCESS_CREATE);
1857 if (ret < 0) { 2122 if (ret < 0) {
1858 mlog_errno(ret); 2123 mlog_errno(ret);
1859 goto out_commit; 2124 goto end;
1860 } 2125 }
1861 2126
1862 /* Initialize ocfs2_xattr_block */ 2127 /* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1874 xs->end = (void *)xblk + inode->i_sb->s_blocksize; 2139 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
1875 xs->here = xs->header->xh_entries; 2140 xs->here = xs->header->xh_entries;
1876 2141
1877
1878 ret = ocfs2_journal_dirty(handle, new_bh); 2142 ret = ocfs2_journal_dirty(handle, new_bh);
1879 if (ret < 0) { 2143 if (ret < 0) {
1880 mlog_errno(ret); 2144 mlog_errno(ret);
1881 goto out_commit; 2145 goto end;
1882 } 2146 }
1883 di->i_xattr_loc = cpu_to_le64(first_blkno); 2147 di->i_xattr_loc = cpu_to_le64(first_blkno);
1884 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2148 ocfs2_journal_dirty(handle, xs->inode_bh);
1885 if (ret < 0)
1886 mlog_errno(ret);
1887out_commit:
1888 ocfs2_commit_trans(osb, handle);
1889out:
1890 if (meta_ac)
1891 ocfs2_free_alloc_context(meta_ac);
1892 if (ret < 0)
1893 return ret;
1894 } else 2149 } else
1895 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2150 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1896 2151
1897 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2152 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
1898 /* Set extended attribute into external block */ 2153 /* Set extended attribute into external block */
1899 ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); 2154 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
2155 OCFS2_HAS_XATTR_FL);
1900 if (!ret || ret != -ENOSPC) 2156 if (!ret || ret != -ENOSPC)
1901 goto end; 2157 goto end;
1902 2158
1903 ret = ocfs2_xattr_create_index_block(inode, xs); 2159 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
1904 if (ret) 2160 if (ret)
1905 goto end; 2161 goto end;
1906 } 2162 }
1907 2163
1908 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); 2164 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
1909 2165
1910end: 2166end:
1911 2167
1912 return ret; 2168 return ret;
1913} 2169}
1914 2170
2171/* Check whether the new xattr can be inserted into the inode. */
2172static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2173 struct ocfs2_xattr_info *xi,
2174 struct ocfs2_xattr_search *xs)
2175{
2176 u64 value_size;
2177 struct ocfs2_xattr_entry *last;
2178 int free, i;
2179 size_t min_offs = xs->end - xs->base;
2180
2181 if (!xs->header)
2182 return 0;
2183
2184 last = xs->header->xh_entries;
2185
2186 for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
2187 size_t offs = le16_to_cpu(last->xe_name_offset);
2188 if (offs < min_offs)
2189 min_offs = offs;
2190 last += 1;
2191 }
2192
2193 free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
2194 if (free < 0)
2195 return 0;
2196
2197 BUG_ON(!xs->not_found);
2198
2199 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2200 value_size = OCFS2_XATTR_ROOT_SIZE;
2201 else
2202 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2203
2204 if (free >= sizeof(struct ocfs2_xattr_entry) +
2205 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2206 return 1;
2207
2208 return 0;
2209}
2210
2211static int ocfs2_calc_xattr_set_need(struct inode *inode,
2212 struct ocfs2_dinode *di,
2213 struct ocfs2_xattr_info *xi,
2214 struct ocfs2_xattr_search *xis,
2215 struct ocfs2_xattr_search *xbs,
2216 int *clusters_need,
2217 int *meta_need,
2218 int *credits_need)
2219{
2220 int ret = 0, old_in_xb = 0;
2221 int clusters_add = 0, meta_add = 0, credits = 0;
2222 struct buffer_head *bh = NULL;
2223 struct ocfs2_xattr_block *xb = NULL;
2224 struct ocfs2_xattr_entry *xe = NULL;
2225 struct ocfs2_xattr_value_root *xv = NULL;
2226 char *base = NULL;
2227 int name_offset, name_len = 0;
2228 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2229 xi->value_len);
2230 u64 value_size;
2231
2232 /*
2233 * Calculate the clusters we need to write.
2234 * No matter whether we replace an old one or add a new one,
2235 * we need this for writing.
2236 */
2237 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2238 credits += new_clusters *
2239 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2240
2241 if (xis->not_found && xbs->not_found) {
2242 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2243
2244 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2245 clusters_add += new_clusters;
2246 credits += ocfs2_calc_extend_credits(inode->i_sb,
2247 &def_xv.xv.xr_list,
2248 new_clusters);
2249 }
2250
2251 goto meta_guess;
2252 }
2253
2254 if (!xis->not_found) {
2255 xe = xis->here;
2256 name_offset = le16_to_cpu(xe->xe_name_offset);
2257 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2258 base = xis->base;
2259 credits += OCFS2_INODE_UPDATE_CREDITS;
2260 } else {
2261 int i, block_off = 0;
2262 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2263 xe = xbs->here;
2264 name_offset = le16_to_cpu(xe->xe_name_offset);
2265 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2266 i = xbs->here - xbs->header->xh_entries;
2267 old_in_xb = 1;
2268
2269 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2270 ret = ocfs2_xattr_bucket_get_name_value(inode,
2271 bucket_xh(xbs->bucket),
2272 i, &block_off,
2273 &name_offset);
2274 base = bucket_block(xbs->bucket, block_off);
2275 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2276 } else {
2277 base = xbs->base;
2278 credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
2279 }
2280 }
2281
2282 /*
2283 * delete a xattr doesn't need metadata and cluster allocation.
2284 * so just calculate the credits and return.
2285 *
2286 * The credits for removing the value tree will be extended
2287 * by ocfs2_remove_extent itself.
2288 */
2289 if (!xi->value) {
2290 if (!ocfs2_xattr_is_local(xe))
2291 credits += ocfs2_remove_extent_credits(inode->i_sb);
2292
2293 goto out;
2294 }
2295
2296 /* do cluster allocation guess first. */
2297 value_size = le64_to_cpu(xe->xe_value_size);
2298
2299 if (old_in_xb) {
2300 /*
2301 * In xattr set, we always try to set the xe in inode first,
2302 * so if it can be inserted into inode successfully, the old
2303 * one will be removed from the xattr block, and this xattr
2304 * will be inserted into inode as a new xattr in inode.
2305 */
2306 if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
2307 clusters_add += new_clusters;
2308 credits += ocfs2_remove_extent_credits(inode->i_sb) +
2309 OCFS2_INODE_UPDATE_CREDITS;
2310 if (!ocfs2_xattr_is_local(xe))
2311 credits += ocfs2_calc_extend_credits(
2312 inode->i_sb,
2313 &def_xv.xv.xr_list,
2314 new_clusters);
2315 goto out;
2316 }
2317 }
2318
2319 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2320 /* the new values will be stored outside. */
2321 u32 old_clusters = 0;
2322
2323 if (!ocfs2_xattr_is_local(xe)) {
2324 old_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2325 value_size);
2326 xv = (struct ocfs2_xattr_value_root *)
2327 (base + name_offset + name_len);
2328 value_size = OCFS2_XATTR_ROOT_SIZE;
2329 } else
2330 xv = &def_xv.xv;
2331
2332 if (old_clusters >= new_clusters) {
2333 credits += ocfs2_remove_extent_credits(inode->i_sb);
2334 goto out;
2335 } else {
2336 meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
2337 clusters_add += new_clusters - old_clusters;
2338 credits += ocfs2_calc_extend_credits(inode->i_sb,
2339 &xv->xr_list,
2340 new_clusters -
2341 old_clusters);
2342 if (value_size >= OCFS2_XATTR_ROOT_SIZE)
2343 goto out;
2344 }
2345 } else {
2346 /*
2347 * Now the new value will be stored inside. So if the new
2348 * value is smaller than the size of value root or the old
2349 * value, we don't need any allocation, otherwise we have
2350 * to guess metadata allocation.
2351 */
2352 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
2353 (!ocfs2_xattr_is_local(xe) &&
2354 OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
2355 goto out;
2356 }
2357
2358meta_guess:
2359 /* calculate metadata allocation. */
2360 if (di->i_xattr_loc) {
2361 if (!xbs->xattr_bh) {
2362 ret = ocfs2_read_xattr_block(inode,
2363 le64_to_cpu(di->i_xattr_loc),
2364 &bh);
2365 if (ret) {
2366 mlog_errno(ret);
2367 goto out;
2368 }
2369
2370 xb = (struct ocfs2_xattr_block *)bh->b_data;
2371 } else
2372 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2373
2374 /*
2375 * If there is already an xattr tree, good, we can calculate
2376 * like other b-trees. Otherwise we may have the chance of
2377 * create a tree, the credit calculation is borrowed from
2378 * ocfs2_calc_extend_credits with root_el = NULL. And the
2379 * new tree will be cluster based, so no meta is needed.
2380 */
2381 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2382 struct ocfs2_extent_list *el =
2383 &xb->xb_attrs.xb_root.xt_list;
2384 meta_add += ocfs2_extend_meta_needed(el);
2385 credits += ocfs2_calc_extend_credits(inode->i_sb,
2386 el, 1);
2387 } else
2388 credits += OCFS2_SUBALLOC_ALLOC + 1;
2389
2390 /*
2391 * This cluster will be used either for new bucket or for
2392 * new xattr block.
2393 * If the cluster size is the same as the bucket size, one
2394 * more is needed since we may need to extend the bucket
2395 * also.
2396 */
2397 clusters_add += 1;
2398 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2399 if (OCFS2_XATTR_BUCKET_SIZE ==
2400 OCFS2_SB(inode->i_sb)->s_clustersize) {
2401 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2402 clusters_add += 1;
2403 }
2404 } else {
2405 meta_add += 1;
2406 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
2407 }
2408out:
2409 if (clusters_need)
2410 *clusters_need = clusters_add;
2411 if (meta_need)
2412 *meta_need = meta_add;
2413 if (credits_need)
2414 *credits_need = credits;
2415 brelse(bh);
2416 return ret;
2417}
2418
2419static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2420 struct ocfs2_dinode *di,
2421 struct ocfs2_xattr_info *xi,
2422 struct ocfs2_xattr_search *xis,
2423 struct ocfs2_xattr_search *xbs,
2424 struct ocfs2_xattr_set_ctxt *ctxt,
2425 int *credits)
2426{
2427 int clusters_add, meta_add, ret;
2428 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2429
2430 memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
2431
2432 ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
2433
2434 ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
2435 &clusters_add, &meta_add, credits);
2436 if (ret) {
2437 mlog_errno(ret);
2438 return ret;
2439 }
2440
2441 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2442 "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
2443
2444 if (meta_add) {
2445 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
2446 &ctxt->meta_ac);
2447 if (ret) {
2448 mlog_errno(ret);
2449 goto out;
2450 }
2451 }
2452
2453 if (clusters_add) {
2454 ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
2455 if (ret)
2456 mlog_errno(ret);
2457 }
2458out:
2459 if (ret) {
2460 if (ctxt->meta_ac) {
2461 ocfs2_free_alloc_context(ctxt->meta_ac);
2462 ctxt->meta_ac = NULL;
2463 }
2464
2465 /*
2466 * We cannot have an error and a non null ctxt->data_ac.
2467 */
2468 }
2469
2470 return ret;
2471}
2472
2473static int __ocfs2_xattr_set_handle(struct inode *inode,
2474 struct ocfs2_dinode *di,
2475 struct ocfs2_xattr_info *xi,
2476 struct ocfs2_xattr_search *xis,
2477 struct ocfs2_xattr_search *xbs,
2478 struct ocfs2_xattr_set_ctxt *ctxt)
2479{
2480 int ret = 0, credits, old_found;
2481
2482 if (!xi->value) {
2483 /* Remove existing extended attribute */
2484 if (!xis->not_found)
2485 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2486 else if (!xbs->not_found)
2487 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2488 } else {
2489 /* We always try to set extended attribute into inode first*/
2490 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2491 if (!ret && !xbs->not_found) {
2492 /*
2493 * If succeed and that extended attribute existing in
2494 * external block, then we will remove it.
2495 */
2496 xi->value = NULL;
2497 xi->value_len = 0;
2498
2499 old_found = xis->not_found;
2500 xis->not_found = -ENODATA;
2501 ret = ocfs2_calc_xattr_set_need(inode,
2502 di,
2503 xi,
2504 xis,
2505 xbs,
2506 NULL,
2507 NULL,
2508 &credits);
2509 xis->not_found = old_found;
2510 if (ret) {
2511 mlog_errno(ret);
2512 goto out;
2513 }
2514
2515 ret = ocfs2_extend_trans(ctxt->handle, credits +
2516 ctxt->handle->h_buffer_credits);
2517 if (ret) {
2518 mlog_errno(ret);
2519 goto out;
2520 }
2521 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2522 } else if (ret == -ENOSPC) {
2523 if (di->i_xattr_loc && !xbs->xattr_bh) {
2524 ret = ocfs2_xattr_block_find(inode,
2525 xi->name_index,
2526 xi->name, xbs);
2527 if (ret)
2528 goto out;
2529
2530 old_found = xis->not_found;
2531 xis->not_found = -ENODATA;
2532 ret = ocfs2_calc_xattr_set_need(inode,
2533 di,
2534 xi,
2535 xis,
2536 xbs,
2537 NULL,
2538 NULL,
2539 &credits);
2540 xis->not_found = old_found;
2541 if (ret) {
2542 mlog_errno(ret);
2543 goto out;
2544 }
2545
2546 ret = ocfs2_extend_trans(ctxt->handle, credits +
2547 ctxt->handle->h_buffer_credits);
2548 if (ret) {
2549 mlog_errno(ret);
2550 goto out;
2551 }
2552 }
2553 /*
2554 * If no space in inode, we will set extended attribute
2555 * into external block.
2556 */
2557 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2558 if (ret)
2559 goto out;
2560 if (!xis->not_found) {
2561 /*
2562 * If succeed and that extended attribute
2563 * existing in inode, we will remove it.
2564 */
2565 xi->value = NULL;
2566 xi->value_len = 0;
2567 xbs->not_found = -ENODATA;
2568 ret = ocfs2_calc_xattr_set_need(inode,
2569 di,
2570 xi,
2571 xis,
2572 xbs,
2573 NULL,
2574 NULL,
2575 &credits);
2576 if (ret) {
2577 mlog_errno(ret);
2578 goto out;
2579 }
2580
2581 ret = ocfs2_extend_trans(ctxt->handle, credits +
2582 ctxt->handle->h_buffer_credits);
2583 if (ret) {
2584 mlog_errno(ret);
2585 goto out;
2586 }
2587 ret = ocfs2_xattr_ibody_set(inode, xi,
2588 xis, ctxt);
2589 }
2590 }
2591 }
2592
2593 if (!ret) {
2594 /* Update inode ctime. */
2595 ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
2596 OCFS2_JOURNAL_ACCESS_WRITE);
2597 if (ret) {
2598 mlog_errno(ret);
2599 goto out;
2600 }
2601
2602 inode->i_ctime = CURRENT_TIME;
2603 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
2604 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
2605 ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
2606 }
2607out:
2608 return ret;
2609}
2610
2611/*
2612 * This function only called duing creating inode
2613 * for init security/acl xattrs of the new inode.
2614 * All transanction credits have been reserved in mknod.
2615 */
2616int ocfs2_xattr_set_handle(handle_t *handle,
2617 struct inode *inode,
2618 struct buffer_head *di_bh,
2619 int name_index,
2620 const char *name,
2621 const void *value,
2622 size_t value_len,
2623 int flags,
2624 struct ocfs2_alloc_context *meta_ac,
2625 struct ocfs2_alloc_context *data_ac)
2626{
2627 struct ocfs2_dinode *di;
2628 int ret;
2629
2630 struct ocfs2_xattr_info xi = {
2631 .name_index = name_index,
2632 .name = name,
2633 .value = value,
2634 .value_len = value_len,
2635 };
2636
2637 struct ocfs2_xattr_search xis = {
2638 .not_found = -ENODATA,
2639 };
2640
2641 struct ocfs2_xattr_search xbs = {
2642 .not_found = -ENODATA,
2643 };
2644
2645 struct ocfs2_xattr_set_ctxt ctxt = {
2646 .handle = handle,
2647 .meta_ac = meta_ac,
2648 .data_ac = data_ac,
2649 };
2650
2651 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
2652 return -EOPNOTSUPP;
2653
2654 /*
2655 * In extreme situation, may need xattr bucket when
2656 * block size is too small. And we have already reserved
2657 * the credits for bucket in mknod.
2658 */
2659 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
2660 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2661 if (!xbs.bucket) {
2662 mlog_errno(-ENOMEM);
2663 return -ENOMEM;
2664 }
2665 }
2666
2667 xis.inode_bh = xbs.inode_bh = di_bh;
2668 di = (struct ocfs2_dinode *)di_bh->b_data;
2669
2670 down_write(&OCFS2_I(inode)->ip_xattr_sem);
2671
2672 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
2673 if (ret)
2674 goto cleanup;
2675 if (xis.not_found) {
2676 ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
2677 if (ret)
2678 goto cleanup;
2679 }
2680
2681 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2682
2683cleanup:
2684 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2685 brelse(xbs.xattr_bh);
2686 ocfs2_xattr_bucket_free(xbs.bucket);
2687
2688 return ret;
2689}
2690
1915/* 2691/*
1916 * ocfs2_xattr_set() 2692 * ocfs2_xattr_set()
1917 * 2693 *
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
1928{ 2704{
1929 struct buffer_head *di_bh = NULL; 2705 struct buffer_head *di_bh = NULL;
1930 struct ocfs2_dinode *di; 2706 struct ocfs2_dinode *di;
1931 int ret; 2707 int ret, credits;
1932 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2708 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2709 struct inode *tl_inode = osb->osb_tl_inode;
2710 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1933 2711
1934 struct ocfs2_xattr_info xi = { 2712 struct ocfs2_xattr_info xi = {
1935 .name_index = name_index, 2713 .name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
1949 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) 2727 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
1950 return -EOPNOTSUPP; 2728 return -EOPNOTSUPP;
1951 2729
2730 /*
2731 * Only xbs will be used on indexed trees. xis doesn't need a
2732 * bucket.
2733 */
2734 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2735 if (!xbs.bucket) {
2736 mlog_errno(-ENOMEM);
2737 return -ENOMEM;
2738 }
2739
1952 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2740 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1953 if (ret < 0) { 2741 if (ret < 0) {
1954 mlog_errno(ret); 2742 mlog_errno(ret);
1955 return ret; 2743 goto cleanup_nolock;
1956 } 2744 }
1957 xis.inode_bh = xbs.inode_bh = di_bh; 2745 xis.inode_bh = xbs.inode_bh = di_bh;
1958 di = (struct ocfs2_dinode *)di_bh->b_data; 2746 di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
1984 goto cleanup; 2772 goto cleanup;
1985 } 2773 }
1986 2774
1987 if (!value) { 2775
1988 /* Remove existing extended attribute */ 2776 mutex_lock(&tl_inode->i_mutex);
1989 if (!xis.not_found) 2777
1990 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2778 if (ocfs2_truncate_log_needs_flush(osb)) {
1991 else if (!xbs.not_found) 2779 ret = __ocfs2_flush_truncate_log(osb);
1992 ret = ocfs2_xattr_block_set(inode, &xi, &xbs); 2780 if (ret < 0) {
1993 } else { 2781 mutex_unlock(&tl_inode->i_mutex);
1994 /* We always try to set extended attribute into inode first*/ 2782 mlog_errno(ret);
1995 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2783 goto cleanup;
1996 if (!ret && !xbs.not_found) {
1997 /*
1998 * If succeed and that extended attribute existing in
1999 * external block, then we will remove it.
2000 */
2001 xi.value = NULL;
2002 xi.value_len = 0;
2003 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2004 } else if (ret == -ENOSPC) {
2005 if (di->i_xattr_loc && !xbs.xattr_bh) {
2006 ret = ocfs2_xattr_block_find(inode, name_index,
2007 name, &xbs);
2008 if (ret)
2009 goto cleanup;
2010 }
2011 /*
2012 * If no space in inode, we will set extended attribute
2013 * into external block.
2014 */
2015 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2016 if (ret)
2017 goto cleanup;
2018 if (!xis.not_found) {
2019 /*
2020 * If succeed and that extended attribute
2021 * existing in inode, we will remove it.
2022 */
2023 xi.value = NULL;
2024 xi.value_len = 0;
2025 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2026 }
2027 } 2784 }
2028 } 2785 }
2786 mutex_unlock(&tl_inode->i_mutex);
2787
2788 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
2789 &xbs, &ctxt, &credits);
2790 if (ret) {
2791 mlog_errno(ret);
2792 goto cleanup;
2793 }
2794
2795 /* we need to update inode's ctime field, so add credit for it. */
2796 credits += OCFS2_INODE_UPDATE_CREDITS;
2797 ctxt.handle = ocfs2_start_trans(osb, credits);
2798 if (IS_ERR(ctxt.handle)) {
2799 ret = PTR_ERR(ctxt.handle);
2800 mlog_errno(ret);
2801 goto cleanup;
2802 }
2803
2804 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2805
2806 ocfs2_commit_trans(osb, ctxt.handle);
2807
2808 if (ctxt.data_ac)
2809 ocfs2_free_alloc_context(ctxt.data_ac);
2810 if (ctxt.meta_ac)
2811 ocfs2_free_alloc_context(ctxt.meta_ac);
2812 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
2813 ocfs2_schedule_truncate_log_flush(osb, 1);
2814 ocfs2_run_deallocs(osb, &ctxt.dealloc);
2029cleanup: 2815cleanup:
2030 up_write(&OCFS2_I(inode)->ip_xattr_sem); 2816 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2031 ocfs2_inode_unlock(inode, 1); 2817 ocfs2_inode_unlock(inode, 1);
2818cleanup_nolock:
2032 brelse(di_bh); 2819 brelse(di_bh);
2033 brelse(xbs.xattr_bh); 2820 brelse(xbs.xattr_bh);
2034 for (i = 0; i < blk_per_bucket; i++) 2821 ocfs2_xattr_bucket_free(xbs.bucket);
2035 brelse(xbs.bucket.bhs[i]);
2036 2822
2037 return ret; 2823 return ret;
2038} 2824}
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
2107 void *para); 2893 void *para);
2108 2894
2109static int ocfs2_find_xe_in_bucket(struct inode *inode, 2895static int ocfs2_find_xe_in_bucket(struct inode *inode,
2110 struct buffer_head *header_bh, 2896 struct ocfs2_xattr_bucket *bucket,
2111 int name_index, 2897 int name_index,
2112 const char *name, 2898 const char *name,
2113 u32 name_hash, 2899 u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2115 int *found) 2901 int *found)
2116{ 2902{
2117 int i, ret = 0, cmp = 1, block_off, new_offset; 2903 int i, ret = 0, cmp = 1, block_off, new_offset;
2118 struct ocfs2_xattr_header *xh = 2904 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2119 (struct ocfs2_xattr_header *)header_bh->b_data;
2120 size_t name_len = strlen(name); 2905 size_t name_len = strlen(name);
2121 struct ocfs2_xattr_entry *xe = NULL; 2906 struct ocfs2_xattr_entry *xe = NULL;
2122 struct buffer_head *name_bh = NULL;
2123 char *xe_name; 2907 char *xe_name;
2124 2908
2125 /* 2909 /*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2150 break; 2934 break;
2151 } 2935 }
2152 2936
2153 ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
2154 &name_bh);
2155 if (ret) {
2156 mlog_errno(ret);
2157 break;
2158 }
2159 xe_name = name_bh->b_data + new_offset;
2160 2937
2161 cmp = memcmp(name, xe_name, name_len); 2938 xe_name = bucket_block(bucket, block_off) + new_offset;
2162 brelse(name_bh); 2939 if (!memcmp(name, xe_name, name_len)) {
2163 name_bh = NULL;
2164
2165 if (cmp == 0) {
2166 *xe_index = i; 2940 *xe_index = i;
2167 *found = 1; 2941 *found = 1;
2168 ret = 0; 2942 ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2192 struct ocfs2_xattr_search *xs) 2966 struct ocfs2_xattr_search *xs)
2193{ 2967{
2194 int ret, found = 0; 2968 int ret, found = 0;
2195 struct buffer_head *bh = NULL;
2196 struct buffer_head *lower_bh = NULL;
2197 struct ocfs2_xattr_header *xh = NULL; 2969 struct ocfs2_xattr_header *xh = NULL;
2198 struct ocfs2_xattr_entry *xe = NULL; 2970 struct ocfs2_xattr_entry *xe = NULL;
2199 u16 index = 0; 2971 u16 index = 0;
2200 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2972 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2201 int low_bucket = 0, bucket, high_bucket; 2973 int low_bucket = 0, bucket, high_bucket;
2974 struct ocfs2_xattr_bucket *search;
2202 u32 last_hash; 2975 u32 last_hash;
2203 u64 blkno; 2976 u64 blkno, lower_blkno = 0;
2204 2977
2205 ret = ocfs2_read_block(inode, p_blkno, &bh); 2978 search = ocfs2_xattr_bucket_new(inode);
2979 if (!search) {
2980 ret = -ENOMEM;
2981 mlog_errno(ret);
2982 goto out;
2983 }
2984
2985 ret = ocfs2_read_xattr_bucket(search, p_blkno);
2206 if (ret) { 2986 if (ret) {
2207 mlog_errno(ret); 2987 mlog_errno(ret);
2208 goto out; 2988 goto out;
2209 } 2989 }
2210 2990
2211 xh = (struct ocfs2_xattr_header *)bh->b_data; 2991 xh = bucket_xh(search);
2212 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; 2992 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
2213
2214 while (low_bucket <= high_bucket) { 2993 while (low_bucket <= high_bucket) {
2215 brelse(bh); 2994 ocfs2_xattr_bucket_relse(search);
2216 bh = NULL;
2217 bucket = (low_bucket + high_bucket) / 2;
2218 2995
2996 bucket = (low_bucket + high_bucket) / 2;
2219 blkno = p_blkno + bucket * blk_per_bucket; 2997 blkno = p_blkno + bucket * blk_per_bucket;
2220 2998 ret = ocfs2_read_xattr_bucket(search, blkno);
2221 ret = ocfs2_read_block(inode, blkno, &bh);
2222 if (ret) { 2999 if (ret) {
2223 mlog_errno(ret); 3000 mlog_errno(ret);
2224 goto out; 3001 goto out;
2225 } 3002 }
2226 3003
2227 xh = (struct ocfs2_xattr_header *)bh->b_data; 3004 xh = bucket_xh(search);
2228 xe = &xh->xh_entries[0]; 3005 xe = &xh->xh_entries[0];
2229 if (name_hash < le32_to_cpu(xe->xe_name_hash)) { 3006 if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
2230 high_bucket = bucket - 1; 3007 high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2241 3018
2242 last_hash = le32_to_cpu(xe->xe_name_hash); 3019 last_hash = le32_to_cpu(xe->xe_name_hash);
2243 3020
2244 /* record lower_bh which may be the insert place. */ 3021 /* record lower_blkno which may be the insert place. */
2245 brelse(lower_bh); 3022 lower_blkno = blkno;
2246 lower_bh = bh;
2247 bh = NULL;
2248 3023
2249 if (name_hash > le32_to_cpu(xe->xe_name_hash)) { 3024 if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
2250 low_bucket = bucket + 1; 3025 low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2252 } 3027 }
2253 3028
2254 /* the searched xattr should reside in this bucket if exists. */ 3029 /* the searched xattr should reside in this bucket if exists. */
2255 ret = ocfs2_find_xe_in_bucket(inode, lower_bh, 3030 ret = ocfs2_find_xe_in_bucket(inode, search,
2256 name_index, name, name_hash, 3031 name_index, name, name_hash,
2257 &index, &found); 3032 &index, &found);
2258 if (ret) { 3033 if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2267 * When the xattr's hash value is in the gap of 2 buckets, we will 3042 * When the xattr's hash value is in the gap of 2 buckets, we will
2268 * always set it to the previous bucket. 3043 * always set it to the previous bucket.
2269 */ 3044 */
2270 if (!lower_bh) { 3045 if (!lower_blkno)
2271 /* 3046 lower_blkno = p_blkno;
2272 * We can't find any bucket whose first name_hash is less 3047
2273 * than the find name_hash. 3048 /* This should be in cache - we just read it during the search */
2274 */ 3049 ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
2275 BUG_ON(bh->b_blocknr != p_blkno); 3050 if (ret) {
2276 lower_bh = bh; 3051 mlog_errno(ret);
2277 bh = NULL; 3052 goto out;
2278 } 3053 }
2279 xs->bucket.bhs[0] = lower_bh;
2280 xs->bucket.xh = (struct ocfs2_xattr_header *)
2281 xs->bucket.bhs[0]->b_data;
2282 lower_bh = NULL;
2283 3054
2284 xs->header = xs->bucket.xh; 3055 xs->header = bucket_xh(xs->bucket);
2285 xs->base = xs->bucket.bhs[0]->b_data; 3056 xs->base = bucket_block(xs->bucket, 0);
2286 xs->end = xs->base + inode->i_sb->s_blocksize; 3057 xs->end = xs->base + inode->i_sb->s_blocksize;
2287 3058
2288 if (found) { 3059 if (found) {
2289 /*
2290 * If we have found the xattr enty, read all the blocks in
2291 * this bucket.
2292 */
2293 ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
2294 blk_per_bucket - 1, &xs->bucket.bhs[1],
2295 0);
2296 if (ret) {
2297 mlog_errno(ret);
2298 goto out;
2299 }
2300
2301 xs->here = &xs->header->xh_entries[index]; 3060 xs->here = &xs->header->xh_entries[index];
2302 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, 3061 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
2303 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); 3062 (unsigned long long)bucket_blkno(xs->bucket), index);
2304 } else 3063 } else
2305 ret = -ENODATA; 3064 ret = -ENODATA;
2306 3065
2307out: 3066out:
2308 brelse(bh); 3067 ocfs2_xattr_bucket_free(search);
2309 brelse(lower_bh);
2310 return ret; 3068 return ret;
2311} 3069}
2312 3070
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
2357 xattr_bucket_func *func, 3115 xattr_bucket_func *func,
2358 void *para) 3116 void *para)
2359{ 3117{
2360 int i, j, ret = 0; 3118 int i, ret = 0;
2361 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2362 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); 3119 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
2363 u32 num_buckets = clusters * bpc; 3120 u32 num_buckets = clusters * bpc;
2364 struct ocfs2_xattr_bucket bucket; 3121 struct ocfs2_xattr_bucket *bucket;
2365 3122
2366 memset(&bucket, 0, sizeof(bucket)); 3123 bucket = ocfs2_xattr_bucket_new(inode);
3124 if (!bucket) {
3125 mlog_errno(-ENOMEM);
3126 return -ENOMEM;
3127 }
2367 3128
2368 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", 3129 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
2369 clusters, (unsigned long long)blkno); 3130 clusters, (unsigned long long)blkno);
2370 3131
2371 for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { 3132 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
2372 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, 3133 ret = ocfs2_read_xattr_bucket(bucket, blkno);
2373 bucket.bhs, 0);
2374 if (ret) { 3134 if (ret) {
2375 mlog_errno(ret); 3135 mlog_errno(ret);
2376 goto out; 3136 break;
2377 } 3137 }
2378 3138
2379 bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
2380 /* 3139 /*
2381 * The real bucket num in this series of blocks is stored 3140 * The real bucket num in this series of blocks is stored
2382 * in the 1st bucket. 3141 * in the 1st bucket.
2383 */ 3142 */
2384 if (i == 0) 3143 if (i == 0)
2385 num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); 3144 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
2386 3145
2387 mlog(0, "iterating xattr bucket %llu, first hash %u\n", 3146 mlog(0, "iterating xattr bucket %llu, first hash %u\n",
2388 (unsigned long long)blkno, 3147 (unsigned long long)blkno,
2389 le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); 3148 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
2390 if (func) { 3149 if (func) {
2391 ret = func(inode, &bucket, para); 3150 ret = func(inode, bucket, para);
2392 if (ret) { 3151 if (ret)
2393 mlog_errno(ret); 3152 mlog_errno(ret);
2394 break; 3153 /* Fall through to bucket_relse() */
2395 }
2396 } 3154 }
2397 3155
2398 for (j = 0; j < blk_per_bucket; j++) 3156 ocfs2_xattr_bucket_relse(bucket);
2399 brelse(bucket.bhs[j]); 3157 if (ret)
2400 memset(&bucket, 0, sizeof(bucket)); 3158 break;
2401 } 3159 }
2402 3160
2403out: 3161 ocfs2_xattr_bucket_free(bucket);
2404 for (j = 0; j < blk_per_bucket; j++)
2405 brelse(bucket.bhs[j]);
2406
2407 return ret; 3162 return ret;
2408} 3163}
2409 3164
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
2441 int i, block_off, new_offset; 3196 int i, block_off, new_offset;
2442 const char *prefix, *name; 3197 const char *prefix, *name;
2443 3198
2444 for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { 3199 for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
2445 struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; 3200 struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
2446 type = ocfs2_xattr_get_type(entry); 3201 type = ocfs2_xattr_get_type(entry);
2447 prefix = ocfs2_xattr_prefix(type); 3202 prefix = ocfs2_xattr_prefix(type);
2448 3203
2449 if (prefix) { 3204 if (prefix) {
2450 ret = ocfs2_xattr_bucket_get_name_value(inode, 3205 ret = ocfs2_xattr_bucket_get_name_value(inode,
2451 bucket->xh, 3206 bucket_xh(bucket),
2452 i, 3207 i,
2453 &block_off, 3208 &block_off,
2454 &new_offset); 3209 &new_offset);
2455 if (ret) 3210 if (ret)
2456 break; 3211 break;
2457 3212
2458 name = (const char *)bucket->bhs[block_off]->b_data + 3213 name = (const char *)bucket_block(bucket, block_off) +
2459 new_offset; 3214 new_offset;
2460 ret = ocfs2_xattr_list_entry(xl->buffer, 3215 ret = ocfs2_xattr_list_entry(xl->buffer,
2461 xl->buffer_size, 3216 xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
2540/* 3295/*
2541 * When the ocfs2_xattr_block is filled up, new bucket will be created 3296 * When the ocfs2_xattr_block is filled up, new bucket will be created
2542 * and all the xattr entries will be moved to the new bucket. 3297 * and all the xattr entries will be moved to the new bucket.
3298 * The header goes at the start of the bucket, and the names+values are
3299 * filled from the end. This is why *target starts as the last buffer.
2543 * Note: we need to sort the entries since they are not saved in order 3300 * Note: we need to sort the entries since they are not saved in order
2544 * in the ocfs2_xattr_block. 3301 * in the ocfs2_xattr_block.
2545 */ 3302 */
2546static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, 3303static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2547 struct buffer_head *xb_bh, 3304 struct buffer_head *xb_bh,
2548 struct buffer_head *xh_bh, 3305 struct ocfs2_xattr_bucket *bucket)
2549 struct buffer_head *data_bh)
2550{ 3306{
2551 int i, blocksize = inode->i_sb->s_blocksize; 3307 int i, blocksize = inode->i_sb->s_blocksize;
3308 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2552 u16 offset, size, off_change; 3309 u16 offset, size, off_change;
2553 struct ocfs2_xattr_entry *xe; 3310 struct ocfs2_xattr_entry *xe;
2554 struct ocfs2_xattr_block *xb = 3311 struct ocfs2_xattr_block *xb =
2555 (struct ocfs2_xattr_block *)xb_bh->b_data; 3312 (struct ocfs2_xattr_block *)xb_bh->b_data;
2556 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; 3313 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
2557 struct ocfs2_xattr_header *xh = 3314 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2558 (struct ocfs2_xattr_header *)xh_bh->b_data;
2559 u16 count = le16_to_cpu(xb_xh->xh_count); 3315 u16 count = le16_to_cpu(xb_xh->xh_count);
2560 char *target = xh_bh->b_data, *src = xb_bh->b_data; 3316 char *src = xb_bh->b_data;
3317 char *target = bucket_block(bucket, blks - 1);
2561 3318
2562 mlog(0, "cp xattr from block %llu to bucket %llu\n", 3319 mlog(0, "cp xattr from block %llu to bucket %llu\n",
2563 (unsigned long long)xb_bh->b_blocknr, 3320 (unsigned long long)xb_bh->b_blocknr,
2564 (unsigned long long)xh_bh->b_blocknr); 3321 (unsigned long long)bucket_blkno(bucket));
3322
3323 for (i = 0; i < blks; i++)
3324 memset(bucket_block(bucket, i), 0, blocksize);
2565 3325
2566 memset(xh_bh->b_data, 0, blocksize);
2567 if (data_bh)
2568 memset(data_bh->b_data, 0, blocksize);
2569 /* 3326 /*
2570 * Since the xe_name_offset is based on ocfs2_xattr_header, 3327 * Since the xe_name_offset is based on ocfs2_xattr_header,
2571 * there is a offset change corresponding to the change of 3328 * there is a offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2577 size = blocksize - offset; 3334 size = blocksize - offset;
2578 3335
2579 /* copy all the names and values. */ 3336 /* copy all the names and values. */
2580 if (data_bh)
2581 target = data_bh->b_data;
2582 memcpy(target + offset, src + offset, size); 3337 memcpy(target + offset, src + offset, size);
2583 3338
2584 /* Init new header now. */ 3339 /* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2588 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); 3343 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
2589 3344
2590 /* copy all the entries. */ 3345 /* copy all the entries. */
2591 target = xh_bh->b_data; 3346 target = bucket_block(bucket, 0);
2592 offset = offsetof(struct ocfs2_xattr_header, xh_entries); 3347 offset = offsetof(struct ocfs2_xattr_header, xh_entries);
2593 size = count * sizeof(struct ocfs2_xattr_entry); 3348 size = count * sizeof(struct ocfs2_xattr_entry);
2594 memcpy(target + offset, (char *)xb_xh + offset, size); 3349 memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2614 * While if the entry is in index b-tree, "bucket" indicates the 3369 * While if the entry is in index b-tree, "bucket" indicates the
2615 * real place of the xattr. 3370 * real place of the xattr.
2616 */ 3371 */
2617static int ocfs2_xattr_update_xattr_search(struct inode *inode, 3372static void ocfs2_xattr_update_xattr_search(struct inode *inode,
2618 struct ocfs2_xattr_search *xs, 3373 struct ocfs2_xattr_search *xs,
2619 struct buffer_head *old_bh, 3374 struct buffer_head *old_bh)
2620 struct buffer_head *new_bh)
2621{ 3375{
2622 int ret = 0;
2623 char *buf = old_bh->b_data; 3376 char *buf = old_bh->b_data;
2624 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; 3377 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
2625 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; 3378 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
2626 int i, blocksize = inode->i_sb->s_blocksize; 3379 int i;
2627 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2628
2629 xs->bucket.bhs[0] = new_bh;
2630 get_bh(new_bh);
2631 xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
2632 xs->header = xs->bucket.xh;
2633 3380
2634 xs->base = new_bh->b_data; 3381 xs->header = bucket_xh(xs->bucket);
3382 xs->base = bucket_block(xs->bucket, 0);
2635 xs->end = xs->base + inode->i_sb->s_blocksize; 3383 xs->end = xs->base + inode->i_sb->s_blocksize;
2636 3384
2637 if (!xs->not_found) { 3385 if (xs->not_found)
2638 if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { 3386 return;
2639 ret = ocfs2_read_blocks(inode,
2640 xs->bucket.bhs[0]->b_blocknr + 1,
2641 blk_per_bucket - 1, &xs->bucket.bhs[1],
2642 0);
2643 if (ret) {
2644 mlog_errno(ret);
2645 return ret;
2646 }
2647
2648 }
2649 i = xs->here - old_xh->xh_entries;
2650 xs->here = &xs->header->xh_entries[i];
2651 }
2652 3387
2653 return ret; 3388 i = xs->here - old_xh->xh_entries;
3389 xs->here = &xs->header->xh_entries[i];
2654} 3390}
2655 3391
2656static int ocfs2_xattr_create_index_block(struct inode *inode, 3392static int ocfs2_xattr_create_index_block(struct inode *inode,
2657 struct ocfs2_xattr_search *xs) 3393 struct ocfs2_xattr_search *xs,
3394 struct ocfs2_xattr_set_ctxt *ctxt)
2658{ 3395{
2659 int ret, credits = OCFS2_SUBALLOC_ALLOC; 3396 int ret;
2660 u32 bit_off, len; 3397 u32 bit_off, len;
2661 u64 blkno; 3398 u64 blkno;
2662 handle_t *handle; 3399 handle_t *handle = ctxt->handle;
2663 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2664 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3401 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2665 struct ocfs2_alloc_context *data_ac;
2666 struct buffer_head *xh_bh = NULL, *data_bh = NULL;
2667 struct buffer_head *xb_bh = xs->xattr_bh; 3402 struct buffer_head *xb_bh = xs->xattr_bh;
2668 struct ocfs2_xattr_block *xb = 3403 struct ocfs2_xattr_block *xb =
2669 (struct ocfs2_xattr_block *)xb_bh->b_data; 3404 (struct ocfs2_xattr_block *)xb_bh->b_data;
2670 struct ocfs2_xattr_tree_root *xr; 3405 struct ocfs2_xattr_tree_root *xr;
2671 u16 xb_flags = le16_to_cpu(xb->xb_flags); 3406 u16 xb_flags = le16_to_cpu(xb->xb_flags);
2672 u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2673 3407
2674 mlog(0, "create xattr index block for %llu\n", 3408 mlog(0, "create xattr index block for %llu\n",
2675 (unsigned long long)xb_bh->b_blocknr); 3409 (unsigned long long)xb_bh->b_blocknr);
2676 3410
2677 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); 3411 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
2678 3412 BUG_ON(!xs->bucket);
2679 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
2680 if (ret) {
2681 mlog_errno(ret);
2682 goto out;
2683 }
2684 3413
2685 /* 3414 /*
2686 * XXX: 3415 * XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2689 */ 3418 */
2690 down_write(&oi->ip_alloc_sem); 3419 down_write(&oi->ip_alloc_sem);
2691 3420
2692 /* 3421 ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
2693 * 3 more credits, one for xattr block update, one for the 1st block 3422 OCFS2_JOURNAL_ACCESS_WRITE);
2694 * of the new xattr bucket and one for the value/data.
2695 */
2696 credits += 3;
2697 handle = ocfs2_start_trans(osb, credits);
2698 if (IS_ERR(handle)) {
2699 ret = PTR_ERR(handle);
2700 mlog_errno(ret);
2701 goto out_sem;
2702 }
2703
2704 ret = ocfs2_journal_access(handle, inode, xb_bh,
2705 OCFS2_JOURNAL_ACCESS_WRITE);
2706 if (ret) { 3423 if (ret) {
2707 mlog_errno(ret); 3424 mlog_errno(ret);
2708 goto out_commit; 3425 goto out;
2709 } 3426 }
2710 3427
2711 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 3428 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
3429 1, 1, &bit_off, &len);
2712 if (ret) { 3430 if (ret) {
2713 mlog_errno(ret); 3431 mlog_errno(ret);
2714 goto out_commit; 3432 goto out;
2715 } 3433 }
2716 3434
2717 /* 3435 /*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2724 mlog(0, "allocate 1 cluster from %llu to xattr block\n", 3442 mlog(0, "allocate 1 cluster from %llu to xattr block\n",
2725 (unsigned long long)blkno); 3443 (unsigned long long)blkno);
2726 3444
2727 xh_bh = sb_getblk(inode->i_sb, blkno); 3445 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
2728 if (!xh_bh) { 3446 if (ret) {
2729 ret = -EIO;
2730 mlog_errno(ret); 3447 mlog_errno(ret);
2731 goto out_commit; 3448 goto out;
2732 } 3449 }
2733 3450
2734 ocfs2_set_new_buffer_uptodate(inode, xh_bh); 3451 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
2735 3452 OCFS2_JOURNAL_ACCESS_CREATE);
2736 ret = ocfs2_journal_access(handle, inode, xh_bh,
2737 OCFS2_JOURNAL_ACCESS_CREATE);
2738 if (ret) { 3453 if (ret) {
2739 mlog_errno(ret); 3454 mlog_errno(ret);
2740 goto out_commit; 3455 goto out;
2741 }
2742
2743 if (bpb > 1) {
2744 data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
2745 if (!data_bh) {
2746 ret = -EIO;
2747 mlog_errno(ret);
2748 goto out_commit;
2749 }
2750
2751 ocfs2_set_new_buffer_uptodate(inode, data_bh);
2752
2753 ret = ocfs2_journal_access(handle, inode, data_bh,
2754 OCFS2_JOURNAL_ACCESS_CREATE);
2755 if (ret) {
2756 mlog_errno(ret);
2757 goto out_commit;
2758 }
2759 } 3456 }
2760 3457
2761 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); 3458 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
3459 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
2762 3460
2763 ocfs2_journal_dirty(handle, xh_bh); 3461 ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
2764 if (data_bh)
2765 ocfs2_journal_dirty(handle, data_bh);
2766
2767 ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
2768 if (ret) {
2769 mlog_errno(ret);
2770 goto out_commit;
2771 }
2772 3462
2773 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ 3463 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
2774 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - 3464 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2787 3477
2788 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); 3478 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
2789 3479
2790 ret = ocfs2_journal_dirty(handle, xb_bh); 3480 ocfs2_journal_dirty(handle, xb_bh);
2791 if (ret) {
2792 mlog_errno(ret);
2793 goto out_commit;
2794 }
2795
2796out_commit:
2797 ocfs2_commit_trans(osb, handle);
2798
2799out_sem:
2800 up_write(&oi->ip_alloc_sem);
2801 3481
2802out: 3482out:
2803 if (data_ac) 3483 up_write(&oi->ip_alloc_sem);
2804 ocfs2_free_alloc_context(data_ac);
2805
2806 brelse(xh_bh);
2807 brelse(data_bh);
2808 3484
2809 return ret; 3485 return ret;
2810} 3486}
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
2829 * so that we can spare some space for insertion. 3505 * so that we can spare some space for insertion.
2830 */ 3506 */
2831static int ocfs2_defrag_xattr_bucket(struct inode *inode, 3507static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3508 handle_t *handle,
2832 struct ocfs2_xattr_bucket *bucket) 3509 struct ocfs2_xattr_bucket *bucket)
2833{ 3510{
2834 int ret, i; 3511 int ret, i;
2835 size_t end, offset, len, value_len; 3512 size_t end, offset, len, value_len;
2836 struct ocfs2_xattr_header *xh; 3513 struct ocfs2_xattr_header *xh;
2837 char *entries, *buf, *bucket_buf = NULL; 3514 char *entries, *buf, *bucket_buf = NULL;
2838 u64 blkno = bucket->bhs[0]->b_blocknr; 3515 u64 blkno = bucket_blkno(bucket);
2839 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2840 u16 xh_free_start; 3516 u16 xh_free_start;
2841 size_t blocksize = inode->i_sb->s_blocksize; 3517 size_t blocksize = inode->i_sb->s_blocksize;
2842 handle_t *handle;
2843 struct buffer_head **bhs;
2844 struct ocfs2_xattr_entry *xe; 3518 struct ocfs2_xattr_entry *xe;
2845 3519
2846 bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
2847 GFP_NOFS);
2848 if (!bhs)
2849 return -ENOMEM;
2850
2851 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
2852 if (ret)
2853 goto out;
2854
2855 /* 3520 /*
2856 * In order to make the operation more efficient and generic, 3521 * In order to make the operation more efficient and generic,
2857 * we copy all the blocks into a contiguous memory and do the 3522 * we copy all the blocks into a contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2865 } 3530 }
2866 3531
2867 buf = bucket_buf; 3532 buf = bucket_buf;
2868 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) 3533 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2869 memcpy(buf, bhs[i]->b_data, blocksize); 3534 memcpy(buf, bucket_block(bucket, i), blocksize);
2870 3535
2871 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); 3536 ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
2872 if (IS_ERR(handle)) { 3537 OCFS2_JOURNAL_ACCESS_WRITE);
2873 ret = PTR_ERR(handle); 3538 if (ret < 0) {
2874 handle = NULL;
2875 mlog_errno(ret); 3539 mlog_errno(ret);
2876 goto out; 3540 goto out;
2877 } 3541 }
2878 3542
2879 for (i = 0; i < blk_per_bucket; i++) {
2880 ret = ocfs2_journal_access(handle, inode, bhs[i],
2881 OCFS2_JOURNAL_ACCESS_WRITE);
2882 if (ret < 0) {
2883 mlog_errno(ret);
2884 goto commit;
2885 }
2886 }
2887
2888 xh = (struct ocfs2_xattr_header *)bucket_buf; 3543 xh = (struct ocfs2_xattr_header *)bucket_buf;
2889 entries = (char *)xh->xh_entries; 3544 entries = (char *)xh->xh_entries;
2890 xh_free_start = le16_to_cpu(xh->xh_free_start); 3545 xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2940 "bucket %llu\n", (unsigned long long)blkno); 3595 "bucket %llu\n", (unsigned long long)blkno);
2941 3596
2942 if (xh_free_start == end) 3597 if (xh_free_start == end)
2943 goto commit; 3598 goto out;
2944 3599
2945 memset(bucket_buf + xh_free_start, 0, end - xh_free_start); 3600 memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
2946 xh->xh_free_start = cpu_to_le16(end); 3601 xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2951 cmp_xe, swap_xe); 3606 cmp_xe, swap_xe);
2952 3607
2953 buf = bucket_buf; 3608 buf = bucket_buf;
2954 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { 3609 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2955 memcpy(bhs[i]->b_data, buf, blocksize); 3610 memcpy(bucket_block(bucket, i), buf, blocksize);
2956 ocfs2_journal_dirty(handle, bhs[i]); 3611 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
2957 }
2958 3612
2959commit:
2960 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2961out: 3613out:
2962
2963 if (bhs) {
2964 for (i = 0; i < blk_per_bucket; i++)
2965 brelse(bhs[i]);
2966 }
2967 kfree(bhs);
2968
2969 kfree(bucket_buf); 3614 kfree(bucket_buf);
2970 return ret; 3615 return ret;
2971} 3616}
2972 3617
2973/* 3618/*
2974 * Move half nums of the xattr bucket in the previous cluster to this new 3619 * prev_blkno points to the start of an existing extent. new_blkno
2975 * cluster. We only touch the last cluster of the previous extend record. 3620 * points to a newly allocated extent. Because we know each of our
3621 * clusters contains more than bucket, we can easily split one cluster
3622 * at a bucket boundary. So we take the last cluster of the existing
3623 * extent and split it down the middle. We move the last half of the
3624 * buckets in the last cluster of the existing extent over to the new
3625 * extent.
3626 *
3627 * first_bh is the buffer at prev_blkno so we can update the existing
3628 * extent's bucket count. header_bh is the bucket were we were hoping
3629 * to insert our xattr. If the bucket move places the target in the new
3630 * extent, we'll update first_bh and header_bh after modifying the old
3631 * extent.
2976 * 3632 *
2977 * first_bh is the first buffer_head of a series of bucket in the same 3633 * first_hash will be set as the 1st xe's name_hash in the new extent.
2978 * extent rec and header_bh is the header of one bucket in this cluster.
2979 * They will be updated if we move the data header_bh contains to the new
2980 * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
2981 */ 3634 */
2982static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, 3635static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
2983 handle_t *handle, 3636 handle_t *handle,
2984 struct buffer_head **first_bh, 3637 struct ocfs2_xattr_bucket *first,
2985 struct buffer_head **header_bh, 3638 struct ocfs2_xattr_bucket *target,
2986 u64 new_blkno, 3639 u64 new_blkno,
2987 u64 prev_blkno,
2988 u32 num_clusters, 3640 u32 num_clusters,
2989 u32 *first_hash) 3641 u32 *first_hash)
2990{ 3642{
2991 int i, ret, credits; 3643 int ret;
2992 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3644 struct super_block *sb = inode->i_sb;
2993 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 3645 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
2994 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 3646 int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
2995 int blocksize = inode->i_sb->s_blocksize; 3647 int to_move = num_buckets / 2;
2996 struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; 3648 u64 src_blkno;
2997 struct ocfs2_xattr_header *new_xh; 3649 u64 last_cluster_blkno = bucket_blkno(first) +
2998 struct ocfs2_xattr_header *xh = 3650 ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
2999 (struct ocfs2_xattr_header *)((*first_bh)->b_data);
3000
3001 BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
3002 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
3003
3004 prev_bh = *first_bh;
3005 get_bh(prev_bh);
3006 xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
3007 3651
3008 prev_blkno += (num_clusters - 1) * bpc + bpc / 2; 3652 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
3653 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
3009 3654
3010 mlog(0, "move half of xattrs in cluster %llu to %llu\n", 3655 mlog(0, "move half of xattrs in cluster %llu to %llu\n",
3011 (unsigned long long)prev_blkno, (unsigned long long)new_blkno); 3656 (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
3012 3657
3013 /* 3658 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
3014 * We need to update the 1st half of the new cluster and 3659 last_cluster_blkno, new_blkno,
3015 * 1 more for the update of the 1st bucket of the previous 3660 to_move, first_hash);
3016 * extent record.
3017 */
3018 credits = bpc / 2 + 1;
3019 ret = ocfs2_extend_trans(handle, credits);
3020 if (ret) { 3661 if (ret) {
3021 mlog_errno(ret); 3662 mlog_errno(ret);
3022 goto out; 3663 goto out;
3023 } 3664 }
3024 3665
3025 ret = ocfs2_journal_access(handle, inode, prev_bh, 3666 /* This is the first bucket that got moved */
3026 OCFS2_JOURNAL_ACCESS_WRITE); 3667 src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
3027 if (ret) {
3028 mlog_errno(ret);
3029 goto out;
3030 }
3031 3668
3032 for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { 3669 /*
3033 old_bh = new_bh = NULL; 3670 * If the target bucket was part of the moved buckets, we need to
3034 new_bh = sb_getblk(inode->i_sb, new_blkno); 3671 * update first and target.
3035 if (!new_bh) { 3672 */
3036 ret = -EIO; 3673 if (bucket_blkno(target) >= src_blkno) {
3037 mlog_errno(ret); 3674 /* Find the block for the new target bucket */
3038 goto out; 3675 src_blkno = new_blkno +
3039 } 3676 (bucket_blkno(target) - src_blkno);
3040 3677
3041 ocfs2_set_new_buffer_uptodate(inode, new_bh); 3678 ocfs2_xattr_bucket_relse(first);
3679 ocfs2_xattr_bucket_relse(target);
3042 3680
3043 ret = ocfs2_journal_access(handle, inode, new_bh, 3681 /*
3044 OCFS2_JOURNAL_ACCESS_CREATE); 3682 * These shouldn't fail - the buffers are in the
3045 if (ret < 0) { 3683 * journal from ocfs2_cp_xattr_bucket().
3684 */
3685 ret = ocfs2_read_xattr_bucket(first, new_blkno);
3686 if (ret) {
3046 mlog_errno(ret); 3687 mlog_errno(ret);
3047 brelse(new_bh);
3048 goto out; 3688 goto out;
3049 } 3689 }
3050 3690 ret = ocfs2_read_xattr_bucket(target, src_blkno);
3051 ret = ocfs2_read_block(inode, prev_blkno, &old_bh); 3691 if (ret)
3052 if (ret < 0) {
3053 mlog_errno(ret); 3692 mlog_errno(ret);
3054 brelse(new_bh);
3055 goto out;
3056 }
3057 3693
3058 memcpy(new_bh->b_data, old_bh->b_data, blocksize);
3059
3060 if (i == 0) {
3061 new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
3062 new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
3063
3064 if (first_hash)
3065 *first_hash = le32_to_cpu(
3066 new_xh->xh_entries[0].xe_name_hash);
3067 new_first_bh = new_bh;
3068 get_bh(new_first_bh);
3069 }
3070
3071 ocfs2_journal_dirty(handle, new_bh);
3072
3073 if (*header_bh == old_bh) {
3074 brelse(*header_bh);
3075 *header_bh = new_bh;
3076 get_bh(*header_bh);
3077
3078 brelse(*first_bh);
3079 *first_bh = new_first_bh;
3080 get_bh(*first_bh);
3081 }
3082 brelse(new_bh);
3083 brelse(old_bh);
3084 } 3694 }
3085 3695
3086 le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
3087
3088 ocfs2_journal_dirty(handle, prev_bh);
3089out: 3696out:
3090 brelse(prev_bh);
3091 brelse(new_first_bh);
3092 return ret;
3093}
3094
3095static int ocfs2_read_xattr_bucket(struct inode *inode,
3096 u64 blkno,
3097 struct buffer_head **bhs,
3098 int new)
3099{
3100 int ret = 0;
3101 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3102
3103 if (!new)
3104 return ocfs2_read_blocks(inode, blkno,
3105 blk_per_bucket, bhs, 0);
3106
3107 for (i = 0; i < blk_per_bucket; i++) {
3108 bhs[i] = sb_getblk(inode->i_sb, blkno + i);
3109 if (bhs[i] == NULL) {
3110 ret = -EIO;
3111 mlog_errno(ret);
3112 break;
3113 }
3114 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
3115 }
3116
3117 return ret; 3697 return ret;
3118} 3698}
3119 3699
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3178{ 3758{
3179 int ret, i; 3759 int ret, i;
3180 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 3760 int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
3181 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3761 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3182 struct buffer_head **s_bhs, **t_bhs = NULL;
3183 struct ocfs2_xattr_header *xh; 3762 struct ocfs2_xattr_header *xh;
3184 struct ocfs2_xattr_entry *xe; 3763 struct ocfs2_xattr_entry *xe;
3185 int blocksize = inode->i_sb->s_blocksize; 3764 int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3187 mlog(0, "move some of xattrs from bucket %llu to %llu\n", 3766 mlog(0, "move some of xattrs from bucket %llu to %llu\n",
3188 (unsigned long long)blk, (unsigned long long)new_blk); 3767 (unsigned long long)blk, (unsigned long long)new_blk);
3189 3768
3190 s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3769 s_bucket = ocfs2_xattr_bucket_new(inode);
3191 if (!s_bhs) 3770 t_bucket = ocfs2_xattr_bucket_new(inode);
3192 return -ENOMEM; 3771 if (!s_bucket || !t_bucket) {
3193 3772 ret = -ENOMEM;
3194 ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
3195 if (ret) {
3196 mlog_errno(ret); 3773 mlog_errno(ret);
3197 goto out; 3774 goto out;
3198 } 3775 }
3199 3776
3200 ret = ocfs2_journal_access(handle, inode, s_bhs[0], 3777 ret = ocfs2_read_xattr_bucket(s_bucket, blk);
3201 OCFS2_JOURNAL_ACCESS_WRITE);
3202 if (ret) { 3778 if (ret) {
3203 mlog_errno(ret); 3779 mlog_errno(ret);
3204 goto out; 3780 goto out;
3205 } 3781 }
3206 3782
3207 t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3783 ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
3208 if (!t_bhs) { 3784 OCFS2_JOURNAL_ACCESS_WRITE);
3209 ret = -ENOMEM; 3785 if (ret) {
3786 mlog_errno(ret);
3210 goto out; 3787 goto out;
3211 } 3788 }
3212 3789
3213 ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); 3790 /*
3791 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
3792 * there's no need to read it.
3793 */
3794 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
3214 if (ret) { 3795 if (ret) {
3215 mlog_errno(ret); 3796 mlog_errno(ret);
3216 goto out; 3797 goto out;
3217 } 3798 }
3218 3799
3219 for (i = 0; i < blk_per_bucket; i++) { 3800 /*
3220 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3801 * Hey, if we're overwriting t_bucket, what difference does
3221 new_bucket_head ? 3802 * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the
3222 OCFS2_JOURNAL_ACCESS_CREATE : 3803 * same part of ocfs2_cp_xattr_bucket().
3223 OCFS2_JOURNAL_ACCESS_WRITE); 3804 */
3224 if (ret) { 3805 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3225 mlog_errno(ret); 3806 new_bucket_head ?
3226 goto out; 3807 OCFS2_JOURNAL_ACCESS_CREATE :
3227 } 3808 OCFS2_JOURNAL_ACCESS_WRITE);
3809 if (ret) {
3810 mlog_errno(ret);
3811 goto out;
3228 } 3812 }
3229 3813
3230 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3814 xh = bucket_xh(s_bucket);
3231 count = le16_to_cpu(xh->xh_count); 3815 count = le16_to_cpu(xh->xh_count);
3232 start = ocfs2_xattr_find_divide_pos(xh); 3816 start = ocfs2_xattr_find_divide_pos(xh);
3233 3817
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3239 * The hash value is set as one larger than 3823 * The hash value is set as one larger than
3240 * that of the last entry in the previous bucket. 3824 * that of the last entry in the previous bucket.
3241 */ 3825 */
3242 for (i = 0; i < blk_per_bucket; i++) 3826 for (i = 0; i < t_bucket->bu_blocks; i++)
3243 memset(t_bhs[i]->b_data, 0, blocksize); 3827 memset(bucket_block(t_bucket, i), 0, blocksize);
3244 3828
3245 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3829 xh = bucket_xh(t_bucket);
3246 xh->xh_free_start = cpu_to_le16(blocksize); 3830 xh->xh_free_start = cpu_to_le16(blocksize);
3247 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; 3831 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
3248 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); 3832 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3251 } 3835 }
3252 3836
3253 /* copy the whole bucket to the new first. */ 3837 /* copy the whole bucket to the new first. */
3254 for (i = 0; i < blk_per_bucket; i++) 3838 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3255 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3256 3839
3257 /* update the new bucket. */ 3840 /* update the new bucket. */
3258 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3841 xh = bucket_xh(t_bucket);
3259 3842
3260 /* 3843 /*
3261 * Calculate the total name/value len and xh_free_start for 3844 * Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
3319 else 3902 else
3320 xh->xh_num_buckets = 0; 3903 xh->xh_num_buckets = 0;
3321 3904
3322 for (i = 0; i < blk_per_bucket; i++) { 3905 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3323 ocfs2_journal_dirty(handle, t_bhs[i]);
3324 if (ret)
3325 mlog_errno(ret);
3326 }
3327 3906
3328 /* store the first_hash of the new bucket. */ 3907 /* store the first_hash of the new bucket. */
3329 if (first_hash) 3908 if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
3337 if (start == count) 3916 if (start == count)
3338 goto out; 3917 goto out;
3339 3918
3340 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3919 xh = bucket_xh(s_bucket);
3341 memset(&xh->xh_entries[start], 0, 3920 memset(&xh->xh_entries[start], 0,
3342 sizeof(struct ocfs2_xattr_entry) * (count - start)); 3921 sizeof(struct ocfs2_xattr_entry) * (count - start));
3343 xh->xh_count = cpu_to_le16(start); 3922 xh->xh_count = cpu_to_le16(start);
3344 xh->xh_free_start = cpu_to_le16(name_offset); 3923 xh->xh_free_start = cpu_to_le16(name_offset);
3345 xh->xh_name_value_len = cpu_to_le16(name_value_len); 3924 xh->xh_name_value_len = cpu_to_le16(name_value_len);
3346 3925
3347 ocfs2_journal_dirty(handle, s_bhs[0]); 3926 ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
3348 if (ret)
3349 mlog_errno(ret);
3350 3927
3351out: 3928out:
3352 if (s_bhs) { 3929 ocfs2_xattr_bucket_free(s_bucket);
3353 for (i = 0; i < blk_per_bucket; i++) 3930 ocfs2_xattr_bucket_free(t_bucket);
3354 brelse(s_bhs[i]);
3355 }
3356 kfree(s_bhs);
3357
3358 if (t_bhs) {
3359 for (i = 0; i < blk_per_bucket; i++)
3360 brelse(t_bhs[i]);
3361 }
3362 kfree(t_bhs);
3363 3931
3364 return ret; 3932 return ret;
3365} 3933}
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3376 u64 t_blkno, 3944 u64 t_blkno,
3377 int t_is_new) 3945 int t_is_new)
3378{ 3946{
3379 int ret, i; 3947 int ret;
3380 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3948 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3381 int blocksize = inode->i_sb->s_blocksize;
3382 struct buffer_head **s_bhs, **t_bhs = NULL;
3383 3949
3384 BUG_ON(s_blkno == t_blkno); 3950 BUG_ON(s_blkno == t_blkno);
3385 3951
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3387 (unsigned long long)s_blkno, (unsigned long long)t_blkno, 3953 (unsigned long long)s_blkno, (unsigned long long)t_blkno,
3388 t_is_new); 3954 t_is_new);
3389 3955
3390 s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3956 s_bucket = ocfs2_xattr_bucket_new(inode);
3391 GFP_NOFS); 3957 t_bucket = ocfs2_xattr_bucket_new(inode);
3392 if (!s_bhs) 3958 if (!s_bucket || !t_bucket) {
3393 return -ENOMEM; 3959 ret = -ENOMEM;
3960 mlog_errno(ret);
3961 goto out;
3962 }
3394 3963
3395 ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); 3964 ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
3396 if (ret) 3965 if (ret)
3397 goto out; 3966 goto out;
3398 3967
3399 t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3968 /*
3400 GFP_NOFS); 3969 * Even if !t_is_new, we're overwriting t_bucket. Thus,
3401 if (!t_bhs) { 3970 * there's no need to read it.
3402 ret = -ENOMEM; 3971 */
3972 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
3973 if (ret)
3403 goto out; 3974 goto out;
3404 }
3405 3975
3406 ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); 3976 /*
3977 * Hey, if we're overwriting t_bucket, what difference does
3978 * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new
3979 * cluster to fill, we came here from
3980 * ocfs2_mv_xattr_buckets(), and it is really new -
3981 * ACCESS_CREATE is required. But we also might have moved data
3982 * out of t_bucket before extending back into it.
3983 * ocfs2_add_new_xattr_bucket() can do this - its call to
3984 * ocfs2_add_new_xattr_cluster() may have created a new extent
3985 * and copied out the end of the old extent. Then it re-extends
3986 * the old extent back to create space for new xattrs. That's
3987 * how we get here, and the bucket isn't really new.
3988 */
3989 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3990 t_is_new ?
3991 OCFS2_JOURNAL_ACCESS_CREATE :
3992 OCFS2_JOURNAL_ACCESS_WRITE);
3407 if (ret) 3993 if (ret)
3408 goto out; 3994 goto out;
3409 3995
3410 for (i = 0; i < blk_per_bucket; i++) { 3996 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3411 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3997 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3412 t_is_new ?
3413 OCFS2_JOURNAL_ACCESS_CREATE :
3414 OCFS2_JOURNAL_ACCESS_WRITE);
3415 if (ret)
3416 goto out;
3417 }
3418
3419 for (i = 0; i < blk_per_bucket; i++) {
3420 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3421 ocfs2_journal_dirty(handle, t_bhs[i]);
3422 }
3423 3998
3424out: 3999out:
3425 if (s_bhs) { 4000 ocfs2_xattr_bucket_free(t_bucket);
3426 for (i = 0; i < blk_per_bucket; i++) 4001 ocfs2_xattr_bucket_free(s_bucket);
3427 brelse(s_bhs[i]);
3428 }
3429 kfree(s_bhs);
3430
3431 if (t_bhs) {
3432 for (i = 0; i < blk_per_bucket; i++)
3433 brelse(t_bhs[i]);
3434 }
3435 kfree(t_bhs);
3436 4002
3437 return ret; 4003 return ret;
3438} 4004}
3439 4005
3440/* 4006/*
3441 * Copy one xattr cluster from src_blk to to_blk. 4007 * src_blk points to the start of an existing extent. last_blk points to
3442 * The to_blk will become the first bucket header of the cluster, so its 4008 * last cluster in that extent. to_blk points to a newly allocated
3443 * xh_num_buckets will be initialized as the bucket num in the cluster. 4009 * extent. We copy the buckets from the cluster at last_blk to the new
4010 * extent. If start_bucket is non-zero, we skip that many buckets before
4011 * we start copying. The new extent's xh_num_buckets gets set to the
4012 * number of buckets we copied. The old extent's xh_num_buckets shrinks
4013 * by the same amount.
3444 */ 4014 */
3445static int ocfs2_cp_xattr_cluster(struct inode *inode, 4015static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
3446 handle_t *handle, 4016 u64 src_blk, u64 last_blk, u64 to_blk,
3447 struct buffer_head *first_bh, 4017 unsigned int start_bucket,
3448 u64 src_blk,
3449 u64 to_blk,
3450 u32 *first_hash) 4018 u32 *first_hash)
3451{ 4019{
3452 int i, ret, credits; 4020 int i, ret, credits;
3453 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4021 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3454 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4022 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3455 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 4023 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3456 struct buffer_head *bh = NULL; 4024 struct ocfs2_xattr_bucket *old_first, *new_first;
3457 struct ocfs2_xattr_header *xh; 4025
3458 u64 to_blk_start = to_blk; 4026 mlog(0, "mv xattrs from cluster %llu to %llu\n",
4027 (unsigned long long)last_blk, (unsigned long long)to_blk);
4028
4029 BUG_ON(start_bucket >= num_buckets);
4030 if (start_bucket) {
4031 num_buckets -= start_bucket;
4032 last_blk += (start_bucket * blks_per_bucket);
4033 }
4034
4035 /* The first bucket of the original extent */
4036 old_first = ocfs2_xattr_bucket_new(inode);
4037 /* The first bucket of the new extent */
4038 new_first = ocfs2_xattr_bucket_new(inode);
4039 if (!old_first || !new_first) {
4040 ret = -ENOMEM;
4041 mlog_errno(ret);
4042 goto out;
4043 }
3459 4044
3460 mlog(0, "cp xattrs from cluster %llu to %llu\n", 4045 ret = ocfs2_read_xattr_bucket(old_first, src_blk);
3461 (unsigned long long)src_blk, (unsigned long long)to_blk); 4046 if (ret) {
4047 mlog_errno(ret);
4048 goto out;
4049 }
3462 4050
3463 /* 4051 /*
3464 * We need to update the new cluster and 1 more for the update of 4052 * We need to update the first bucket of the old extent and all
3465 * the 1st bucket of the previous extent rec. 4053 * the buckets going to the new extent.
3466 */ 4054 */
3467 credits = bpc + 1; 4055 credits = ((num_buckets + 1) * blks_per_bucket) +
4056 handle->h_buffer_credits;
3468 ret = ocfs2_extend_trans(handle, credits); 4057 ret = ocfs2_extend_trans(handle, credits);
3469 if (ret) { 4058 if (ret) {
3470 mlog_errno(ret); 4059 mlog_errno(ret);
3471 goto out; 4060 goto out;
3472 } 4061 }
3473 4062
3474 ret = ocfs2_journal_access(handle, inode, first_bh, 4063 ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
3475 OCFS2_JOURNAL_ACCESS_WRITE); 4064 OCFS2_JOURNAL_ACCESS_WRITE);
3476 if (ret) { 4065 if (ret) {
3477 mlog_errno(ret); 4066 mlog_errno(ret);
3478 goto out; 4067 goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
3480 4069
3481 for (i = 0; i < num_buckets; i++) { 4070 for (i = 0; i < num_buckets; i++) {
3482 ret = ocfs2_cp_xattr_bucket(inode, handle, 4071 ret = ocfs2_cp_xattr_bucket(inode, handle,
3483 src_blk, to_blk, 1); 4072 last_blk + (i * blks_per_bucket),
4073 to_blk + (i * blks_per_bucket),
4074 1);
3484 if (ret) { 4075 if (ret) {
3485 mlog_errno(ret); 4076 mlog_errno(ret);
3486 goto out; 4077 goto out;
3487 } 4078 }
3488
3489 src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3490 to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3491 } 4079 }
3492 4080
3493 /* update the old bucket header. */ 4081 /*
3494 xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4082 * Get the new bucket ready before we dirty anything
3495 le16_add_cpu(&xh->xh_num_buckets, -num_buckets); 4083 * (This actually shouldn't fail, because we already dirtied
3496 4084 * it once in ocfs2_cp_xattr_bucket()).
3497 ocfs2_journal_dirty(handle, first_bh); 4085 */
3498 4086 ret = ocfs2_read_xattr_bucket(new_first, to_blk);
3499 /* update the new bucket header. */ 4087 if (ret) {
3500 ret = ocfs2_read_block(inode, to_blk_start, &bh);
3501 if (ret < 0) {
3502 mlog_errno(ret); 4088 mlog_errno(ret);
3503 goto out; 4089 goto out;
3504 } 4090 }
3505 4091 ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
3506 ret = ocfs2_journal_access(handle, inode, bh, 4092 OCFS2_JOURNAL_ACCESS_WRITE);
3507 OCFS2_JOURNAL_ACCESS_WRITE);
3508 if (ret) { 4093 if (ret) {
3509 mlog_errno(ret); 4094 mlog_errno(ret);
3510 goto out; 4095 goto out;
3511 } 4096 }
3512 4097
3513 xh = (struct ocfs2_xattr_header *)bh->b_data; 4098 /* Now update the headers */
3514 xh->xh_num_buckets = cpu_to_le16(num_buckets); 4099 le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
4100 ocfs2_xattr_bucket_journal_dirty(handle, old_first);
3515 4101
3516 ocfs2_journal_dirty(handle, bh); 4102 bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
4103 ocfs2_xattr_bucket_journal_dirty(handle, new_first);
3517 4104
3518 if (first_hash) 4105 if (first_hash)
3519 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4106 *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
4107
3520out: 4108out:
3521 brelse(bh); 4109 ocfs2_xattr_bucket_free(new_first);
4110 ocfs2_xattr_bucket_free(old_first);
3522 return ret; 4111 return ret;
3523} 4112}
3524 4113
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3534 u32 *first_hash) 4123 u32 *first_hash)
3535{ 4124{
3536 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4125 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3537 int ret, credits = 2 * blk_per_bucket; 4126 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
3538 4127
3539 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4128 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
3540 4129
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3577 */ 4166 */
3578static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, 4167static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3579 handle_t *handle, 4168 handle_t *handle,
3580 struct buffer_head **first_bh, 4169 struct ocfs2_xattr_bucket *first,
3581 struct buffer_head **header_bh, 4170 struct ocfs2_xattr_bucket *target,
3582 u64 new_blk, 4171 u64 new_blk,
3583 u64 prev_blk,
3584 u32 prev_clusters, 4172 u32 prev_clusters,
3585 u32 *v_start, 4173 u32 *v_start,
3586 int *extend) 4174 int *extend)
3587{ 4175{
3588 int ret = 0; 4176 int ret;
3589 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3590 4177
3591 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", 4178 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
3592 (unsigned long long)prev_blk, prev_clusters, 4179 (unsigned long long)bucket_blkno(first), prev_clusters,
3593 (unsigned long long)new_blk); 4180 (unsigned long long)new_blk);
3594 4181
3595 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) 4182 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
3596 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, 4183 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
3597 handle, 4184 handle,
3598 first_bh, 4185 first, target,
3599 header_bh,
3600 new_blk, 4186 new_blk,
3601 prev_blk,
3602 prev_clusters, 4187 prev_clusters,
3603 v_start); 4188 v_start);
3604 else { 4189 if (ret)
3605 u64 last_blk = prev_blk + bpc * (prev_clusters - 1); 4190 mlog_errno(ret);
3606 4191 } else {
3607 if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) 4192 /* The start of the last cluster in the first extent */
3608 ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, 4193 u64 last_blk = bucket_blkno(first) +
3609 last_blk, new_blk, 4194 ((prev_clusters - 1) *
4195 ocfs2_clusters_to_blocks(inode->i_sb, 1));
4196
4197 if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
4198 ret = ocfs2_mv_xattr_buckets(inode, handle,
4199 bucket_blkno(first),
4200 last_blk, new_blk, 0,
3610 v_start); 4201 v_start);
3611 else { 4202 if (ret)
4203 mlog_errno(ret);
4204 } else {
3612 ret = ocfs2_divide_xattr_cluster(inode, handle, 4205 ret = ocfs2_divide_xattr_cluster(inode, handle,
3613 last_blk, new_blk, 4206 last_blk, new_blk,
3614 v_start); 4207 v_start);
4208 if (ret)
4209 mlog_errno(ret);
3615 4210
3616 if ((*header_bh)->b_blocknr == last_blk && extend) 4211 if ((bucket_blkno(target) == last_blk) && extend)
3617 *extend = 0; 4212 *extend = 0;
3618 } 4213 }
3619 } 4214 }
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3639 */ 4234 */
3640static int ocfs2_add_new_xattr_cluster(struct inode *inode, 4235static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3641 struct buffer_head *root_bh, 4236 struct buffer_head *root_bh,
3642 struct buffer_head **first_bh, 4237 struct ocfs2_xattr_bucket *first,
3643 struct buffer_head **header_bh, 4238 struct ocfs2_xattr_bucket *target,
3644 u32 *num_clusters, 4239 u32 *num_clusters,
3645 u32 prev_cpos, 4240 u32 prev_cpos,
3646 u64 prev_blkno, 4241 int *extend,
3647 int *extend) 4242 struct ocfs2_xattr_set_ctxt *ctxt)
3648{ 4243{
3649 int ret, credits; 4244 int ret;
3650 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4245 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3651 u32 prev_clusters = *num_clusters; 4246 u32 prev_clusters = *num_clusters;
3652 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; 4247 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
3653 u64 block; 4248 u64 block;
3654 handle_t *handle = NULL; 4249 handle_t *handle = ctxt->handle;
3655 struct ocfs2_alloc_context *data_ac = NULL;
3656 struct ocfs2_alloc_context *meta_ac = NULL;
3657 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3658 struct ocfs2_extent_tree et; 4251 struct ocfs2_extent_tree et;
3659 4252
3660 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " 4253 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
3661 "previous xattr blkno = %llu\n", 4254 "previous xattr blkno = %llu\n",
3662 (unsigned long long)OCFS2_I(inode)->ip_blkno, 4255 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3663 prev_cpos, (unsigned long long)prev_blkno); 4256 prev_cpos, (unsigned long long)bucket_blkno(first));
3664 4257
3665 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 4258 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
3666 4259
3667 ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 4260 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
3668 &data_ac, &meta_ac); 4261 OCFS2_JOURNAL_ACCESS_WRITE);
3669 if (ret) {
3670 mlog_errno(ret);
3671 goto leave;
3672 }
3673
3674 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
3675 clusters_to_add);
3676 handle = ocfs2_start_trans(osb, credits);
3677 if (IS_ERR(handle)) {
3678 ret = PTR_ERR(handle);
3679 handle = NULL;
3680 mlog_errno(ret);
3681 goto leave;
3682 }
3683
3684 ret = ocfs2_journal_access(handle, inode, root_bh,
3685 OCFS2_JOURNAL_ACCESS_WRITE);
3686 if (ret < 0) { 4262 if (ret < 0) {
3687 mlog_errno(ret); 4263 mlog_errno(ret);
3688 goto leave; 4264 goto leave;
3689 } 4265 }
3690 4266
3691 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4267 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
3692 clusters_to_add, &bit_off, &num_bits); 4268 clusters_to_add, &bit_off, &num_bits);
3693 if (ret < 0) { 4269 if (ret < 0) {
3694 if (ret != -ENOSPC) 4270 if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3702 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", 4278 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
3703 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4279 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3704 4280
3705 if (prev_blkno + prev_clusters * bpc == block && 4281 if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
3706 (prev_clusters + num_bits) << osb->s_clustersize_bits <= 4282 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
3707 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { 4283 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
3708 /* 4284 /*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3721 } else { 4297 } else {
3722 ret = ocfs2_adjust_xattr_cross_cluster(inode, 4298 ret = ocfs2_adjust_xattr_cross_cluster(inode,
3723 handle, 4299 handle,
3724 first_bh, 4300 first,
3725 header_bh, 4301 target,
3726 block, 4302 block,
3727 prev_blkno,
3728 prev_clusters, 4303 prev_clusters,
3729 &v_start, 4304 &v_start,
3730 extend); 4305 extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3734 } 4309 }
3735 } 4310 }
3736 4311
3737 if (handle->h_buffer_credits < credits) {
3738 /*
3739 * The journal has been restarted before, and don't
3740 * have enough space for the insertion, so extend it
3741 * here.
3742 */
3743 ret = ocfs2_extend_trans(handle, credits);
3744 if (ret) {
3745 mlog_errno(ret);
3746 goto leave;
3747 }
3748 }
3749 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 4312 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
3750 num_bits, (unsigned long long)block, v_start); 4313 num_bits, (unsigned long long)block, v_start);
3751 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, 4314 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
3752 num_bits, 0, meta_ac); 4315 num_bits, 0, ctxt->meta_ac);
3753 if (ret < 0) { 4316 if (ret < 0) {
3754 mlog_errno(ret); 4317 mlog_errno(ret);
3755 goto leave; 4318 goto leave;
3756 } 4319 }
3757 4320
3758 ret = ocfs2_journal_dirty(handle, root_bh); 4321 ret = ocfs2_journal_dirty(handle, root_bh);
3759 if (ret < 0) { 4322 if (ret < 0)
3760 mlog_errno(ret); 4323 mlog_errno(ret);
3761 goto leave;
3762 }
3763 4324
3764leave: 4325leave:
3765 if (handle)
3766 ocfs2_commit_trans(osb, handle);
3767 if (data_ac)
3768 ocfs2_free_alloc_context(data_ac);
3769 if (meta_ac)
3770 ocfs2_free_alloc_context(meta_ac);
3771
3772 return ret; 4326 return ret;
3773} 4327}
3774 4328
3775/* 4329/*
3776 * Extend a new xattr bucket and move xattrs to the end one by one until 4330 * We are given an extent. 'first' is the bucket at the very front of
3777 * We meet with start_bh. Only move half of the xattrs to the bucket after it. 4331 * the extent. The extent has space for an additional bucket past
4332 * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number
4333 * of the target bucket. We wish to shift every bucket past the target
4334 * down one, filling in that additional space. When we get back to the
4335 * target, we split the target between itself and the now-empty bucket
4336 * at target+1 (aka, target_blkno + blks_per_bucket).
3778 */ 4337 */
3779static int ocfs2_extend_xattr_bucket(struct inode *inode, 4338static int ocfs2_extend_xattr_bucket(struct inode *inode,
3780 struct buffer_head *first_bh, 4339 handle_t *handle,
3781 struct buffer_head *start_bh, 4340 struct ocfs2_xattr_bucket *first,
4341 u64 target_blk,
3782 u32 num_clusters) 4342 u32 num_clusters)
3783{ 4343{
3784 int ret, credits; 4344 int ret, credits;
3785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3786 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4346 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3787 u64 start_blk = start_bh->b_blocknr, end_blk; 4347 u64 end_blk;
3788 u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); 4348 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
3789 handle_t *handle;
3790 struct ocfs2_xattr_header *first_xh =
3791 (struct ocfs2_xattr_header *)first_bh->b_data;
3792 u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
3793 4349
3794 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " 4350 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
3795 "from %llu, len = %u\n", (unsigned long long)start_blk, 4351 "from %llu, len = %u\n", (unsigned long long)target_blk,
3796 (unsigned long long)first_bh->b_blocknr, num_clusters); 4352 (unsigned long long)bucket_blkno(first), num_clusters);
3797 4353
3798 BUG_ON(bucket >= num_buckets); 4354 /* The extent must have room for an additional bucket */
4355 BUG_ON(new_bucket >=
4356 (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
3799 4357
3800 end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; 4358 /* end_blk points to the last existing bucket */
4359 end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
3801 4360
3802 /* 4361 /*
3803 * We will touch all the buckets after the start_bh(include it). 4362 * end_blk is the start of the last existing bucket.
3804 * Add one more bucket and modify the first_bh. 4363 * Thus, (end_blk - target_blk) covers the target bucket and
4364 * every bucket after it up to, but not including, the last
4365 * existing bucket. Then we add the last existing bucket, the
4366 * new bucket, and the first bucket (3 * blk_per_bucket).
3805 */ 4367 */
3806 credits = end_blk - start_blk + 2 * blk_per_bucket + 1; 4368 credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
3807 handle = ocfs2_start_trans(osb, credits); 4369 handle->h_buffer_credits;
3808 if (IS_ERR(handle)) { 4370 ret = ocfs2_extend_trans(handle, credits);
3809 ret = PTR_ERR(handle); 4371 if (ret) {
3810 handle = NULL;
3811 mlog_errno(ret); 4372 mlog_errno(ret);
3812 goto out; 4373 goto out;
3813 } 4374 }
3814 4375
3815 ret = ocfs2_journal_access(handle, inode, first_bh, 4376 ret = ocfs2_xattr_bucket_journal_access(handle, first,
3816 OCFS2_JOURNAL_ACCESS_WRITE); 4377 OCFS2_JOURNAL_ACCESS_WRITE);
3817 if (ret) { 4378 if (ret) {
3818 mlog_errno(ret); 4379 mlog_errno(ret);
3819 goto commit; 4380 goto out;
3820 } 4381 }
3821 4382
3822 while (end_blk != start_blk) { 4383 while (end_blk != target_blk) {
3823 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, 4384 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
3824 end_blk + blk_per_bucket, 0); 4385 end_blk + blk_per_bucket, 0);
3825 if (ret) 4386 if (ret)
3826 goto commit; 4387 goto out;
3827 end_blk -= blk_per_bucket; 4388 end_blk -= blk_per_bucket;
3828 } 4389 }
3829 4390
3830 /* Move half of the xattr in start_blk to the next bucket. */ 4391 /* Move half of the xattr in target_blkno to the next bucket. */
3831 ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, 4392 ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
3832 start_blk + blk_per_bucket, NULL, 0); 4393 target_blk + blk_per_bucket, NULL, 0);
3833 4394
3834 le16_add_cpu(&first_xh->xh_num_buckets, 1); 4395 le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
3835 ocfs2_journal_dirty(handle, first_bh); 4396 ocfs2_xattr_bucket_journal_dirty(handle, first);
3836 4397
3837commit:
3838 ocfs2_commit_trans(osb, handle);
3839out: 4398out:
3840 return ret; 4399 return ret;
3841} 4400}
3842 4401
3843/* 4402/*
3844 * Add new xattr bucket in an extent record and adjust the buckets accordingly. 4403 * Add new xattr bucket in an extent record and adjust the buckets
3845 * xb_bh is the ocfs2_xattr_block. 4404 * accordingly. xb_bh is the ocfs2_xattr_block, and target is the
3846 * We will move all the buckets starting from header_bh to the next place. As 4405 * bucket we want to insert into.
3847 * for this one, half num of its xattrs will be moved to the next one. 4406 *
4407 * In the easy case, we will move all the buckets after target down by
4408 * one. Half of target's xattrs will be moved to the next bucket.
3848 * 4409 *
3849 * We will allocate a new cluster if current cluster is full and adjust 4410 * If current cluster is full, we'll allocate a new one. This may not
3850 * header_bh and first_bh if the insert place is moved to the new cluster. 4411 * be contiguous. The underlying calls will make sure that there is
4412 * space for the insert, shifting buckets around if necessary.
4413 * 'target' may be moved by those calls.
3851 */ 4414 */
3852static int ocfs2_add_new_xattr_bucket(struct inode *inode, 4415static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3853 struct buffer_head *xb_bh, 4416 struct buffer_head *xb_bh,
3854 struct buffer_head *header_bh) 4417 struct ocfs2_xattr_bucket *target,
4418 struct ocfs2_xattr_set_ctxt *ctxt)
3855{ 4419{
3856 struct ocfs2_xattr_header *first_xh = NULL;
3857 struct buffer_head *first_bh = NULL;
3858 struct ocfs2_xattr_block *xb = 4420 struct ocfs2_xattr_block *xb =
3859 (struct ocfs2_xattr_block *)xb_bh->b_data; 4421 (struct ocfs2_xattr_block *)xb_bh->b_data;
3860 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; 4422 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
3861 struct ocfs2_extent_list *el = &xb_root->xt_list; 4423 struct ocfs2_extent_list *el = &xb_root->xt_list;
3862 struct ocfs2_xattr_header *xh = 4424 u32 name_hash =
3863 (struct ocfs2_xattr_header *)header_bh->b_data; 4425 le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
3864 u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4426 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3865 struct super_block *sb = inode->i_sb;
3866 struct ocfs2_super *osb = OCFS2_SB(sb);
3867 int ret, num_buckets, extend = 1; 4427 int ret, num_buckets, extend = 1;
3868 u64 p_blkno; 4428 u64 p_blkno;
3869 u32 e_cpos, num_clusters; 4429 u32 e_cpos, num_clusters;
4430 /* The bucket at the front of the extent */
4431 struct ocfs2_xattr_bucket *first;
3870 4432
3871 mlog(0, "Add new xattr bucket starting form %llu\n", 4433 mlog(0, "Add new xattr bucket starting from %llu\n",
3872 (unsigned long long)header_bh->b_blocknr); 4434 (unsigned long long)bucket_blkno(target));
3873 4435
3874 /* 4436 /* The first bucket of the original extent */
3875 * Add refrence for header_bh here because it may be 4437 first = ocfs2_xattr_bucket_new(inode);
3876 * changed in ocfs2_add_new_xattr_cluster and we need 4438 if (!first) {
3877 * to free it in the end. 4439 ret = -ENOMEM;
3878 */ 4440 mlog_errno(ret);
3879 get_bh(header_bh); 4441 goto out;
4442 }
3880 4443
3881 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, 4444 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
3882 &num_clusters, el); 4445 &num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3885 goto out; 4448 goto out;
3886 } 4449 }
3887 4450
3888 ret = ocfs2_read_block(inode, p_blkno, &first_bh); 4451 ret = ocfs2_read_xattr_bucket(first, p_blkno);
3889 if (ret) { 4452 if (ret) {
3890 mlog_errno(ret); 4453 mlog_errno(ret);
3891 goto out; 4454 goto out;
3892 } 4455 }
3893 4456
3894 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; 4457 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
3895 first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4458 if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
3896 4459 /*
3897 if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { 4460 * This can move first+target if the target bucket moves
4461 * to the new extent.
4462 */
3898 ret = ocfs2_add_new_xattr_cluster(inode, 4463 ret = ocfs2_add_new_xattr_cluster(inode,
3899 xb_bh, 4464 xb_bh,
3900 &first_bh, 4465 first,
3901 &header_bh, 4466 target,
3902 &num_clusters, 4467 &num_clusters,
3903 e_cpos, 4468 e_cpos,
3904 p_blkno, 4469 &extend,
3905 &extend); 4470 ctxt);
3906 if (ret) { 4471 if (ret) {
3907 mlog_errno(ret); 4472 mlog_errno(ret);
3908 goto out; 4473 goto out;
3909 } 4474 }
3910 } 4475 }
3911 4476
3912 if (extend) 4477 if (extend) {
3913 ret = ocfs2_extend_xattr_bucket(inode, 4478 ret = ocfs2_extend_xattr_bucket(inode,
3914 first_bh, 4479 ctxt->handle,
3915 header_bh, 4480 first,
4481 bucket_blkno(target),
3916 num_clusters); 4482 num_clusters);
3917 if (ret) 4483 if (ret)
3918 mlog_errno(ret); 4484 mlog_errno(ret);
4485 }
4486
3919out: 4487out:
3920 brelse(first_bh); 4488 ocfs2_xattr_bucket_free(first);
3921 brelse(header_bh); 4489
3922 return ret; 4490 return ret;
3923} 4491}
3924 4492
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
3929 int block_off = offs >> inode->i_sb->s_blocksize_bits; 4497 int block_off = offs >> inode->i_sb->s_blocksize_bits;
3930 4498
3931 offs = offs % inode->i_sb->s_blocksize; 4499 offs = offs % inode->i_sb->s_blocksize;
3932 return bucket->bhs[block_off]->b_data + offs; 4500 return bucket_block(bucket, block_off) + offs;
3933} 4501}
3934 4502
3935/* 4503/*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
3984 xe->xe_value_size = 0; 4552 xe->xe_value_size = 0;
3985 4553
3986 val = ocfs2_xattr_bucket_get_val(inode, 4554 val = ocfs2_xattr_bucket_get_val(inode,
3987 &xs->bucket, offs); 4555 xs->bucket, offs);
3988 memset(val + OCFS2_XATTR_SIZE(name_len), 0, 4556 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
3989 size - OCFS2_XATTR_SIZE(name_len)); 4557 size - OCFS2_XATTR_SIZE(name_len));
3990 if (OCFS2_XATTR_SIZE(xi->value_len) > 0) 4558 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
4062 xh->xh_free_start = cpu_to_le16(offs); 4630 xh->xh_free_start = cpu_to_le16(offs);
4063 } 4631 }
4064 4632
4065 val = ocfs2_xattr_bucket_get_val(inode, 4633 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4066 &xs->bucket, offs - size);
4067 xe->xe_name_offset = cpu_to_le16(offs - size); 4634 xe->xe_name_offset = cpu_to_le16(offs - size);
4068 4635
4069 memset(val, 0, size); 4636 memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
4079 return; 4646 return;
4080} 4647}
4081 4648
4082static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
4083 handle_t *handle,
4084 struct ocfs2_xattr_search *xs,
4085 struct buffer_head **bhs,
4086 u16 bh_num)
4087{
4088 int ret = 0, off, block_off;
4089 struct ocfs2_xattr_entry *xe = xs->here;
4090
4091 /*
4092 * First calculate all the blocks we should journal_access
4093 * and journal_dirty. The first block should always be touched.
4094 */
4095 ret = ocfs2_journal_dirty(handle, bhs[0]);
4096 if (ret)
4097 mlog_errno(ret);
4098
4099 /* calc the data. */
4100 off = le16_to_cpu(xe->xe_name_offset);
4101 block_off = off >> inode->i_sb->s_blocksize_bits;
4102 ret = ocfs2_journal_dirty(handle, bhs[block_off]);
4103 if (ret)
4104 mlog_errno(ret);
4105
4106 return ret;
4107}
4108
4109/* 4649/*
4110 * Set the xattr entry in the specified bucket. 4650 * Set the xattr entry in the specified bucket.
4111 * The bucket is indicated by xs->bucket and it should have the enough 4651 * The bucket is indicated by xs->bucket and it should have the enough
4112 * space for the xattr insertion. 4652 * space for the xattr insertion.
4113 */ 4653 */
4114static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, 4654static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4655 handle_t *handle,
4115 struct ocfs2_xattr_info *xi, 4656 struct ocfs2_xattr_info *xi,
4116 struct ocfs2_xattr_search *xs, 4657 struct ocfs2_xattr_search *xs,
4117 u32 name_hash, 4658 u32 name_hash,
4118 int local) 4659 int local)
4119{ 4660{
4120 int i, ret; 4661 int ret;
4121 handle_t *handle = NULL; 4662 u64 blkno;
4122 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4124 4663
4125 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", 4664 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4126 (unsigned long)xi->value_len, xi->name_index, 4665 (unsigned long)xi->value_len, xi->name_index,
4127 (unsigned long long)xs->bucket.bhs[0]->b_blocknr); 4666 (unsigned long long)bucket_blkno(xs->bucket));
4128 4667
4129 if (!xs->bucket.bhs[1]) { 4668 if (!xs->bucket->bu_bhs[1]) {
4130 ret = ocfs2_read_blocks(inode, 4669 blkno = bucket_blkno(xs->bucket);
4131 xs->bucket.bhs[0]->b_blocknr + 1, 4670 ocfs2_xattr_bucket_relse(xs->bucket);
4132 blk_per_bucket - 1, &xs->bucket.bhs[1], 4671 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4133 0);
4134 if (ret) { 4672 if (ret) {
4135 mlog_errno(ret); 4673 mlog_errno(ret);
4136 goto out; 4674 goto out;
4137 } 4675 }
4138 } 4676 }
4139 4677
4140 handle = ocfs2_start_trans(osb, blk_per_bucket); 4678 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4141 if (IS_ERR(handle)) { 4679 OCFS2_JOURNAL_ACCESS_WRITE);
4142 ret = PTR_ERR(handle); 4680 if (ret < 0) {
4143 handle = NULL;
4144 mlog_errno(ret); 4681 mlog_errno(ret);
4145 goto out; 4682 goto out;
4146 } 4683 }
4147 4684
4148 for (i = 0; i < blk_per_bucket; i++) {
4149 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
4150 OCFS2_JOURNAL_ACCESS_WRITE);
4151 if (ret < 0) {
4152 mlog_errno(ret);
4153 goto out;
4154 }
4155 }
4156
4157 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); 4685 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4686 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4158 4687
4159 /*Only dirty the blocks we have touched in set xattr. */
4160 ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
4161 xs->bucket.bhs, blk_per_bucket);
4162 if (ret)
4163 mlog_errno(ret);
4164out:
4165 ocfs2_commit_trans(osb, handle);
4166
4167 return ret;
4168}
4169
4170static int ocfs2_xattr_value_update_size(struct inode *inode,
4171 struct buffer_head *xe_bh,
4172 struct ocfs2_xattr_entry *xe,
4173 u64 new_size)
4174{
4175 int ret;
4176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4177 handle_t *handle = NULL;
4178
4179 handle = ocfs2_start_trans(osb, 1);
4180 if (IS_ERR(handle)) {
4181 ret = -ENOMEM;
4182 mlog_errno(ret);
4183 goto out;
4184 }
4185
4186 ret = ocfs2_journal_access(handle, inode, xe_bh,
4187 OCFS2_JOURNAL_ACCESS_WRITE);
4188 if (ret < 0) {
4189 mlog_errno(ret);
4190 goto out_commit;
4191 }
4192
4193 xe->xe_value_size = cpu_to_le64(new_size);
4194
4195 ret = ocfs2_journal_dirty(handle, xe_bh);
4196 if (ret < 0)
4197 mlog_errno(ret);
4198
4199out_commit:
4200 ocfs2_commit_trans(osb, handle);
4201out: 4688out:
4202 return ret; 4689 return ret;
4203} 4690}
@@ -4210,18 +4697,19 @@ out:
4210 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. 4697 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
4211 */ 4698 */
4212static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, 4699static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4213 struct buffer_head *header_bh, 4700 struct ocfs2_xattr_bucket *bucket,
4214 int xe_off, 4701 int xe_off,
4215 int len) 4702 int len,
4703 struct ocfs2_xattr_set_ctxt *ctxt)
4216{ 4704{
4217 int ret, offset; 4705 int ret, offset;
4218 u64 value_blk; 4706 u64 value_blk;
4219 struct buffer_head *value_bh = NULL;
4220 struct ocfs2_xattr_value_root *xv;
4221 struct ocfs2_xattr_entry *xe; 4707 struct ocfs2_xattr_entry *xe;
4222 struct ocfs2_xattr_header *xh = 4708 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4223 (struct ocfs2_xattr_header *)header_bh->b_data;
4224 size_t blocksize = inode->i_sb->s_blocksize; 4709 size_t blocksize = inode->i_sb->s_blocksize;
4710 struct ocfs2_xattr_value_buf vb = {
4711 .vb_access = ocfs2_journal_access,
4712 };
4225 4713
4226 xe = &xh->xh_entries[xe_off]; 4714 xe = &xh->xh_entries[xe_off];
4227 4715
@@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4234 4722
4235 /* We don't allow ocfs2_xattr_value to be stored in different block. */ 4723 /* We don't allow ocfs2_xattr_value to be stored in different block. */
4236 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); 4724 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
4237 value_blk += header_bh->b_blocknr;
4238 4725
4239 ret = ocfs2_read_block(inode, value_blk, &value_bh); 4726 vb.vb_bh = bucket->bu_bhs[value_blk];
4240 if (ret) { 4727 BUG_ON(!vb.vb_bh);
4241 mlog_errno(ret);
4242 goto out;
4243 }
4244 4728
4245 xv = (struct ocfs2_xattr_value_root *) 4729 vb.vb_xv = (struct ocfs2_xattr_value_root *)
4246 (value_bh->b_data + offset % blocksize); 4730 (vb.vb_bh->b_data + offset % blocksize);
4247 4731
4248 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", 4732 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4249 xe_off, (unsigned long long)header_bh->b_blocknr, len); 4733 OCFS2_JOURNAL_ACCESS_WRITE);
4250 ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
4251 if (ret) { 4734 if (ret) {
4252 mlog_errno(ret); 4735 mlog_errno(ret);
4253 goto out; 4736 goto out;
4254 } 4737 }
4255 4738
4256 ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); 4739 /*
4740 * From here on out we have to dirty the bucket. The generic
4741 * value calls only modify one of the bucket's bhs, but we need
4742 * to send the bucket at once. So if they error, they *could* have
4743 * modified something. We have to assume they did, and dirty
4744 * the whole bucket. This leaves us in a consistent state.
4745 */
4746 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
4747 xe_off, (unsigned long long)bucket_blkno(bucket), len);
4748 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
4257 if (ret) { 4749 if (ret) {
4258 mlog_errno(ret); 4750 mlog_errno(ret);
4259 goto out; 4751 goto out_dirty;
4260 } 4752 }
4261 4753
4754 xe->xe_value_size = cpu_to_le64(len);
4755
4756out_dirty:
4757 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
4758
4262out: 4759out:
4263 brelse(value_bh);
4264 return ret; 4760 return ret;
4265} 4761}
4266 4762
4267static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, 4763static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4268 struct ocfs2_xattr_search *xs, 4764 struct ocfs2_xattr_search *xs,
4269 int len) 4765 int len,
4766 struct ocfs2_xattr_set_ctxt *ctxt)
4270{ 4767{
4271 int ret, offset; 4768 int ret, offset;
4272 struct ocfs2_xattr_entry *xe = xs->here; 4769 struct ocfs2_xattr_entry *xe = xs->here;
4273 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; 4770 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
4274 4771
4275 BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); 4772 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
4276 4773
4277 offset = xe - xh->xh_entries; 4774 offset = xe - xh->xh_entries;
4278 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], 4775 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
4279 offset, len); 4776 offset, len, ctxt);
4280 if (ret) 4777 if (ret)
4281 mlog_errno(ret); 4778 mlog_errno(ret);
4282 4779
@@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4284} 4781}
4285 4782
4286static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, 4783static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4784 handle_t *handle,
4287 struct ocfs2_xattr_search *xs, 4785 struct ocfs2_xattr_search *xs,
4288 char *val, 4786 char *val,
4289 int value_len) 4787 int value_len)
@@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4299 4797
4300 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); 4798 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
4301 4799
4302 return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); 4800 return __ocfs2_xattr_set_value_outside(inode, handle,
4801 xv, val, value_len);
4303} 4802}
4304 4803
4305static int ocfs2_rm_xattr_cluster(struct inode *inode, 4804static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4343 } 4842 }
4344 } 4843 }
4345 4844
4346 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); 4845 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
4347 if (IS_ERR(handle)) { 4846 if (IS_ERR(handle)) {
4348 ret = -ENOMEM; 4847 ret = -ENOMEM;
4349 mlog_errno(ret); 4848 mlog_errno(ret);
4350 goto out; 4849 goto out;
4351 } 4850 }
4352 4851
4353 ret = ocfs2_journal_access(handle, inode, root_bh, 4852 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
4354 OCFS2_JOURNAL_ACCESS_WRITE); 4853 OCFS2_JOURNAL_ACCESS_WRITE);
4355 if (ret) { 4854 if (ret) {
4356 mlog_errno(ret); 4855 mlog_errno(ret);
4357 goto out_commit; 4856 goto out_commit;
@@ -4392,26 +4891,19 @@ out:
4392} 4891}
4393 4892
4394static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, 4893static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4894 handle_t *handle,
4395 struct ocfs2_xattr_search *xs) 4895 struct ocfs2_xattr_search *xs)
4396{ 4896{
4397 handle_t *handle = NULL; 4897 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4398 struct ocfs2_xattr_header *xh = xs->bucket.xh;
4399 struct ocfs2_xattr_entry *last = &xh->xh_entries[ 4898 struct ocfs2_xattr_entry *last = &xh->xh_entries[
4400 le16_to_cpu(xh->xh_count) - 1]; 4899 le16_to_cpu(xh->xh_count) - 1];
4401 int ret = 0; 4900 int ret = 0;
4402 4901
4403 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); 4902 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4404 if (IS_ERR(handle)) { 4903 OCFS2_JOURNAL_ACCESS_WRITE);
4405 ret = PTR_ERR(handle);
4406 mlog_errno(ret);
4407 return;
4408 }
4409
4410 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
4411 OCFS2_JOURNAL_ACCESS_WRITE);
4412 if (ret) { 4904 if (ret) {
4413 mlog_errno(ret); 4905 mlog_errno(ret);
4414 goto out_commit; 4906 return;
4415 } 4907 }
4416 4908
4417 /* Remove the old entry. */ 4909 /* Remove the old entry. */
@@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4420 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 4912 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
4421 le16_add_cpu(&xh->xh_count, -1); 4913 le16_add_cpu(&xh->xh_count, -1);
4422 4914
4423 ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); 4915 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4424 if (ret < 0)
4425 mlog_errno(ret);
4426out_commit:
4427 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
4428} 4916}
4429 4917
4430/* 4918/*
@@ -4440,7 +4928,8 @@ out_commit:
4440 */ 4928 */
4441static int ocfs2_xattr_set_in_bucket(struct inode *inode, 4929static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4442 struct ocfs2_xattr_info *xi, 4930 struct ocfs2_xattr_info *xi,
4443 struct ocfs2_xattr_search *xs) 4931 struct ocfs2_xattr_search *xs,
4932 struct ocfs2_xattr_set_ctxt *ctxt)
4444{ 4933{
4445 int ret, local = 1; 4934 int ret, local = 1;
4446 size_t value_len; 4935 size_t value_len;
@@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4468 value_len = 0; 4957 value_len = 0;
4469 4958
4470 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4959 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4471 value_len); 4960 value_len,
4961 ctxt);
4472 if (ret) 4962 if (ret)
4473 goto out; 4963 goto out;
4474 4964
@@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4488 xi->value_len = OCFS2_XATTR_ROOT_SIZE; 4978 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
4489 } 4979 }
4490 4980
4491 ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); 4981 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
4982 name_hash, local);
4492 if (ret) { 4983 if (ret) {
4493 mlog_errno(ret); 4984 mlog_errno(ret);
4494 goto out; 4985 goto out;
@@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4499 4990
4500 /* allocate the space now for the outside block storage. */ 4991 /* allocate the space now for the outside block storage. */
4501 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4992 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4502 value_len); 4993 value_len, ctxt);
4503 if (ret) { 4994 if (ret) {
4504 mlog_errno(ret); 4995 mlog_errno(ret);
4505 4996
@@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4509 * storage and we have allocated xattr already, 5000 * storage and we have allocated xattr already,
4510 * so need to remove it. 5001 * so need to remove it.
4511 */ 5002 */
4512 ocfs2_xattr_bucket_remove_xs(inode, xs); 5003 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
4513 } 5004 }
4514 goto out; 5005 goto out;
4515 } 5006 }
4516 5007
4517set_value_outside: 5008set_value_outside:
4518 ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); 5009 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5010 xs, val, value_len);
4519out: 5011out:
4520 return ret; 5012 return ret;
4521} 5013}
@@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4530 struct ocfs2_xattr_bucket *bucket, 5022 struct ocfs2_xattr_bucket *bucket,
4531 const char *name) 5023 const char *name)
4532{ 5024{
4533 struct ocfs2_xattr_header *xh = bucket->xh; 5025 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4534 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); 5026 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
4535 5027
4536 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) 5028 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4540 xh->xh_entries[0].xe_name_hash) { 5032 xh->xh_entries[0].xe_name_hash) {
4541 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " 5033 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
4542 "hash = %u\n", 5034 "hash = %u\n",
4543 (unsigned long long)bucket->bhs[0]->b_blocknr, 5035 (unsigned long long)bucket_blkno(bucket),
4544 le32_to_cpu(xh->xh_entries[0].xe_name_hash)); 5036 le32_to_cpu(xh->xh_entries[0].xe_name_hash));
4545 return -ENOSPC; 5037 return -ENOSPC;
4546 } 5038 }
@@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4550 5042
4551static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5043static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
4552 struct ocfs2_xattr_info *xi, 5044 struct ocfs2_xattr_info *xi,
4553 struct ocfs2_xattr_search *xs) 5045 struct ocfs2_xattr_search *xs,
5046 struct ocfs2_xattr_set_ctxt *ctxt)
4554{ 5047{
4555 struct ocfs2_xattr_header *xh; 5048 struct ocfs2_xattr_header *xh;
4556 struct ocfs2_xattr_entry *xe; 5049 struct ocfs2_xattr_entry *xe;
4557 u16 count, header_size, xh_free_start; 5050 u16 count, header_size, xh_free_start;
4558 int i, free, max_free, need, old; 5051 int free, max_free, need, old;
4559 size_t value_size = 0, name_len = strlen(xi->name); 5052 size_t value_size = 0, name_len = strlen(xi->name);
4560 size_t blocksize = inode->i_sb->s_blocksize; 5053 size_t blocksize = inode->i_sb->s_blocksize;
4561 int ret, allocation = 0; 5054 int ret, allocation = 0;
4562 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4563 5055
4564 mlog_entry("Set xattr %s in xattr index block\n", xi->name); 5056 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
4565 5057
@@ -4574,7 +5066,7 @@ try_again:
4574 5066
4575 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " 5067 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
4576 "of %u which exceed block size\n", 5068 "of %u which exceed block size\n",
4577 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5069 (unsigned long long)bucket_blkno(xs->bucket),
4578 header_size); 5070 header_size);
4579 5071
4580 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5072 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5106,13 @@ try_again:
4614 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " 5106 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
4615 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" 5107 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
4616 " %u\n", xs->not_found, 5108 " %u\n", xs->not_found,
4617 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5109 (unsigned long long)bucket_blkno(xs->bucket),
4618 free, need, max_free, le16_to_cpu(xh->xh_free_start), 5110 free, need, max_free, le16_to_cpu(xh->xh_free_start),
4619 le16_to_cpu(xh->xh_name_value_len)); 5111 le16_to_cpu(xh->xh_name_value_len));
4620 5112
4621 if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5113 if (free < need ||
5114 (xs->not_found &&
5115 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
4622 if (need <= max_free && 5116 if (need <= max_free &&
4623 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5117 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4624 /* 5118 /*
@@ -4626,7 +5120,8 @@ try_again:
4626 * name/value will be moved, the xe shouldn't be changed 5120 * name/value will be moved, the xe shouldn't be changed
4627 * in xs. 5121 * in xs.
4628 */ 5122 */
4629 ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); 5123 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5124 xs->bucket);
4630 if (ret) { 5125 if (ret) {
4631 mlog_errno(ret); 5126 mlog_errno(ret);
4632 goto out; 5127 goto out;
@@ -4658,7 +5153,7 @@ try_again:
4658 * add a new bucket for the insert. 5153 * add a new bucket for the insert.
4659 */ 5154 */
4660 ret = ocfs2_check_xattr_bucket_collision(inode, 5155 ret = ocfs2_check_xattr_bucket_collision(inode,
4661 &xs->bucket, 5156 xs->bucket,
4662 xi->name); 5157 xi->name);
4663 if (ret) { 5158 if (ret) {
4664 mlog_errno(ret); 5159 mlog_errno(ret);
@@ -4667,17 +5162,21 @@ try_again:
4667 5162
4668 ret = ocfs2_add_new_xattr_bucket(inode, 5163 ret = ocfs2_add_new_xattr_bucket(inode,
4669 xs->xattr_bh, 5164 xs->xattr_bh,
4670 xs->bucket.bhs[0]); 5165 xs->bucket,
5166 ctxt);
4671 if (ret) { 5167 if (ret) {
4672 mlog_errno(ret); 5168 mlog_errno(ret);
4673 goto out; 5169 goto out;
4674 } 5170 }
4675 5171
4676 for (i = 0; i < blk_per_bucket; i++) 5172 /*
4677 brelse(xs->bucket.bhs[i]); 5173 * ocfs2_add_new_xattr_bucket() will have updated
4678 5174 * xs->bucket if it moved, but it will not have updated
4679 memset(&xs->bucket, 0, sizeof(xs->bucket)); 5175 * any of the other search fields. Thus, we drop it and
4680 5176 * re-search. Everything should be cached, so it'll be
5177 * quick.
5178 */
5179 ocfs2_xattr_bucket_relse(xs->bucket);
4681 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, 5180 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
4682 xi->name_index, 5181 xi->name_index,
4683 xi->name, xs); 5182 xi->name, xs);
@@ -4689,7 +5188,7 @@ try_again:
4689 } 5188 }
4690 5189
4691xattr_set: 5190xattr_set:
4692 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); 5191 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
4693out: 5192out:
4694 mlog_exit(ret); 5193 mlog_exit(ret);
4695 return ret; 5194 return ret;
@@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
4700 void *para) 5199 void *para)
4701{ 5200{
4702 int ret = 0; 5201 int ret = 0;
4703 struct ocfs2_xattr_header *xh = bucket->xh; 5202 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4704 u16 i; 5203 u16 i;
4705 struct ocfs2_xattr_entry *xe; 5204 struct ocfs2_xattr_entry *xe;
5205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5206 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
5207 int credits = ocfs2_remove_extent_credits(osb->sb) +
5208 ocfs2_blocks_per_xattr_bucket(inode->i_sb);
5209
5210
5211 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
4706 5212
4707 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 5213 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4708 xe = &xh->xh_entries[i]; 5214 xe = &xh->xh_entries[i];
4709 if (ocfs2_xattr_is_local(xe)) 5215 if (ocfs2_xattr_is_local(xe))
4710 continue; 5216 continue;
4711 5217
4712 ret = ocfs2_xattr_bucket_value_truncate(inode, 5218 ctxt.handle = ocfs2_start_trans(osb, credits);
4713 bucket->bhs[0], 5219 if (IS_ERR(ctxt.handle)) {
4714 i, 0); 5220 ret = PTR_ERR(ctxt.handle);
5221 mlog_errno(ret);
5222 break;
5223 }
5224
5225 ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
5226 i, 0, &ctxt);
5227
5228 ocfs2_commit_trans(osb, ctxt.handle);
4715 if (ret) { 5229 if (ret) {
4716 mlog_errno(ret); 5230 mlog_errno(ret);
4717 break; 5231 break;
4718 } 5232 }
4719 } 5233 }
4720 5234
5235 ocfs2_schedule_truncate_log_flush(osb, 1);
5236 ocfs2_run_deallocs(osb, &ctxt.dealloc);
4721 return ret; 5237 return ret;
4722} 5238}
4723 5239
@@ -4768,6 +5284,74 @@ out:
4768} 5284}
4769 5285
4770/* 5286/*
5287 * 'security' attributes support
5288 */
5289static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
5290 size_t list_size, const char *name,
5291 size_t name_len)
5292{
5293 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
5294 const size_t total_len = prefix_len + name_len + 1;
5295
5296 if (list && total_len <= list_size) {
5297 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
5298 memcpy(list + prefix_len, name, name_len);
5299 list[prefix_len + name_len] = '\0';
5300 }
5301 return total_len;
5302}
5303
5304static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
5305 void *buffer, size_t size)
5306{
5307 if (strcmp(name, "") == 0)
5308 return -EINVAL;
5309 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
5310 buffer, size);
5311}
5312
5313static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
5314 const void *value, size_t size, int flags)
5315{
5316 if (strcmp(name, "") == 0)
5317 return -EINVAL;
5318
5319 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
5320 size, flags);
5321}
5322
5323int ocfs2_init_security_get(struct inode *inode,
5324 struct inode *dir,
5325 struct ocfs2_security_xattr_info *si)
5326{
5327 /* check whether ocfs2 support feature xattr */
5328 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
5329 return -EOPNOTSUPP;
5330 return security_inode_init_security(inode, dir, &si->name, &si->value,
5331 &si->value_len);
5332}
5333
5334int ocfs2_init_security_set(handle_t *handle,
5335 struct inode *inode,
5336 struct buffer_head *di_bh,
5337 struct ocfs2_security_xattr_info *si,
5338 struct ocfs2_alloc_context *xattr_ac,
5339 struct ocfs2_alloc_context *data_ac)
5340{
5341 return ocfs2_xattr_set_handle(handle, inode, di_bh,
5342 OCFS2_XATTR_INDEX_SECURITY,
5343 si->name, si->value, si->value_len, 0,
5344 xattr_ac, data_ac);
5345}
5346
5347struct xattr_handler ocfs2_xattr_security_handler = {
5348 .prefix = XATTR_SECURITY_PREFIX,
5349 .list = ocfs2_xattr_security_list,
5350 .get = ocfs2_xattr_security_get,
5351 .set = ocfs2_xattr_security_set,
5352};
5353
5354/*
4771 * 'trusted' attributes support 5355 * 'trusted' attributes support
4772 */ 5356 */
4773static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, 5357static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
30 OCFS2_XATTR_MAX 30 OCFS2_XATTR_MAX
31}; 31};
32 32
33struct ocfs2_security_xattr_info {
34 int enable;
35 char *name;
36 void *value;
37 size_t value_len;
38};
39
33extern struct xattr_handler ocfs2_xattr_user_handler; 40extern struct xattr_handler ocfs2_xattr_user_handler;
34extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler;
43#ifdef CONFIG_OCFS2_FS_POSIX_ACL
44extern struct xattr_handler ocfs2_xattr_acl_access_handler;
45extern struct xattr_handler ocfs2_xattr_acl_default_handler;
46#endif
35extern struct xattr_handler *ocfs2_xattr_handlers[]; 47extern struct xattr_handler *ocfs2_xattr_handlers[];
36 48
37ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 49ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
50int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
51 const char *, void *, size_t);
38int ocfs2_xattr_set(struct inode *, int, const char *, const void *, 52int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
39 size_t, int); 53 size_t, int);
54int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
55 int, const char *, const void *, size_t, int,
56 struct ocfs2_alloc_context *,
57 struct ocfs2_alloc_context *);
40int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *,
60 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *,
63 struct ocfs2_security_xattr_info *,
64 struct ocfs2_alloc_context *,
65 struct ocfs2_alloc_context *);
66int ocfs2_calc_security_init(struct inode *,
67 struct ocfs2_security_xattr_info *,
68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **);
72
73/*
74 * xattrs can live inside an inode, as part of an external xattr block,
75 * or inside an xattr bucket, which is the leaf of a tree rooted in an
76 * xattr block. Some of the xattr calls, especially the value setting
77 * functions, want to treat each of these locations as equal. Let's wrap
78 * them in a structure that we can pass around instead of raw buffer_heads.
79 */
80struct ocfs2_xattr_value_buf {
81 struct buffer_head *vb_bh;
82 ocfs2_journal_access_func vb_access;
83 struct ocfs2_xattr_value_root *vb_xv;
84};
85
41 86
42#endif /* OCFS2_XATTR_H */ 87#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f84..633e9dc972bb 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = current_fsuid(); 40 inode->i_uid = current_fsuid();
41 inode->i_gid = current_fsgid(); 41 inode->i_gid = current_fsgid();
42 inode->i_blocks = 0;
43 inode->i_mapping->a_ops = &omfs_aops; 42 inode->i_mapping->a_ops = &omfs_aops;
44 43
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index 1cd7d40e9991..d882fd2351d6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -412,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
412 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 412 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
413 goto out_fput; 413 goto out_fput;
414 414
415 if (inode->i_op && inode->i_op->fallocate) 415 if (inode->i_op->fallocate)
416 ret = inode->i_op->fallocate(inode, mode, offset, len); 416 ret = inode->i_op->fallocate(inode, mode, offset, len);
417 else 417 else
418 ret = -EOPNOTSUPP; 418 ret = -EOPNOTSUPP;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de4..ffcd04f0012c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
256 break; 256 break;
257 } 257 }
258 258
259 inode->i_gid = 0;
260 inode->i_uid = 0;
261
262 d_add(dentry, inode); 259 d_add(dentry, inode);
263 return NULL; 260 return NULL;
264} 261}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9b..5198ada67398 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -384,9 +384,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
384 384
385 dname = dev_name(ddev); 385 dname = dev_name(ddev);
386 if (isdigit(dname[strlen(dname) - 1])) 386 if (isdigit(dname[strlen(dname) - 1]))
387 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); 387 dev_set_name(pdev, "%sp%d", dname, partno);
388 else 388 else
389 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); 389 dev_set_name(pdev, "%s%d", dname, partno);
390 390
391 device_initialize(pdev); 391 device_initialize(pdev);
392 pdev->class = &block_class; 392 pdev->class = &block_class;
@@ -447,16 +447,11 @@ void register_disk(struct gendisk *disk)
447 struct block_device *bdev; 447 struct block_device *bdev;
448 struct disk_part_iter piter; 448 struct disk_part_iter piter;
449 struct hd_struct *part; 449 struct hd_struct *part;
450 char *s;
451 int err; 450 int err;
452 451
453 ddev->parent = disk->driverfs_dev; 452 ddev->parent = disk->driverfs_dev;
454 453
455 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); 454 dev_set_name(ddev, disk->disk_name);
456 /* ewww... some of these buggers have / in the name... */
457 s = strchr(ddev->bus_id, '/');
458 if (s)
459 *s = '!';
460 455
461 /* delay uevents, until we scanned partition table */ 456 /* delay uevents, until we scanned partition table */
462 ddev->uevent_suppress = 1; 457 ddev->uevent_suppress = 1;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b3..0c9de19a1633 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
65#include <linux/mm.h> 65#include <linux/mm.h>
66#include <linux/rcupdate.h> 66#include <linux/rcupdate.h>
67#include <linux/kallsyms.h> 67#include <linux/kallsyms.h>
68#include <linux/stacktrace.h>
68#include <linux/resource.h> 69#include <linux/resource.h>
69#include <linux/module.h> 70#include <linux/module.h>
70#include <linux/mount.h> 71#include <linux/mount.h>
@@ -109,25 +110,22 @@ struct pid_entry {
109 .op = OP, \ 110 .op = OP, \
110} 111}
111 112
112#define DIR(NAME, MODE, OTYPE) \ 113#define DIR(NAME, MODE, iops, fops) \
113 NOD(NAME, (S_IFDIR|(MODE)), \ 114 NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
114 &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ 115#define LNK(NAME, get_link) \
115 {} )
116#define LNK(NAME, OTYPE) \
117 NOD(NAME, (S_IFLNK|S_IRWXUGO), \ 116 NOD(NAME, (S_IFLNK|S_IRWXUGO), \
118 &proc_pid_link_inode_operations, NULL, \ 117 &proc_pid_link_inode_operations, NULL, \
119 { .proc_get_link = &proc_##OTYPE##_link } ) 118 { .proc_get_link = get_link } )
120#define REG(NAME, MODE, OTYPE) \ 119#define REG(NAME, MODE, fops) \
121 NOD(NAME, (S_IFREG|(MODE)), NULL, \ 120 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
122 &proc_##OTYPE##_operations, {}) 121#define INF(NAME, MODE, read) \
123#define INF(NAME, MODE, OTYPE) \
124 NOD(NAME, (S_IFREG|(MODE)), \ 122 NOD(NAME, (S_IFREG|(MODE)), \
125 NULL, &proc_info_file_operations, \ 123 NULL, &proc_info_file_operations, \
126 { .proc_read = &proc_##OTYPE } ) 124 { .proc_read = read } )
127#define ONE(NAME, MODE, OTYPE) \ 125#define ONE(NAME, MODE, show) \
128 NOD(NAME, (S_IFREG|(MODE)), \ 126 NOD(NAME, (S_IFREG|(MODE)), \
129 NULL, &proc_single_file_operations, \ 127 NULL, &proc_single_file_operations, \
130 { .proc_show = &proc_##OTYPE } ) 128 { .proc_show = show } )
131 129
132/* 130/*
133 * Count the number of hardlinks for the pid_entry table, excluding the . 131 * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
308 struct mm_struct *mm = get_task_mm(task); 306 struct mm_struct *mm = get_task_mm(task);
309 if (mm) { 307 if (mm) {
310 unsigned int nwords = 0; 308 unsigned int nwords = 0;
311 do 309 do {
312 nwords += 2; 310 nwords += 2;
313 while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ 311 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
314 res = nwords * sizeof(mm->saved_auxv[0]); 312 res = nwords * sizeof(mm->saved_auxv[0]);
315 if (res > PAGE_SIZE) 313 if (res > PAGE_SIZE)
316 res = PAGE_SIZE; 314 res = PAGE_SIZE;
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
340} 338}
341#endif /* CONFIG_KALLSYMS */ 339#endif /* CONFIG_KALLSYMS */
342 340
341#ifdef CONFIG_STACKTRACE
342
343#define MAX_STACK_TRACE_DEPTH 64
344
345static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
346 struct pid *pid, struct task_struct *task)
347{
348 struct stack_trace trace;
349 unsigned long *entries;
350 int i;
351
352 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
353 if (!entries)
354 return -ENOMEM;
355
356 trace.nr_entries = 0;
357 trace.max_entries = MAX_STACK_TRACE_DEPTH;
358 trace.entries = entries;
359 trace.skip = 0;
360 save_stack_trace_tsk(task, &trace);
361
362 for (i = 0; i < trace.nr_entries; i++) {
363 seq_printf(m, "[<%p>] %pS\n",
364 (void *)entries[i], (void *)entries[i]);
365 }
366 kfree(entries);
367
368 return 0;
369}
370#endif
371
343#ifdef CONFIG_SCHEDSTATS 372#ifdef CONFIG_SCHEDSTATS
344/* 373/*
345 * Provides /proc/PID/schedstat 374 * Provides /proc/PID/schedstat
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v)
1186 struct inode *inode = m->private; 1215 struct inode *inode = m->private;
1187 struct task_struct *p; 1216 struct task_struct *p;
1188 1217
1189 WARN_ON(!inode);
1190
1191 p = get_proc_task(inode); 1218 p = get_proc_task(inode);
1192 if (!p) 1219 if (!p)
1193 return -ESRCH; 1220 return -ESRCH;
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf,
1205 struct inode *inode = file->f_path.dentry->d_inode; 1232 struct inode *inode = file->f_path.dentry->d_inode;
1206 struct task_struct *p; 1233 struct task_struct *p;
1207 1234
1208 WARN_ON(!inode);
1209
1210 p = get_proc_task(inode); 1235 p = get_proc_task(inode);
1211 if (!p) 1236 if (!p)
1212 return -ESRCH; 1237 return -ESRCH;
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1426 if (!ei->pid) 1451 if (!ei->pid)
1427 goto out_unlock; 1452 goto out_unlock;
1428 1453
1429 inode->i_uid = 0;
1430 inode->i_gid = 0;
1431 if (task_dumpable(task)) { 1454 if (task_dumpable(task)) {
1432 rcu_read_lock(); 1455 rcu_read_lock();
1433 cred = __task_cred(task); 1456 cred = __task_cred(task);
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1976 const struct pid_entry *ents, 1999 const struct pid_entry *ents,
1977 unsigned int nents) 2000 unsigned int nents)
1978{ 2001{
1979 struct inode *inode;
1980 struct dentry *error; 2002 struct dentry *error;
1981 struct task_struct *task = get_proc_task(dir); 2003 struct task_struct *task = get_proc_task(dir);
1982 const struct pid_entry *p, *last; 2004 const struct pid_entry *p, *last;
1983 2005
1984 error = ERR_PTR(-ENOENT); 2006 error = ERR_PTR(-ENOENT);
1985 inode = NULL;
1986 2007
1987 if (!task) 2008 if (!task)
1988 goto out_no_task; 2009 goto out_no_task;
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = {
2138}; 2159};
2139 2160
2140static const struct pid_entry attr_dir_stuff[] = { 2161static const struct pid_entry attr_dir_stuff[] = {
2141 REG("current", S_IRUGO|S_IWUGO, pid_attr), 2162 REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2142 REG("prev", S_IRUGO, pid_attr), 2163 REG("prev", S_IRUGO, proc_pid_attr_operations),
2143 REG("exec", S_IRUGO|S_IWUGO, pid_attr), 2164 REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2144 REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), 2165 REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2145 REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), 2166 REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2146 REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), 2167 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2147}; 2168};
2148 2169
2149static int proc_attr_dir_readdir(struct file * filp, 2170static int proc_attr_dir_readdir(struct file * filp,
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2349 if (!ei->pid) 2370 if (!ei->pid)
2350 goto out_iput; 2371 goto out_iput;
2351 2372
2352 inode->i_uid = 0;
2353 inode->i_gid = 0;
2354 inode->i_mode = p->mode; 2373 inode->i_mode = p->mode;
2355 if (S_ISDIR(inode->i_mode)) 2374 if (S_ISDIR(inode->i_mode))
2356 inode->i_nlink = 2; 2375 inode->i_nlink = 2;
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations;
2465static const struct inode_operations proc_task_inode_operations; 2484static const struct inode_operations proc_task_inode_operations;
2466 2485
2467static const struct pid_entry tgid_base_stuff[] = { 2486static const struct pid_entry tgid_base_stuff[] = {
2468 DIR("task", S_IRUGO|S_IXUGO, task), 2487 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2469 DIR("fd", S_IRUSR|S_IXUSR, fd), 2488 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2470 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2489 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2471#ifdef CONFIG_NET 2490#ifdef CONFIG_NET
2472 DIR("net", S_IRUGO|S_IXUGO, net), 2491 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2473#endif 2492#endif
2474 REG("environ", S_IRUSR, environ), 2493 REG("environ", S_IRUSR, proc_environ_operations),
2475 INF("auxv", S_IRUSR, pid_auxv), 2494 INF("auxv", S_IRUSR, proc_pid_auxv),
2476 ONE("status", S_IRUGO, pid_status), 2495 ONE("status", S_IRUGO, proc_pid_status),
2477 ONE("personality", S_IRUSR, pid_personality), 2496 ONE("personality", S_IRUSR, proc_pid_personality),
2478 INF("limits", S_IRUSR, pid_limits), 2497 INF("limits", S_IRUSR, proc_pid_limits),
2479#ifdef CONFIG_SCHED_DEBUG 2498#ifdef CONFIG_SCHED_DEBUG
2480 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2499 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2481#endif 2500#endif
2482#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2501#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2483 INF("syscall", S_IRUSR, pid_syscall), 2502 INF("syscall", S_IRUSR, proc_pid_syscall),
2484#endif 2503#endif
2485 INF("cmdline", S_IRUGO, pid_cmdline), 2504 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2486 ONE("stat", S_IRUGO, tgid_stat), 2505 ONE("stat", S_IRUGO, proc_tgid_stat),
2487 ONE("statm", S_IRUGO, pid_statm), 2506 ONE("statm", S_IRUGO, proc_pid_statm),
2488 REG("maps", S_IRUGO, maps), 2507 REG("maps", S_IRUGO, proc_maps_operations),
2489#ifdef CONFIG_NUMA 2508#ifdef CONFIG_NUMA
2490 REG("numa_maps", S_IRUGO, numa_maps), 2509 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2491#endif 2510#endif
2492 REG("mem", S_IRUSR|S_IWUSR, mem), 2511 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2493 LNK("cwd", cwd), 2512 LNK("cwd", proc_cwd_link),
2494 LNK("root", root), 2513 LNK("root", proc_root_link),
2495 LNK("exe", exe), 2514 LNK("exe", proc_exe_link),
2496 REG("mounts", S_IRUGO, mounts), 2515 REG("mounts", S_IRUGO, proc_mounts_operations),
2497 REG("mountinfo", S_IRUGO, mountinfo), 2516 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2498 REG("mountstats", S_IRUSR, mountstats), 2517 REG("mountstats", S_IRUSR, proc_mountstats_operations),
2499#ifdef CONFIG_PROC_PAGE_MONITOR 2518#ifdef CONFIG_PROC_PAGE_MONITOR
2500 REG("clear_refs", S_IWUSR, clear_refs), 2519 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2501 REG("smaps", S_IRUGO, smaps), 2520 REG("smaps", S_IRUGO, proc_smaps_operations),
2502 REG("pagemap", S_IRUSR, pagemap), 2521 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2503#endif 2522#endif
2504#ifdef CONFIG_SECURITY 2523#ifdef CONFIG_SECURITY
2505 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2524 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2506#endif 2525#endif
2507#ifdef CONFIG_KALLSYMS 2526#ifdef CONFIG_KALLSYMS
2508 INF("wchan", S_IRUGO, pid_wchan), 2527 INF("wchan", S_IRUGO, proc_pid_wchan),
2528#endif
2529#ifdef CONFIG_STACKTRACE
2530 ONE("stack", S_IRUSR, proc_pid_stack),
2509#endif 2531#endif
2510#ifdef CONFIG_SCHEDSTATS 2532#ifdef CONFIG_SCHEDSTATS
2511 INF("schedstat", S_IRUGO, pid_schedstat), 2533 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2512#endif 2534#endif
2513#ifdef CONFIG_LATENCYTOP 2535#ifdef CONFIG_LATENCYTOP
2514 REG("latency", S_IRUGO, lstats), 2536 REG("latency", S_IRUGO, proc_lstats_operations),
2515#endif 2537#endif
2516#ifdef CONFIG_PROC_PID_CPUSET 2538#ifdef CONFIG_PROC_PID_CPUSET
2517 REG("cpuset", S_IRUGO, cpuset), 2539 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2518#endif 2540#endif
2519#ifdef CONFIG_CGROUPS 2541#ifdef CONFIG_CGROUPS
2520 REG("cgroup", S_IRUGO, cgroup), 2542 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2521#endif 2543#endif
2522 INF("oom_score", S_IRUGO, oom_score), 2544 INF("oom_score", S_IRUGO, proc_oom_score),
2523 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2545 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2524#ifdef CONFIG_AUDITSYSCALL 2546#ifdef CONFIG_AUDITSYSCALL
2525 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2547 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2526 REG("sessionid", S_IRUGO, sessionid), 2548 REG("sessionid", S_IRUGO, proc_sessionid_operations),
2527#endif 2549#endif
2528#ifdef CONFIG_FAULT_INJECTION 2550#ifdef CONFIG_FAULT_INJECTION
2529 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2551 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2530#endif 2552#endif
2531#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2553#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2532 REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), 2554 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2533#endif 2555#endif
2534#ifdef CONFIG_TASK_IO_ACCOUNTING 2556#ifdef CONFIG_TASK_IO_ACCOUNTING
2535 INF("io", S_IRUGO, tgid_io_accounting), 2557 INF("io", S_IRUGO, proc_tgid_io_accounting),
2536#endif 2558#endif
2537}; 2559};
2538 2560
@@ -2805,66 +2827,69 @@ out_no_task:
2805 * Tasks 2827 * Tasks
2806 */ 2828 */
2807static const struct pid_entry tid_base_stuff[] = { 2829static const struct pid_entry tid_base_stuff[] = {
2808 DIR("fd", S_IRUSR|S_IXUSR, fd), 2830 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2809 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2831 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
2810 REG("environ", S_IRUSR, environ), 2832 REG("environ", S_IRUSR, proc_environ_operations),
2811 INF("auxv", S_IRUSR, pid_auxv), 2833 INF("auxv", S_IRUSR, proc_pid_auxv),
2812 ONE("status", S_IRUGO, pid_status), 2834 ONE("status", S_IRUGO, proc_pid_status),
2813 ONE("personality", S_IRUSR, pid_personality), 2835 ONE("personality", S_IRUSR, proc_pid_personality),
2814 INF("limits", S_IRUSR, pid_limits), 2836 INF("limits", S_IRUSR, proc_pid_limits),
2815#ifdef CONFIG_SCHED_DEBUG 2837#ifdef CONFIG_SCHED_DEBUG
2816 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2838 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2817#endif 2839#endif
2818#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2840#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2819 INF("syscall", S_IRUSR, pid_syscall), 2841 INF("syscall", S_IRUSR, proc_pid_syscall),
2820#endif 2842#endif
2821 INF("cmdline", S_IRUGO, pid_cmdline), 2843 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2822 ONE("stat", S_IRUGO, tid_stat), 2844 ONE("stat", S_IRUGO, proc_tid_stat),
2823 ONE("statm", S_IRUGO, pid_statm), 2845 ONE("statm", S_IRUGO, proc_pid_statm),
2824 REG("maps", S_IRUGO, maps), 2846 REG("maps", S_IRUGO, proc_maps_operations),
2825#ifdef CONFIG_NUMA 2847#ifdef CONFIG_NUMA
2826 REG("numa_maps", S_IRUGO, numa_maps), 2848 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2827#endif 2849#endif
2828 REG("mem", S_IRUSR|S_IWUSR, mem), 2850 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2829 LNK("cwd", cwd), 2851 LNK("cwd", proc_cwd_link),
2830 LNK("root", root), 2852 LNK("root", proc_root_link),
2831 LNK("exe", exe), 2853 LNK("exe", proc_exe_link),
2832 REG("mounts", S_IRUGO, mounts), 2854 REG("mounts", S_IRUGO, proc_mounts_operations),
2833 REG("mountinfo", S_IRUGO, mountinfo), 2855 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2834#ifdef CONFIG_PROC_PAGE_MONITOR 2856#ifdef CONFIG_PROC_PAGE_MONITOR
2835 REG("clear_refs", S_IWUSR, clear_refs), 2857 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2836 REG("smaps", S_IRUGO, smaps), 2858 REG("smaps", S_IRUGO, proc_smaps_operations),
2837 REG("pagemap", S_IRUSR, pagemap), 2859 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2838#endif 2860#endif
2839#ifdef CONFIG_SECURITY 2861#ifdef CONFIG_SECURITY
2840 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2862 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2841#endif 2863#endif
2842#ifdef CONFIG_KALLSYMS 2864#ifdef CONFIG_KALLSYMS
2843 INF("wchan", S_IRUGO, pid_wchan), 2865 INF("wchan", S_IRUGO, proc_pid_wchan),
2866#endif
2867#ifdef CONFIG_STACKTRACE
2868 ONE("stack", S_IRUSR, proc_pid_stack),
2844#endif 2869#endif
2845#ifdef CONFIG_SCHEDSTATS 2870#ifdef CONFIG_SCHEDSTATS
2846 INF("schedstat", S_IRUGO, pid_schedstat), 2871 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2847#endif 2872#endif
2848#ifdef CONFIG_LATENCYTOP 2873#ifdef CONFIG_LATENCYTOP
2849 REG("latency", S_IRUGO, lstats), 2874 REG("latency", S_IRUGO, proc_lstats_operations),
2850#endif 2875#endif
2851#ifdef CONFIG_PROC_PID_CPUSET 2876#ifdef CONFIG_PROC_PID_CPUSET
2852 REG("cpuset", S_IRUGO, cpuset), 2877 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2853#endif 2878#endif
2854#ifdef CONFIG_CGROUPS 2879#ifdef CONFIG_CGROUPS
2855 REG("cgroup", S_IRUGO, cgroup), 2880 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2856#endif 2881#endif
2857 INF("oom_score", S_IRUGO, oom_score), 2882 INF("oom_score", S_IRUGO, proc_oom_score),
2858 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2883 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2859#ifdef CONFIG_AUDITSYSCALL 2884#ifdef CONFIG_AUDITSYSCALL
2860 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2885 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2861 REG("sessionid", S_IRUSR, sessionid), 2886 REG("sessionid", S_IRUSR, proc_sessionid_operations),
2862#endif 2887#endif
2863#ifdef CONFIG_FAULT_INJECTION 2888#ifdef CONFIG_FAULT_INJECTION
2864 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2889 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2865#endif 2890#endif
2866#ifdef CONFIG_TASK_IO_ACCOUNTING 2891#ifdef CONFIG_TASK_IO_ACCOUNTING
2867 INF("io", S_IRUGO, tid_io_accounting), 2892 INF("io", S_IRUGO, proc_tid_io_accounting),
2868#endif 2893#endif
2869}; 2894};
2870 2895
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b35582..db7fa5cab988 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/smp_lock.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/idr.h> 18#include <linux/idr.h>
20#include <linux/namei.h> 19#include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
379 struct inode *inode = NULL; 378 struct inode *inode = NULL;
380 int error = -ENOENT; 379 int error = -ENOENT;
381 380
382 lock_kernel();
383 spin_lock(&proc_subdir_lock); 381 spin_lock(&proc_subdir_lock);
384 for (de = de->subdir; de ; de = de->next) { 382 for (de = de->subdir; de ; de = de->next) {
385 if (de->namelen != dentry->d_name.len) 383 if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
397 } 395 }
398 spin_unlock(&proc_subdir_lock); 396 spin_unlock(&proc_subdir_lock);
399out_unlock: 397out_unlock:
400 unlock_kernel();
401 398
402 if (inode) { 399 if (inode) {
403 dentry->d_op = &proc_dentry_operations; 400 dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
432 struct inode *inode = filp->f_path.dentry->d_inode; 429 struct inode *inode = filp->f_path.dentry->d_inode;
433 int ret = 0; 430 int ret = 0;
434 431
435 lock_kernel();
436
437 ino = inode->i_ino; 432 ino = inode->i_ino;
438 i = filp->f_pos; 433 i = filp->f_pos;
439 switch (i) { 434 switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
487 spin_unlock(&proc_subdir_lock); 482 spin_unlock(&proc_subdir_lock);
488 } 483 }
489 ret = 1; 484 ret = 1;
490out: unlock_kernel(); 485out:
491 return ret; 486 return ret;
492} 487}
493 488
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
504 * the /proc directory. 499 * the /proc directory.
505 */ 500 */
506static const struct file_operations proc_dir_operations = { 501static const struct file_operations proc_dir_operations = {
502 .llseek = generic_file_llseek,
507 .read = generic_read_dir, 503 .read = generic_read_dir,
508 .readdir = proc_readdir, 504 .readdir = proc_readdir,
509}; 505};
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c658..3e76bb9b3ad6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
35 */ 35 */
36void de_put(struct proc_dir_entry *de) 36void de_put(struct proc_dir_entry *de)
37{ 37{
38 lock_kernel();
39 if (!atomic_read(&de->count)) { 38 if (!atomic_read(&de->count)) {
40 printk("de_put: entry %s already free!\n", de->name); 39 printk("de_put: entry %s already free!\n", de->name);
41 unlock_kernel();
42 return; 40 return;
43 } 41 }
44 42
45 if (atomic_dec_and_test(&de->count)) 43 if (atomic_dec_and_test(&de->count))
46 free_proc_entry(de); 44 free_proc_entry(de);
47 unlock_kernel();
48} 45}
49 46
50/* 47/*
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424ae..04d1270f1c38 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/smp_lock.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23#include <linux/nsproxy.h> 22#include <linux/nsproxy.h>
24#include <net/net_namespace.h> 23#include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
172} 171}
173 172
174const struct file_operations proc_net_operations = { 173const struct file_operations proc_net_operations = {
174 .llseek = generic_file_llseek,
175 .read = generic_read_dir, 175 .read = generic_read_dir,
176 .readdir = proc_tgid_net_readdir, 176 .readdir = proc_tgid_net_readdir,
177}; 177};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9e..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
31 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 31 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
32 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ 32 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
33 inode->i_mode = table->mode; 33 inode->i_mode = table->mode;
34 inode->i_uid = inode->i_gid = 0;
35 if (!table->child) { 34 if (!table->child) {
36 inode->i_mode |= S_IFREG; 35 inode->i_mode |= S_IFREG;
37 inode->i_op = &proc_sys_inode_operations; 36 inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9de..f6299a25594e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/smp_lock.h>
20#include <linux/mount.h> 19#include <linux/mount.h>
21#include <linux/pid_namespace.h> 20#include <linux/pid_namespace.h>
22 21
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
162 unsigned int nr = filp->f_pos; 161 unsigned int nr = filp->f_pos;
163 int ret; 162 int ret;
164 163
165 lock_kernel();
166
167 if (nr < FIRST_PROCESS_ENTRY) { 164 if (nr < FIRST_PROCESS_ENTRY) {
168 int error = proc_readdir(filp, dirent, filldir); 165 int error = proc_readdir(filp, dirent, filldir);
169 if (error <= 0) { 166 if (error <= 0)
170 unlock_kernel();
171 return error; 167 return error;
172 }
173 filp->f_pos = FIRST_PROCESS_ENTRY; 168 filp->f_pos = FIRST_PROCESS_ENTRY;
174 } 169 }
175 unlock_kernel();
176 170
177 ret = proc_pid_readdir(filp, dirent, filldir); 171 ret = proc_pid_readdir(filp, dirent, filldir);
178 return ret; 172 return ret;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f5756..94063840832a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v)
396 "Private_Clean: %8lu kB\n" 396 "Private_Clean: %8lu kB\n"
397 "Private_Dirty: %8lu kB\n" 397 "Private_Dirty: %8lu kB\n"
398 "Referenced: %8lu kB\n" 398 "Referenced: %8lu kB\n"
399 "Swap: %8lu kB\n", 399 "Swap: %8lu kB\n"
400 "KernelPageSize: %8lu kB\n"
401 "MMUPageSize: %8lu kB\n",
400 (vma->vm_end - vma->vm_start) >> 10, 402 (vma->vm_end - vma->vm_start) >> 10,
401 mss.resident >> 10, 403 mss.resident >> 10,
402 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 404 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
405 mss.private_clean >> 10, 407 mss.private_clean >> 10,
406 mss.private_dirty >> 10, 408 mss.private_dirty >> 10,
407 mss.referenced >> 10, 409 mss.referenced >> 10,
408 mss.swap >> 10); 410 mss.swap >> 10,
411 vma_kernel_pagesize(vma) >> 10,
412 vma_mmu_pagesize(vma) >> 10);
409 413
410 if (m->count < m->size) /* vma is copied successfully */ 414 if (m->count < m->size) /* vma is copied successfully */
411 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 415 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea894..d4a8be32b902 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,7 +9,7 @@
9 9
10/* 10/*
11 * Logic: we've got two memory sums for each process, "shared", and 11 * Logic: we've got two memory sums for each process, "shared", and
12 * "non-shared". Shared memory may get counted more then once, for 12 * "non-shared". Shared memory may get counted more than once, for
13 * each process that owns it. Non-shared memory is counted 13 * each process that owns it. Non-shared memory is counted
14 * accurately. 14 * accurately.
15 */ 15 */
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
47 47
48 offset = (unsigned long)(*ppos % PAGE_SIZE); 48 offset = (unsigned long)(*ppos % PAGE_SIZE);
49 pfn = (unsigned long)(*ppos / PAGE_SIZE); 49 pfn = (unsigned long)(*ppos / PAGE_SIZE);
50 if (pfn > saved_max_pfn)
51 return -EINVAL;
52 50
53 do { 51 do {
54 if (count > (PAGE_SIZE - offset)) 52 if (count > (PAGE_SIZE - offset))
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e01618..4a8c94f05f76 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
73 case Q_SETQUOTA: 73 case Q_SETQUOTA:
74 case Q_GETQUOTA: 74 case Q_GETQUOTA:
75 /* This is just informative test so we are satisfied without a lock */ 75 /* This is just informative test so we are satisfied without a lock */
76 if (!sb_has_quota_enabled(sb, type)) 76 if (!sb_has_quota_active(sb, type))
77 return -ESRCH; 77 return -ESRCH;
78 } 78 }
79 79
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
160 int cnt; 160 int cnt;
161 161
162 sb->s_qcop->quota_sync(sb, type); 162 sb->s_qcop->quota_sync(sb, type);
163
164 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
165 return;
163 /* This is not very clever (and fast) but currently I don't know about 166 /* This is not very clever (and fast) but currently I don't know about
164 * any other simple way of getting quota data to disk and we must get 167 * any other simple way of getting quota data to disk and we must get
165 * them there for userspace to be visible... */ 168 * them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
175 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 178 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
176 if (type != -1 && cnt != type) 179 if (type != -1 && cnt != type)
177 continue; 180 continue;
178 if (!sb_has_quota_enabled(sb, cnt)) 181 if (!sb_has_quota_active(sb, cnt))
179 continue; 182 continue;
180 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); 183 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
181 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); 184 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
201 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 204 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
202 if (type != -1 && type != cnt) 205 if (type != -1 && type != cnt)
203 continue; 206 continue;
204 if (!sb_has_quota_enabled(sb, cnt)) 207 if (!sb_has_quota_active(sb, cnt))
205 continue; 208 continue;
206 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && 209 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
207 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) 210 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
245 __u32 fmt; 248 __u32 fmt;
246 249
247 down_read(&sb_dqopt(sb)->dqptr_sem); 250 down_read(&sb_dqopt(sb)->dqptr_sem);
248 if (!sb_has_quota_enabled(sb, type)) { 251 if (!sb_has_quota_active(sb, type)) {
249 up_read(&sb_dqopt(sb)->dqptr_sem); 252 up_read(&sb_dqopt(sb)->dqptr_sem);
250 return -ESRCH; 253 return -ESRCH;
251 } 254 }
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 000000000000..953404c95b17
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
1/*
2 * vfsv0 quota IO operations on file
3 */
4
5#include <linux/errno.h>
6#include <linux/fs.h>
7#include <linux/mount.h>
8#include <linux/dqblk_v2.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/quotaops.h>
14
15#include <asm/byteorder.h>
16
17#include "quota_tree.h"
18
19MODULE_AUTHOR("Jan Kara");
20MODULE_DESCRIPTION("Quota trie support");
21MODULE_LICENSE("GPL");
22
23#define __QUOTA_QT_PARANOIA
24
25typedef char *dqbuf_t;
26
27static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
28{
29 unsigned int epb = info->dqi_usable_bs >> 2;
30
31 depth = info->dqi_qtree_depth - depth - 1;
32 while (depth--)
33 id /= epb;
34 return id % epb;
35}
36
37/* Number of entries in one blocks */
38static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
39{
40 return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
41 / info->dqi_entry_size;
42}
43
44static dqbuf_t getdqbuf(size_t size)
45{
46 dqbuf_t buf = kmalloc(size, GFP_NOFS);
47 if (!buf)
48 printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
49 return buf;
50}
51
52static inline void freedqbuf(dqbuf_t buf)
53{
54 kfree(buf);
55}
56
57static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
58{
59 struct super_block *sb = info->dqi_sb;
60
61 memset(buf, 0, info->dqi_usable_bs);
62 return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
63 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
64}
65
66static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
67{
68 struct super_block *sb = info->dqi_sb;
69
70 return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
71 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
72}
73
74/* Remove empty block from list and return it */
75static int get_free_dqblk(struct qtree_mem_dqinfo *info)
76{
77 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
78 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
79 int ret, blk;
80
81 if (!buf)
82 return -ENOMEM;
83 if (info->dqi_free_blk) {
84 blk = info->dqi_free_blk;
85 ret = read_blk(info, blk, buf);
86 if (ret < 0)
87 goto out_buf;
88 info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
89 }
90 else {
91 memset(buf, 0, info->dqi_usable_bs);
92 /* Assure block allocation... */
93 ret = write_blk(info, info->dqi_blocks, buf);
94 if (ret < 0)
95 goto out_buf;
96 blk = info->dqi_blocks++;
97 }
98 mark_info_dirty(info->dqi_sb, info->dqi_type);
99 ret = blk;
100out_buf:
101 freedqbuf(buf);
102 return ret;
103}
104
105/* Insert empty block to the list */
106static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
107{
108 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
109 int err;
110
111 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
112 dh->dqdh_prev_free = cpu_to_le32(0);
113 dh->dqdh_entries = cpu_to_le16(0);
114 err = write_blk(info, blk, buf);
115 if (err < 0)
116 return err;
117 info->dqi_free_blk = blk;
118 mark_info_dirty(info->dqi_sb, info->dqi_type);
119 return 0;
120}
121
122/* Remove given block from the list of blocks with free entries */
123static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
124{
125 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
126 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
127 uint nextblk = le32_to_cpu(dh->dqdh_next_free);
128 uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
129 int err;
130
131 if (!tmpbuf)
132 return -ENOMEM;
133 if (nextblk) {
134 err = read_blk(info, nextblk, tmpbuf);
135 if (err < 0)
136 goto out_buf;
137 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
138 dh->dqdh_prev_free;
139 err = write_blk(info, nextblk, tmpbuf);
140 if (err < 0)
141 goto out_buf;
142 }
143 if (prevblk) {
144 err = read_blk(info, prevblk, tmpbuf);
145 if (err < 0)
146 goto out_buf;
147 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
148 dh->dqdh_next_free;
149 err = write_blk(info, prevblk, tmpbuf);
150 if (err < 0)
151 goto out_buf;
152 } else {
153 info->dqi_free_entry = nextblk;
154 mark_info_dirty(info->dqi_sb, info->dqi_type);
155 }
156 freedqbuf(tmpbuf);
157 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
158 /* No matter whether write succeeds block is out of list */
159 if (write_blk(info, blk, buf) < 0)
160 printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
161 return 0;
162out_buf:
163 freedqbuf(tmpbuf);
164 return err;
165}
166
167/* Insert given block to the beginning of list with free entries */
168static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
169{
170 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
171 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
172 int err;
173
174 if (!tmpbuf)
175 return -ENOMEM;
176 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
177 dh->dqdh_prev_free = cpu_to_le32(0);
178 err = write_blk(info, blk, buf);
179 if (err < 0)
180 goto out_buf;
181 if (info->dqi_free_entry) {
182 err = read_blk(info, info->dqi_free_entry, tmpbuf);
183 if (err < 0)
184 goto out_buf;
185 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
186 cpu_to_le32(blk);
187 err = write_blk(info, info->dqi_free_entry, tmpbuf);
188 if (err < 0)
189 goto out_buf;
190 }
191 freedqbuf(tmpbuf);
192 info->dqi_free_entry = blk;
193 mark_info_dirty(info->dqi_sb, info->dqi_type);
194 return 0;
195out_buf:
196 freedqbuf(tmpbuf);
197 return err;
198}
199
200/* Is the entry in the block free? */
201int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
202{
203 int i;
204
205 for (i = 0; i < info->dqi_entry_size; i++)
206 if (disk[i])
207 return 0;
208 return 1;
209}
210EXPORT_SYMBOL(qtree_entry_unused);
211
212/* Find space for dquot */
213static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
214 struct dquot *dquot, int *err)
215{
216 uint blk, i;
217 struct qt_disk_dqdbheader *dh;
218 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
219 char *ddquot;
220
221 *err = 0;
222 if (!buf) {
223 *err = -ENOMEM;
224 return 0;
225 }
226 dh = (struct qt_disk_dqdbheader *)buf;
227 if (info->dqi_free_entry) {
228 blk = info->dqi_free_entry;
229 *err = read_blk(info, blk, buf);
230 if (*err < 0)
231 goto out_buf;
232 } else {
233 blk = get_free_dqblk(info);
234 if ((int)blk < 0) {
235 *err = blk;
236 freedqbuf(buf);
237 return 0;
238 }
239 memset(buf, 0, info->dqi_usable_bs);
240 /* This is enough as block is already zeroed and entry list is empty... */
241 info->dqi_free_entry = blk;
242 mark_info_dirty(dquot->dq_sb, dquot->dq_type);
243 }
244 /* Block will be full? */
245 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
246 *err = remove_free_dqentry(info, buf, blk);
247 if (*err < 0) {
248 printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
249 "remove block (%u) from entry free list.\n",
250 blk);
251 goto out_buf;
252 }
253 }
254 le16_add_cpu(&dh->dqdh_entries, 1);
255 /* Find free structure in block */
256 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
257 i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
258 i++, ddquot += info->dqi_entry_size);
259#ifdef __QUOTA_QT_PARANOIA
260 if (i == qtree_dqstr_in_blk(info)) {
261 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
262 "but it shouldn't.\n");
263 *err = -EIO;
264 goto out_buf;
265 }
266#endif
267 *err = write_blk(info, blk, buf);
268 if (*err < 0) {
269 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
270 "data block %u.\n", blk);
271 goto out_buf;
272 }
273 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
274 sizeof(struct qt_disk_dqdbheader) +
275 i * info->dqi_entry_size;
276 freedqbuf(buf);
277 return blk;
278out_buf:
279 freedqbuf(buf);
280 return 0;
281}
282
/* Insert reference to structure into the trie */
/*
 * Recursively walk the radix tree from *treeblk, allocating interior
 * blocks as needed, and at the leaf level place the dquot via
 * find_free_dqentry().  On success the reference to the new child is
 * written back into this level's block.
 *
 * Returns 0 (or a positive block number propagated through ret) on
 * success, negative errno on failure.  On failure a tree block that was
 * freshly allocated at this level (newact) is returned to the free list.
 */
static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
			  uint *treeblk, int depth)
{
	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
	int ret = 0, newson = 0, newact = 0;
	__le32 *ref;
	uint newblk;

	if (!buf)
		return -ENOMEM;
	if (!*treeblk) {
		/* This level does not exist yet - allocate a zeroed block */
		ret = get_free_dqblk(info);
		if (ret < 0)
			goto out_buf;
		*treeblk = ret;
		memset(buf, 0, info->dqi_usable_bs);
		newact = 1;	/* remember we own this block on error */
	} else {
		ret = read_blk(info, *treeblk, buf);
		if (ret < 0) {
			printk(KERN_ERR "VFS: Can't read tree quota block "
			       "%u.\n", *treeblk);
			goto out_buf;
		}
	}
	ref = (__le32 *)buf;
	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
	if (!newblk)
		newson = 1;	/* child reference missing - must be linked in */
	if (depth == info->dqi_qtree_depth - 1) {
#ifdef __QUOTA_QT_PARANOIA
		/* A leaf reference already present means the entry would be
		 * inserted twice - treat as corruption. */
		if (newblk) {
			printk(KERN_ERR "VFS: Inserting already present quota "
			       "entry (block %u).\n",
			       le32_to_cpu(ref[get_index(info,
					dquot->dq_id, depth)]));
			ret = -EIO;
			goto out_buf;
		}
#endif
		newblk = find_free_dqentry(info, dquot, &ret);
	} else {
		ret = do_insert_tree(info, dquot, &newblk, depth+1);
	}
	if (newson && ret >= 0) {
		/* Link the new child into this level and persist the block */
		ref[get_index(info, dquot->dq_id, depth)] =
							cpu_to_le32(newblk);
		ret = write_blk(info, *treeblk, buf);
	} else if (newact && ret < 0) {
		/* Undo the allocation made above - descent failed */
		put_free_dqblk(info, buf, *treeblk);
	}
out_buf:
	freedqbuf(buf);
	return ret;
}
339
340/* Wrapper for inserting quota structure into tree */
341static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
342 struct dquot *dquot)
343{
344 int tmp = QT_TREEOFF;
345 return do_insert_tree(info, dquot, &tmp, 0);
346}
347
348/*
349 * We don't have to be afraid of deadlocks as we never have quotas on quota files...
350 */
351int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
352{
353 int type = dquot->dq_type;
354 struct super_block *sb = dquot->dq_sb;
355 ssize_t ret;
356 dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
357
358 if (!ddquot)
359 return -ENOMEM;
360
361 /* dq_off is guarded by dqio_mutex */
362 if (!dquot->dq_off) {
363 ret = dq_insert_tree(info, dquot);
364 if (ret < 0) {
365 printk(KERN_ERR "VFS: Error %zd occurred while "
366 "creating quota.\n", ret);
367 freedqbuf(ddquot);
368 return ret;
369 }
370 }
371 spin_lock(&dq_data_lock);
372 info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
373 spin_unlock(&dq_data_lock);
374 ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
375 info->dqi_entry_size, dquot->dq_off);
376 if (ret != info->dqi_entry_size) {
377 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
378 sb->s_id);
379 if (ret >= 0)
380 ret = -ENOSPC;
381 } else {
382 ret = 0;
383 }
384 dqstats.writes++;
385 freedqbuf(ddquot);
386
387 return ret;
388}
389EXPORT_SYMBOL(qtree_write_dquot);
390
391/* Free dquot entry in data block */
392static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
393 uint blk)
394{
395 struct qt_disk_dqdbheader *dh;
396 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
397 int ret = 0;
398
399 if (!buf)
400 return -ENOMEM;
401 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
402 printk(KERN_ERR "VFS: Quota structure has offset to other "
403 "block (%u) than it should (%u).\n", blk,
404 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
405 goto out_buf;
406 }
407 ret = read_blk(info, blk, buf);
408 if (ret < 0) {
409 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
410 goto out_buf;
411 }
412 dh = (struct qt_disk_dqdbheader *)buf;
413 le16_add_cpu(&dh->dqdh_entries, -1);
414 if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
415 ret = remove_free_dqentry(info, buf, blk);
416 if (ret >= 0)
417 ret = put_free_dqblk(info, buf, blk);
418 if (ret < 0) {
419 printk(KERN_ERR "VFS: Can't move quota data block (%u) "
420 "to free list.\n", blk);
421 goto out_buf;
422 }
423 } else {
424 memset(buf +
425 (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
426 0, info->dqi_entry_size);
427 if (le16_to_cpu(dh->dqdh_entries) ==
428 qtree_dqstr_in_blk(info) - 1) {
429 /* Insert will write block itself */
430 ret = insert_free_dqentry(info, buf, blk);
431 if (ret < 0) {
432 printk(KERN_ERR "VFS: Can't insert quota data "
433 "block (%u) to free entry list.\n", blk);
434 goto out_buf;
435 }
436 } else {
437 ret = write_blk(info, blk, buf);
438 if (ret < 0) {
439 printk(KERN_ERR "VFS: Can't write quota data "
440 "block %u\n", blk);
441 goto out_buf;
442 }
443 }
444 }
445 dquot->dq_off = 0; /* Quota is now unattached */
446out_buf:
447 freedqbuf(buf);
448 return ret;
449}
450
451/* Remove reference to dquot from tree */
452static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
453 uint *blk, int depth)
454{
455 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
456 int ret = 0;
457 uint newblk;
458 __le32 *ref = (__le32 *)buf;
459
460 if (!buf)
461 return -ENOMEM;
462 ret = read_blk(info, *blk, buf);
463 if (ret < 0) {
464 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
465 goto out_buf;
466 }
467 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
468 if (depth == info->dqi_qtree_depth - 1) {
469 ret = free_dqentry(info, dquot, newblk);
470 newblk = 0;
471 } else {
472 ret = remove_tree(info, dquot, &newblk, depth+1);
473 }
474 if (ret >= 0 && !newblk) {
475 int i;
476 ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
477 /* Block got empty? */
478 for (i = 0;
479 i < (info->dqi_usable_bs >> 2) && !ref[i];
480 i++);
481 /* Don't put the root block into the free block list */
482 if (i == (info->dqi_usable_bs >> 2)
483 && *blk != QT_TREEOFF) {
484 put_free_dqblk(info, buf, *blk);
485 *blk = 0;
486 } else {
487 ret = write_blk(info, *blk, buf);
488 if (ret < 0)
489 printk(KERN_ERR "VFS: Can't write quota tree "
490 "block %u.\n", *blk);
491 }
492 }
493out_buf:
494 freedqbuf(buf);
495 return ret;
496}
497
498/* Delete dquot from tree */
499int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
500{
501 uint tmp = QT_TREEOFF;
502
503 if (!dquot->dq_off) /* Even not allocated? */
504 return 0;
505 return remove_tree(info, dquot, &tmp, 0);
506}
507EXPORT_SYMBOL(qtree_delete_dquot);
508
509/* Find entry in block */
510static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
511 struct dquot *dquot, uint blk)
512{
513 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
514 loff_t ret = 0;
515 int i;
516 char *ddquot;
517
518 if (!buf)
519 return -ENOMEM;
520 ret = read_blk(info, blk, buf);
521 if (ret < 0) {
522 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
523 goto out_buf;
524 }
525 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
526 i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
527 i++, ddquot += info->dqi_entry_size);
528 if (i == qtree_dqstr_in_blk(info)) {
529 printk(KERN_ERR "VFS: Quota for id %u referenced "
530 "but not present.\n", dquot->dq_id);
531 ret = -EIO;
532 goto out_buf;
533 } else {
534 ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
535 qt_disk_dqdbheader) + i * info->dqi_entry_size;
536 }
537out_buf:
538 freedqbuf(buf);
539 return ret;
540}
541
542/* Find entry for given id in the tree */
543static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
544 struct dquot *dquot, uint blk, int depth)
545{
546 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
547 loff_t ret = 0;
548 __le32 *ref = (__le32 *)buf;
549
550 if (!buf)
551 return -ENOMEM;
552 ret = read_blk(info, blk, buf);
553 if (ret < 0) {
554 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
555 goto out_buf;
556 }
557 ret = 0;
558 blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
559 if (!blk) /* No reference? */
560 goto out_buf;
561 if (depth < info->dqi_qtree_depth - 1)
562 ret = find_tree_dqentry(info, dquot, blk, depth+1);
563 else
564 ret = find_block_dqentry(info, dquot, blk);
565out_buf:
566 freedqbuf(buf);
567 return ret;
568}
569
/* Find entry for given id in the tree - wrapper function */
static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
				  struct dquot *dquot)
{
	/* Start the lookup at the root tree block (QT_TREEOFF), depth 0 */
	return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
}
576
/*
 * Read the dquot's on-disk structure into memory.  Locates the entry in
 * the quota file first when dq_off is not yet known; a missing entry is
 * not an error - the dquot is marked DQ_FAKE with zeroed limits.
 * Returns 0 on success, negative errno on failure.
 */
int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
	int type = dquot->dq_type;
	struct super_block *sb = dquot->dq_sb;
	loff_t offset;
	dqbuf_t ddquot;
	int ret = 0;

#ifdef __QUOTA_QT_PARANOIA
	/* Invalidated quota? */
	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
		return -EIO;
	}
#endif
	/* Do we know offset of the dquot entry in the quota file? */
	if (!dquot->dq_off) {
		offset = find_dqentry(info, dquot);
		if (offset <= 0) {	/* Entry not present? */
			if (offset < 0)
				printk(KERN_ERR "VFS: Can't read quota "
					"structure for id %u.\n", dquot->dq_id);
			/* Not found: fake dquot with zeroed limits/usage */
			dquot->dq_off = 0;
			set_bit(DQ_FAKE_B, &dquot->dq_flags);
			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
			ret = offset;
			goto out;
		}
		dquot->dq_off = offset;
	}
	/* NOTE(review): this -ENOMEM return bypasses the dqstats.reads++
	 * accounting at 'out' that every other path performs - confirm
	 * whether that asymmetry is intentional. */
	ddquot = getdqbuf(info->dqi_entry_size);
	if (!ddquot)
		return -ENOMEM;
	ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
				   info->dqi_entry_size, dquot->dq_off);
	if (ret != info->dqi_entry_size) {
		/* Short read with no errno - map to -EIO */
		if (ret >= 0)
			ret = -EIO;
		printk(KERN_ERR "VFS: Error while reading quota "
		       "structure for id %u.\n", dquot->dq_id);
		set_bit(DQ_FAKE_B, &dquot->dq_flags);
		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
		freedqbuf(ddquot);
		goto out;
	}
	spin_lock(&dq_data_lock);
	info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
	/* An entry with no limits set at all behaves as a fake dquot */
	if (!dquot->dq_dqb.dqb_bhardlimit &&
	    !dquot->dq_dqb.dqb_bsoftlimit &&
	    !dquot->dq_dqb.dqb_ihardlimit &&
	    !dquot->dq_dqb.dqb_isoftlimit)
		set_bit(DQ_FAKE_B, &dquot->dq_flags);
	spin_unlock(&dq_data_lock);
	freedqbuf(ddquot);
out:
	dqstats.reads++;
	return ret;
}
EXPORT_SYMBOL(qtree_read_dquot);
636
637/* Check whether dquot should not be deleted. We know we are
638 * the only one operating on dquot (thanks to dq_lock) */
639int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
640{
641 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
642 return qtree_delete_dquot(info, dquot);
643 return 0;
644}
645EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 000000000000..a1ab8db81a51
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTA_TREE_H
6#define _LINUX_QUOTA_TREE_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Structure of header of block with quota structures. It is padded to 16 bytes so
13 * there will be space for exactly 21 quota-entries in a block
14 */
15struct qt_disk_dqdbheader {
16 __le32 dqdh_next_free; /* Number of next block with free entry */
17 __le32 dqdh_prev_free; /* Number of previous block with free entry */
18 __le16 dqdh_entries; /* Number of valid entries in block */
19 __le16 dqdh_pad1;
20 __le32 dqdh_pad2;
21};
22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24
25#endif /* _LINUX_QUOTA_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb0..b4af1c69ad16 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
3#include <linux/quota.h> 3#include <linux/quota.h>
4#include <linux/quotaops.h> 4#include <linux/quotaops.h>
5#include <linux/dqblk_v1.h> 5#include <linux/dqblk_v1.h>
6#include <linux/quotaio_v1.h>
7#include <linux/kernel.h> 6#include <linux/kernel.h>
8#include <linux/init.h> 7#include <linux/init.h>
9#include <linux/module.h> 8#include <linux/module.h>
10 9
11#include <asm/byteorder.h> 10#include <asm/byteorder.h>
12 11
12#include "quotaio_v1.h"
13
13MODULE_AUTHOR("Jan Kara"); 14MODULE_AUTHOR("Jan Kara");
14MODULE_DESCRIPTION("Old quota format support"); 15MODULE_DESCRIPTION("Old quota format support");
15MODULE_LICENSE("GPL"); 16MODULE_LICENSE("GPL");
16 17
18#define QUOTABLOCK_BITS 10
19#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
20
21static inline qsize_t v1_stoqb(qsize_t space)
22{
23 return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
24}
25
26static inline qsize_t v1_qbtos(qsize_t blocks)
27{
28 return blocks << QUOTABLOCK_BITS;
29}
30
17static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) 31static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
18{ 32{
19 m->dqb_ihardlimit = d->dqb_ihardlimit; 33 m->dqb_ihardlimit = d->dqb_ihardlimit;
20 m->dqb_isoftlimit = d->dqb_isoftlimit; 34 m->dqb_isoftlimit = d->dqb_isoftlimit;
21 m->dqb_curinodes = d->dqb_curinodes; 35 m->dqb_curinodes = d->dqb_curinodes;
22 m->dqb_bhardlimit = d->dqb_bhardlimit; 36 m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
23 m->dqb_bsoftlimit = d->dqb_bsoftlimit; 37 m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
24 m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS; 38 m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
25 m->dqb_itime = d->dqb_itime; 39 m->dqb_itime = d->dqb_itime;
26 m->dqb_btime = d->dqb_btime; 40 m->dqb_btime = d->dqb_btime;
27} 41}
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
31 d->dqb_ihardlimit = m->dqb_ihardlimit; 45 d->dqb_ihardlimit = m->dqb_ihardlimit;
32 d->dqb_isoftlimit = m->dqb_isoftlimit; 46 d->dqb_isoftlimit = m->dqb_isoftlimit;
33 d->dqb_curinodes = m->dqb_curinodes; 47 d->dqb_curinodes = m->dqb_curinodes;
34 d->dqb_bhardlimit = m->dqb_bhardlimit; 48 d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
35 d->dqb_bsoftlimit = m->dqb_bsoftlimit; 49 d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
36 d->dqb_curblocks = toqb(m->dqb_curspace); 50 d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
37 d->dqb_itime = m->dqb_itime; 51 d->dqb_itime = m->dqb_itime;
38 d->dqb_btime = m->dqb_btime; 52 d->dqb_btime = m->dqb_btime;
39} 53}
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d9..b618b563635c 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
6#include <linux/fs.h> 6#include <linux/fs.h>
7#include <linux/mount.h> 7#include <linux/mount.h>
8#include <linux/dqblk_v2.h> 8#include <linux/dqblk_v2.h>
9#include <linux/quotaio_v2.h>
10#include <linux/kernel.h> 9#include <linux/kernel.h>
11#include <linux/init.h> 10#include <linux/init.h>
12#include <linux/module.h> 11#include <linux/module.h>
@@ -15,16 +14,37 @@
15 14
16#include <asm/byteorder.h> 15#include <asm/byteorder.h>
17 16
17#include "quota_tree.h"
18#include "quotaio_v2.h"
19
18MODULE_AUTHOR("Jan Kara"); 20MODULE_AUTHOR("Jan Kara");
19MODULE_DESCRIPTION("Quota format v2 support"); 21MODULE_DESCRIPTION("Quota format v2 support");
20MODULE_LICENSE("GPL"); 22MODULE_LICENSE("GPL");
21 23
22#define __QUOTA_V2_PARANOIA 24#define __QUOTA_V2_PARANOIA
23 25
24typedef char *dqbuf_t; 26static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
27static void v2_disk2memdqb(struct dquot *dquot, void *dp);
28static int v2_is_id(void *dp, struct dquot *dquot);
29
30static struct qtree_fmt_operations v2_qtree_ops = {
31 .mem2disk_dqblk = v2_mem2diskdqb,
32 .disk2mem_dqblk = v2_disk2memdqb,
33 .is_id = v2_is_id,
34};
35
36#define QUOTABLOCK_BITS 10
37#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
25 38
26#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) 39static inline qsize_t v2_stoqb(qsize_t space)
27#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) 40{
41 return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
42}
43
44static inline qsize_t v2_qbtos(qsize_t blocks)
45{
46 return blocks << QUOTABLOCK_BITS;
47}
28 48
29/* Check whether given file is really vfsv0 quotafile */ 49/* Check whether given file is really vfsv0 quotafile */
30static int v2_check_quota_file(struct super_block *sb, int type) 50static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
50static int v2_read_file_info(struct super_block *sb, int type) 70static int v2_read_file_info(struct super_block *sb, int type)
51{ 71{
52 struct v2_disk_dqinfo dinfo; 72 struct v2_disk_dqinfo dinfo;
53 struct mem_dqinfo *info = sb_dqopt(sb)->info+type; 73 struct mem_dqinfo *info = sb_dqinfo(sb, type);
74 struct qtree_mem_dqinfo *qinfo;
54 ssize_t size; 75 ssize_t size;
55 76
56 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 77 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
60 sb->s_id); 81 sb->s_id);
61 return -1; 82 return -1;
62 } 83 }
84 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
85 if (!info->dqi_priv) {
86 printk(KERN_WARNING
87 "Not enough memory for quota information structure.\n");
88 return -1;
89 }
90 qinfo = info->dqi_priv;
63 /* limits are stored as unsigned 32-bit data */ 91 /* limits are stored as unsigned 32-bit data */
64 info->dqi_maxblimit = 0xffffffff; 92 info->dqi_maxblimit = 0xffffffff;
65 info->dqi_maxilimit = 0xffffffff; 93 info->dqi_maxilimit = 0xffffffff;
66 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 94 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
67 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 95 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
68 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); 96 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
69 info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); 97 qinfo->dqi_sb = sb;
70 info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); 98 qinfo->dqi_type = type;
71 info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); 99 qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
100 qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
101 qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
102 qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
103 qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
104 qinfo->dqi_qtree_depth = qtree_depth(qinfo);
105 qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
106 qinfo->dqi_ops = &v2_qtree_ops;
72 return 0; 107 return 0;
73} 108}
74 109
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
76static int v2_write_file_info(struct super_block *sb, int type) 111static int v2_write_file_info(struct super_block *sb, int type)
77{ 112{
78 struct v2_disk_dqinfo dinfo; 113 struct v2_disk_dqinfo dinfo;
79 struct mem_dqinfo *info = sb_dqopt(sb)->info+type; 114 struct mem_dqinfo *info = sb_dqinfo(sb, type);
115 struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
80 ssize_t size; 116 ssize_t size;
81 117
82 spin_lock(&dq_data_lock); 118 spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
85 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); 121 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
86 dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 122 dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
87 spin_unlock(&dq_data_lock); 123 spin_unlock(&dq_data_lock);
88 dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); 124 dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
89 dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); 125 dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
90 dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry); 126 dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
91 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 127 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
92 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 128 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
93 if (size != sizeof(struct v2_disk_dqinfo)) { 129 if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
98 return 0; 134 return 0;
99} 135}
100 136
101static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) 137static void v2_disk2memdqb(struct dquot *dquot, void *dp)
102{ 138{
139 struct v2_disk_dqblk *d = dp, empty;
140 struct mem_dqblk *m = &dquot->dq_dqb;
141
103 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); 142 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
104 m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); 143 m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
105 m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); 144 m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
106 m->dqb_itime = le64_to_cpu(d->dqb_itime); 145 m->dqb_itime = le64_to_cpu(d->dqb_itime);
107 m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); 146 m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
108 m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); 147 m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
109 m->dqb_curspace = le64_to_cpu(d->dqb_curspace); 148 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
110 m->dqb_btime = le64_to_cpu(d->dqb_btime); 149 m->dqb_btime = le64_to_cpu(d->dqb_btime);
150 /* We need to escape back all-zero structure */
151 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
152 empty.dqb_itime = cpu_to_le64(1);
153 if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
154 m->dqb_itime = 0;
111} 155}
112 156
113static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) 157static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
114{ 158{
159 struct v2_disk_dqblk *d = dp;
160 struct mem_dqblk *m = &dquot->dq_dqb;
161 struct qtree_mem_dqinfo *info =
162 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
163
115 d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); 164 d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
116 d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); 165 d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
117 d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); 166 d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
118 d->dqb_itime = cpu_to_le64(m->dqb_itime); 167 d->dqb_itime = cpu_to_le64(m->dqb_itime);
119 d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); 168 d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
120 d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); 169 d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
121 d->dqb_curspace = cpu_to_le64(m->dqb_curspace); 170 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
122 d->dqb_btime = cpu_to_le64(m->dqb_btime); 171 d->dqb_btime = cpu_to_le64(m->dqb_btime);
123 d->dqb_id = cpu_to_le32(id); 172 d->dqb_id = cpu_to_le32(dquot->dq_id);
124} 173 if (qtree_entry_unused(info, dp))
125 174 d->dqb_itime = cpu_to_le64(1);
126static dqbuf_t getdqbuf(void)
127{
128 dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
129 if (!buf)
130 printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
131 return buf;
132}
133
134static inline void freedqbuf(dqbuf_t buf)
135{
136 kfree(buf);
137}
138
139static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
140{
141 memset(buf, 0, V2_DQBLKSIZE);
142 return sb->s_op->quota_read(sb, type, (char *)buf,
143 V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
144}
145
146static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
147{
148 return sb->s_op->quota_write(sb, type, (char *)buf,
149 V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
150}
151
152/* Remove empty block from list and return it */
153static int get_free_dqblk(struct super_block *sb, int type)
154{
155 dqbuf_t buf = getdqbuf();
156 struct mem_dqinfo *info = sb_dqinfo(sb, type);
157 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
158 int ret, blk;
159
160 if (!buf)
161 return -ENOMEM;
162 if (info->u.v2_i.dqi_free_blk) {
163 blk = info->u.v2_i.dqi_free_blk;
164 if ((ret = read_blk(sb, type, blk, buf)) < 0)
165 goto out_buf;
166 info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
167 }
168 else {
169 memset(buf, 0, V2_DQBLKSIZE);
170 /* Assure block allocation... */
171 if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
172 goto out_buf;
173 blk = info->u.v2_i.dqi_blocks++;
174 }
175 mark_info_dirty(sb, type);
176 ret = blk;
177out_buf:
178 freedqbuf(buf);
179 return ret;
180}
181
182/* Insert empty block to the list */
183static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
184{
185 struct mem_dqinfo *info = sb_dqinfo(sb, type);
186 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
187 int err;
188
189 dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
190 dh->dqdh_prev_free = cpu_to_le32(0);
191 dh->dqdh_entries = cpu_to_le16(0);
192 info->u.v2_i.dqi_free_blk = blk;
193 mark_info_dirty(sb, type);
194 /* Some strange block. We had better leave it... */
195 if ((err = write_blk(sb, type, blk, buf)) < 0)
196 return err;
197 return 0;
198} 175}
199 176
200/* Remove given block from the list of blocks with free entries */ 177static int v2_is_id(void *dp, struct dquot *dquot)
201static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
202{ 178{
203 dqbuf_t tmpbuf = getdqbuf(); 179 struct v2_disk_dqblk *d = dp;
204 struct mem_dqinfo *info = sb_dqinfo(sb, type); 180 struct qtree_mem_dqinfo *info =
205 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; 181 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
206 uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
207 int err;
208 182
209 if (!tmpbuf) 183 if (qtree_entry_unused(info, dp))
210 return -ENOMEM;
211 if (nextblk) {
212 if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
213 goto out_buf;
214 ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
215 if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
216 goto out_buf;
217 }
218 if (prevblk) {
219 if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
220 goto out_buf;
221 ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
222 if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
223 goto out_buf;
224 }
225 else {
226 info->u.v2_i.dqi_free_entry = nextblk;
227 mark_info_dirty(sb, type);
228 }
229 freedqbuf(tmpbuf);
230 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
231 /* No matter whether write succeeds block is out of list */
232 if (write_blk(sb, type, blk, buf) < 0)
233 printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
234 return 0;
235out_buf:
236 freedqbuf(tmpbuf);
237 return err;
238}
239
240/* Insert given block to the beginning of list with free entries */
241static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
242{
243 dqbuf_t tmpbuf = getdqbuf();
244 struct mem_dqinfo *info = sb_dqinfo(sb, type);
245 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
246 int err;
247
248 if (!tmpbuf)
249 return -ENOMEM;
250 dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
251 dh->dqdh_prev_free = cpu_to_le32(0);
252 if ((err = write_blk(sb, type, blk, buf)) < 0)
253 goto out_buf;
254 if (info->u.v2_i.dqi_free_entry) {
255 if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
256 goto out_buf;
257 ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
258 if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
259 goto out_buf;
260 }
261 freedqbuf(tmpbuf);
262 info->u.v2_i.dqi_free_entry = blk;
263 mark_info_dirty(sb, type);
264 return 0;
265out_buf:
266 freedqbuf(tmpbuf);
267 return err;
268}
269
270/* Find space for dquot */
271static uint find_free_dqentry(struct dquot *dquot, int *err)
272{
273 struct super_block *sb = dquot->dq_sb;
274 struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
275 uint blk, i;
276 struct v2_disk_dqdbheader *dh;
277 struct v2_disk_dqblk *ddquot;
278 struct v2_disk_dqblk fakedquot;
279 dqbuf_t buf;
280
281 *err = 0;
282 if (!(buf = getdqbuf())) {
283 *err = -ENOMEM;
284 return 0; 184 return 0;
285 } 185 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
286 dh = (struct v2_disk_dqdbheader *)buf;
287 ddquot = GETENTRIES(buf);
288 if (info->u.v2_i.dqi_free_entry) {
289 blk = info->u.v2_i.dqi_free_entry;
290 if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
291 goto out_buf;
292 }
293 else {
294 blk = get_free_dqblk(sb, dquot->dq_type);
295 if ((int)blk < 0) {
296 *err = blk;
297 freedqbuf(buf);
298 return 0;
299 }
300 memset(buf, 0, V2_DQBLKSIZE);
301 /* This is enough as block is already zeroed and entry list is empty... */
302 info->u.v2_i.dqi_free_entry = blk;
303 mark_info_dirty(sb, dquot->dq_type);
304 }
305 if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */
306 if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
307 printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
308 goto out_buf;
309 }
310 le16_add_cpu(&dh->dqdh_entries, 1);
311 memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
312 /* Find free structure in block */
313 for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
314#ifdef __QUOTA_V2_PARANOIA
315 if (i == V2_DQSTRINBLK) {
316 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
317 *err = -EIO;
318 goto out_buf;
319 }
320#endif
321 if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
322 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
323 goto out_buf;
324 }
325 dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
326 freedqbuf(buf);
327 return blk;
328out_buf:
329 freedqbuf(buf);
330 return 0;
331}
332
333/* Insert reference to structure into the trie */
334static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
335{
336 struct super_block *sb = dquot->dq_sb;
337 dqbuf_t buf;
338 int ret = 0, newson = 0, newact = 0;
339 __le32 *ref;
340 uint newblk;
341
342 if (!(buf = getdqbuf()))
343 return -ENOMEM;
344 if (!*treeblk) {
345 ret = get_free_dqblk(sb, dquot->dq_type);
346 if (ret < 0)
347 goto out_buf;
348 *treeblk = ret;
349 memset(buf, 0, V2_DQBLKSIZE);
350 newact = 1;
351 }
352 else {
353 if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
354 printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
355 goto out_buf;
356 }
357 }
358 ref = (__le32 *)buf;
359 newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
360 if (!newblk)
361 newson = 1;
362 if (depth == V2_DQTREEDEPTH-1) {
363#ifdef __QUOTA_V2_PARANOIA
364 if (newblk) {
365 printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
366 ret = -EIO;
367 goto out_buf;
368 }
369#endif
370 newblk = find_free_dqentry(dquot, &ret);
371 }
372 else
373 ret = do_insert_tree(dquot, &newblk, depth+1);
374 if (newson && ret >= 0) {
375 ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
376 ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
377 }
378 else if (newact && ret < 0)
379 put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
380out_buf:
381 freedqbuf(buf);
382 return ret;
383} 186}
384 187
385/* Wrapper for inserting quota structure into tree */ 188static int v2_read_dquot(struct dquot *dquot)
386static inline int dq_insert_tree(struct dquot *dquot)
387{ 189{
388 int tmp = V2_DQTREEOFF; 190 return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
389 return do_insert_tree(dquot, &tmp, 0);
390} 191}
391 192
392/*
393 * We don't have to be afraid of deadlocks as we never have quotas on quota files...
394 */
395static int v2_write_dquot(struct dquot *dquot) 193static int v2_write_dquot(struct dquot *dquot)
396{ 194{
397 int type = dquot->dq_type; 195 return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
398 ssize_t ret;
399 struct v2_disk_dqblk ddquot, empty;
400
401 /* dq_off is guarded by dqio_mutex */
402 if (!dquot->dq_off)
403 if ((ret = dq_insert_tree(dquot)) < 0) {
404 printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
405 return ret;
406 }
407 spin_lock(&dq_data_lock);
408 mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
409 /* Argh... We may need to write structure full of zeroes but that would be
410 * treated as an empty place by the rest of the code. Format change would
411 * be definitely cleaner but the problems probably are not worth it */
412 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
413 if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
414 ddquot.dqb_itime = cpu_to_le64(1);
415 spin_unlock(&dq_data_lock);
416 ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
417 (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
418 if (ret != sizeof(struct v2_disk_dqblk)) {
419 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
420 if (ret >= 0)
421 ret = -ENOSPC;
422 }
423 else
424 ret = 0;
425 dqstats.writes++;
426
427 return ret;
428} 196}
429 197
430/* Free dquot entry in data block */ 198static int v2_release_dquot(struct dquot *dquot)
431static int free_dqentry(struct dquot *dquot, uint blk)
432{
433 struct super_block *sb = dquot->dq_sb;
434 int type = dquot->dq_type;
435 struct v2_disk_dqdbheader *dh;
436 dqbuf_t buf = getdqbuf();
437 int ret = 0;
438
439 if (!buf)
440 return -ENOMEM;
441 if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
442 printk(KERN_ERR "VFS: Quota structure has offset to other "
443 "block (%u) than it should (%u).\n", blk,
444 (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
445 goto out_buf;
446 }
447 if ((ret = read_blk(sb, type, blk, buf)) < 0) {
448 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
449 goto out_buf;
450 }
451 dh = (struct v2_disk_dqdbheader *)buf;
452 le16_add_cpu(&dh->dqdh_entries, -1);
453 if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
454 if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
455 (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
456 printk(KERN_ERR "VFS: Can't move quota data block (%u) "
457 "to free list.\n", blk);
458 goto out_buf;
459 }
460 }
461 else {
462 memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
463 sizeof(struct v2_disk_dqblk));
464 if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
465 /* Insert will write block itself */
466 if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
467 printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
468 goto out_buf;
469 }
470 }
471 else
472 if ((ret = write_blk(sb, type, blk, buf)) < 0) {
473 printk(KERN_ERR "VFS: Can't write quota data "
474 "block %u\n", blk);
475 goto out_buf;
476 }
477 }
478 dquot->dq_off = 0; /* Quota is now unattached */
479out_buf:
480 freedqbuf(buf);
481 return ret;
482}
483
484/* Remove reference to dquot from tree */
485static int remove_tree(struct dquot *dquot, uint *blk, int depth)
486{
487 struct super_block *sb = dquot->dq_sb;
488 int type = dquot->dq_type;
489 dqbuf_t buf = getdqbuf();
490 int ret = 0;
491 uint newblk;
492 __le32 *ref = (__le32 *)buf;
493
494 if (!buf)
495 return -ENOMEM;
496 if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
497 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
498 goto out_buf;
499 }
500 newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
501 if (depth == V2_DQTREEDEPTH-1) {
502 ret = free_dqentry(dquot, newblk);
503 newblk = 0;
504 }
505 else
506 ret = remove_tree(dquot, &newblk, depth+1);
507 if (ret >= 0 && !newblk) {
508 int i;
509 ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
510 for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */
511 /* Don't put the root block into the free block list */
512 if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
513 put_free_dqblk(sb, type, buf, *blk);
514 *blk = 0;
515 }
516 else
517 if ((ret = write_blk(sb, type, *blk, buf)) < 0)
518 printk(KERN_ERR "VFS: Can't write quota tree "
519 "block %u.\n", *blk);
520 }
521out_buf:
522 freedqbuf(buf);
523 return ret;
524}
525
526/* Delete dquot from tree */
527static int v2_delete_dquot(struct dquot *dquot)
528{
529 uint tmp = V2_DQTREEOFF;
530
531 if (!dquot->dq_off) /* Even not allocated? */
532 return 0;
533 return remove_tree(dquot, &tmp, 0);
534}
535
536/* Find entry in block */
537static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
538{
539 dqbuf_t buf = getdqbuf();
540 loff_t ret = 0;
541 int i;
542 struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
543
544 if (!buf)
545 return -ENOMEM;
546 if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
547 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
548 goto out_buf;
549 }
550 if (dquot->dq_id)
551 for (i = 0; i < V2_DQSTRINBLK &&
552 le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
553 else { /* ID 0 as a bit more complicated searching... */
554 struct v2_disk_dqblk fakedquot;
555
556 memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
557 for (i = 0; i < V2_DQSTRINBLK; i++)
558 if (!le32_to_cpu(ddquot[i].dqb_id) &&
559 memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
560 break;
561 }
562 if (i == V2_DQSTRINBLK) {
563 printk(KERN_ERR "VFS: Quota for id %u referenced "
564 "but not present.\n", dquot->dq_id);
565 ret = -EIO;
566 goto out_buf;
567 }
568 else
569 ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
570 v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
571out_buf:
572 freedqbuf(buf);
573 return ret;
574}
575
576/* Find entry for given id in the tree */
577static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
578{
579 dqbuf_t buf = getdqbuf();
580 loff_t ret = 0;
581 __le32 *ref = (__le32 *)buf;
582
583 if (!buf)
584 return -ENOMEM;
585 if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
586 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
587 goto out_buf;
588 }
589 ret = 0;
590 blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
591 if (!blk) /* No reference? */
592 goto out_buf;
593 if (depth < V2_DQTREEDEPTH-1)
594 ret = find_tree_dqentry(dquot, blk, depth+1);
595 else
596 ret = find_block_dqentry(dquot, blk);
597out_buf:
598 freedqbuf(buf);
599 return ret;
600}
601
602/* Find entry for given id in the tree - wrapper function */
603static inline loff_t find_dqentry(struct dquot *dquot)
604{
605 return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
606}
607
608static int v2_read_dquot(struct dquot *dquot)
609{ 199{
610 int type = dquot->dq_type; 200 return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
611 loff_t offset;
612 struct v2_disk_dqblk ddquot, empty;
613 int ret = 0;
614
615#ifdef __QUOTA_V2_PARANOIA
616 /* Invalidated quota? */
617 if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
618 printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
619 return -EIO;
620 }
621#endif
622 offset = find_dqentry(dquot);
623 if (offset <= 0) { /* Entry not present? */
624 if (offset < 0)
625 printk(KERN_ERR "VFS: Can't read quota "
626 "structure for id %u.\n", dquot->dq_id);
627 dquot->dq_off = 0;
628 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
630 ret = offset;
631 }
632 else {
633 dquot->dq_off = offset;
634 if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
635 (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
636 != sizeof(struct v2_disk_dqblk)) {
637 if (ret >= 0)
638 ret = -EIO;
639 printk(KERN_ERR "VFS: Error while reading quota "
640 "structure for id %u.\n", dquot->dq_id);
641 memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
642 }
643 else {
644 ret = 0;
645 /* We need to escape back all-zero structure */
646 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
647 empty.dqb_itime = cpu_to_le64(1);
648 if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
649 ddquot.dqb_itime = 0;
650 }
651 disk2memdqb(&dquot->dq_dqb, &ddquot);
652 if (!dquot->dq_dqb.dqb_bhardlimit &&
653 !dquot->dq_dqb.dqb_bsoftlimit &&
654 !dquot->dq_dqb.dqb_ihardlimit &&
655 !dquot->dq_dqb.dqb_isoftlimit)
656 set_bit(DQ_FAKE_B, &dquot->dq_flags);
657 }
658 dqstats.reads++;
659
660 return ret;
661} 201}
662 202
663/* Check whether dquot should not be deleted. We know we are 203static int v2_free_file_info(struct super_block *sb, int type)
664 * the only one operating on dquot (thanks to dq_lock) */
665static int v2_release_dquot(struct dquot *dquot)
666{ 204{
667 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) 205 kfree(sb_dqinfo(sb, type)->dqi_priv);
668 return v2_delete_dquot(dquot);
669 return 0; 206 return 0;
670} 207}
671 208
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
673 .check_quota_file = v2_check_quota_file, 210 .check_quota_file = v2_check_quota_file,
674 .read_file_info = v2_read_file_info, 211 .read_file_info = v2_read_file_info,
675 .write_file_info = v2_write_file_info, 212 .write_file_info = v2_write_file_info,
676 .free_file_info = NULL, 213 .free_file_info = v2_free_file_info,
677 .read_dqblk = v2_read_dquot, 214 .read_dqblk = v2_read_dquot,
678 .commit_dqblk = v2_write_dquot, 215 .commit_dqblk = v2_write_dquot,
679 .release_dqblk = v2_release_dquot, 216 .release_dqblk = v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 000000000000..746654b5de70
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_QUOTAIO_V1_H
2#define _LINUX_QUOTAIO_V1_H
3
4#include <linux/types.h>
5
6/*
7 * The following constants define the amount of time given a user
8 * before the soft limits are treated as hard limits (usually resulting
9 * in an allocation failure). The timer is started when the user crosses
10 * their soft limit, it is reset when they go below their soft limit.
11 */
12#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
13#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
14
15/*
16 * The following structure defines the format of the disk quota file
17 * (as it appears on disk) - the file is an array of these structures
18 * indexed by user or group number.
19 */
20struct v1_disk_dqblk {
21 __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */
22 __u32 dqb_bsoftlimit; /* preferred limit on disk blks */
23 __u32 dqb_curblocks; /* current block count */
24 __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
25 __u32 dqb_isoftlimit; /* preferred inode limit */
26 __u32 dqb_curinodes; /* current # allocated inodes */
27 time_t dqb_btime; /* time limit for excessive disk use */
28 time_t dqb_itime; /* time limit for excessive inode use */
29};
30
31#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
32
33#endif /* _LINUX_QUOTAIO_V1_H */
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 000000000000..530fe580685c
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTAIO_V2_H
6#define _LINUX_QUOTAIO_V2_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Definitions of magics and versions of current quota files
13 */
14#define V2_INITQMAGICS {\
15 0xd9c01f11, /* USRQUOTA */\
16 0xd9c01927 /* GRPQUOTA */\
17}
18
19#define V2_INITQVERSIONS {\
20 0, /* USRQUOTA */\
21 0 /* GRPQUOTA */\
22}
23
24/* First generic header */
25struct v2_disk_dqheader {
26 __le32 dqh_magic; /* Magic number identifying file */
27 __le32 dqh_version; /* File version */
28};
29
30/*
31 * The following structure defines the format of the disk quota file
32 * (as it appears on disk) - the file is a radix tree whose leaves point
33 * to blocks of these structures.
34 */
35struct v2_disk_dqblk {
36 __le32 dqb_id; /* id this quota applies to */
37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
38 __le32 dqb_isoftlimit; /* preferred inode limit */
39 __le32 dqb_curinodes; /* current # allocated inodes */
40 __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
41 __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
42 __le64 dqb_curspace; /* current space occupied (in bytes) */
43 __le64 dqb_btime; /* time limit for excessive disk use */
44 __le64 dqb_itime; /* time limit for excessive inode use */
45};
46
47/* Header with type and version specific information */
48struct v2_disk_dqinfo {
49 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
50 __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
51 __le32 dqi_flags; /* Flags for quotafile (DQF_*) */
52 __le32 dqi_blocks; /* Number of blocks in file */
53 __le32 dqi_free_blk; /* Number of first free block in the list */
54 __le32 dqi_free_entry; /* Number of block with at least one free entry */
55};
56
57#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */
58#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */
59
60#endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae33..b7e6ac706b87 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
57 inode->i_mode = mode; 57 inode->i_mode = mode;
58 inode->i_uid = current_fsuid(); 58 inode->i_uid = current_fsuid();
59 inode->i_gid = current_fsgid(); 59 inode->i_gid = current_fsgid();
60 inode->i_blocks = 0;
61 inode->i_mapping->a_ops = &ramfs_aops; 60 inode->i_mapping->a_ops = &ramfs_aops;
62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 61 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
63 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 62 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020b..5cc6924eb158 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
50 offset += inode->i_size; 50 offset += inode->i_size;
51 break; 51 break;
52 case SEEK_CUR: 52 case SEEK_CUR:
53 /*
54 * Here we special-case the lseek(fd, 0, SEEK_CUR)
55 * position-querying operation. Avoid rewriting the "same"
56 * f_pos value back to the file because a concurrent read(),
57 * write() or lseek() might have altered it
58 */
59 if (offset == 0)
60 return file->f_pos;
53 offset += file->f_pos; 61 offset += file->f_pos;
54 break; 62 break;
55 } 63 }
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
105 offset += i_size_read(file->f_path.dentry->d_inode); 113 offset += i_size_read(file->f_path.dentry->d_inode);
106 break; 114 break;
107 case SEEK_CUR: 115 case SEEK_CUR:
116 if (offset == 0) {
117 retval = file->f_pos;
118 goto out;
119 }
108 offset += file->f_pos; 120 offset += file->f_pos;
109 } 121 }
110 retval = -EINVAL; 122 retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
115 } 127 }
116 retval = offset; 128 retval = offset;
117 } 129 }
130out:
118 unlock_kernel(); 131 unlock_kernel();
119 return retval; 132 return retval;
120} 133}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ed04f47007f8..55fce92cdf18 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,6 +1782,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1782 goto out_bad_inode; 1782 goto out_bad_inode;
1783 } 1783 }
1784 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); 1784 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1785 if (old_format_only(sb))
1786 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1787 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1788 else
1789 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1790 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1785 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); 1791 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1786 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); 1792 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1787 if (insert_inode_locked4(inode, args.objectid, 1793 if (insert_inode_locked4(inode, args.objectid,
@@ -1834,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1834 reiserfs_init_acl_default(inode); 1840 reiserfs_init_acl_default(inode);
1835 reiserfs_init_xattr_rwsem(inode); 1841 reiserfs_init_xattr_rwsem(inode);
1836 1842
1837 if (old_format_only(sb))
1838 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1839 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1840 else
1841 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1842 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1843
1844 /* key to search for correct place for new stat data */ 1843 /* key to search for correct place for new stat data */
1845 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), 1844 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1846 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, 1845 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..c55651f1407c 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = {
649 .release_dquot = reiserfs_release_dquot, 649 .release_dquot = reiserfs_release_dquot,
650 .mark_dirty = reiserfs_mark_dquot_dirty, 650 .mark_dirty = reiserfs_mark_dquot_dirty,
651 .write_info = reiserfs_write_info, 651 .write_info = reiserfs_write_info,
652 .alloc_dquot = dquot_alloc,
653 .destroy_dquot = dquot_destroy,
652}; 654};
653 655
654static struct quotactl_ops reiserfs_qctl_operations = { 656static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +996,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
994 if (c == 'u' || c == 'g') { 996 if (c == 'u' || c == 'g') {
995 int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; 997 int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
996 998
997 if ((sb_any_quota_enabled(s) || 999 if (sb_any_quota_loaded(s) &&
998 sb_any_quota_suspended(s)) &&
999 (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { 1000 (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
1000 reiserfs_warning(s, 1001 reiserfs_warning(s,
1001 "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); 1002 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1042,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
1041 "reiserfs_parse_options: unknown quota format specified."); 1042 "reiserfs_parse_options: unknown quota format specified.");
1042 return 0; 1043 return 0;
1043 } 1044 }
1044 if ((sb_any_quota_enabled(s) || 1045 if (sb_any_quota_loaded(s) &&
1045 sb_any_quota_suspended(s)) &&
1046 *qfmt != REISERFS_SB(s)->s_jquota_fmt) { 1046 *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
1047 reiserfs_warning(s, 1047 reiserfs_warning(s,
1048 "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); 1048 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1067,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
1067 } 1067 }
1068 /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ 1068 /* This checking is not precise wrt the quota type but for our purposes it is sufficient */
1069 if (!(*mount_options & (1 << REISERFS_QUOTA)) 1069 if (!(*mount_options & (1 << REISERFS_QUOTA))
1070 && sb_any_quota_enabled(s)) { 1070 && sb_any_quota_loaded(s)) {
1071 reiserfs_warning(s, 1071 reiserfs_warning(s,
1072 "reiserfs_parse_options: quota options must be present when quota is turned on."); 1072 "reiserfs_parse_options: quota options must be present when quota is turned on.");
1073 return 0; 1073 return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87b..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
490static struct inode * 490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino) 491romfs_iget(struct super_block *sb, unsigned long ino)
492{ 492{
493 int nextfh; 493 int nextfh, ret;
494 struct romfs_inode ri; 494 struct romfs_inode ri;
495 struct inode *i; 495 struct inode *i;
496 496
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino)
524 i->i_size = be32_to_cpu(ri.size); 524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; 525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; 526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527 i->i_uid = i->i_gid = 0;
528 527
529 /* Precalculate the data offset */ 528 /* Precalculate the data offset */
530 ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN); 529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
531 if (ino >= 0) 530 if (ret >= 0)
532 ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK); 531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
533 else 532 else
534 ino = 0; 533 ino = 0;
535 534
536 ROMFS_I(i)->i_metasize = ino; 535 ROMFS_I(i)->i_metasize = ino;
537 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); 536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..08b91beed806 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
109void poll_initwait(struct poll_wqueues *pwq) 109void poll_initwait(struct poll_wqueues *pwq)
110{ 110{
111 init_poll_funcptr(&pwq->pt, __pollwait); 111 init_poll_funcptr(&pwq->pt, __pollwait);
112 pwq->polling_task = current;
112 pwq->error = 0; 113 pwq->error = 0;
113 pwq->table = NULL; 114 pwq->table = NULL;
114 pwq->inline_index = 0; 115 pwq->inline_index = 0;
115} 116}
116
117EXPORT_SYMBOL(poll_initwait); 117EXPORT_SYMBOL(poll_initwait);
118 118
119static void free_poll_entry(struct poll_table_entry *entry) 119static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
142 free_page((unsigned long) old); 142 free_page((unsigned long) old);
143 } 143 }
144} 144}
145
146EXPORT_SYMBOL(poll_freewait); 145EXPORT_SYMBOL(poll_freewait);
147 146
148static struct poll_table_entry *poll_get_entry(poll_table *_p) 147static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
149{ 148{
150 struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
151 struct poll_table_page *table = p->table; 149 struct poll_table_page *table = p->table;
152 150
153 if (p->inline_index < N_INLINE_POLL_ENTRIES) 151 if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
159 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); 157 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
160 if (!new_table) { 158 if (!new_table) {
161 p->error = -ENOMEM; 159 p->error = -ENOMEM;
162 __set_current_state(TASK_RUNNING);
163 return NULL; 160 return NULL;
164 } 161 }
165 new_table->entry = new_table->entries; 162 new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
171 return table->entry++; 168 return table->entry++;
172} 169}
173 170
171static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
172{
173 struct poll_wqueues *pwq = wait->private;
174 DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
175
176 /*
177 * Although this function is called under waitqueue lock, LOCK
178 * doesn't imply write barrier and the users expect write
179 * barrier semantics on wakeup functions. The following
180 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
181 * and is paired with set_mb() in poll_schedule_timeout.
182 */
183 smp_wmb();
184 pwq->triggered = 1;
185
186 /*
187 * Perform the default wake up operation using a dummy
188 * waitqueue.
189 *
190 * TODO: This is hacky but there currently is no interface to
191 * pass in @sync. @sync is scheduled to be removed and once
192 * that happens, wake_up_process() can be used directly.
193 */
194 return default_wake_function(&dummy_wait, mode, sync, key);
195}
196
174/* Add a new entry */ 197/* Add a new entry */
175static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, 198static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
176 poll_table *p) 199 poll_table *p)
177{ 200{
178 struct poll_table_entry *entry = poll_get_entry(p); 201 struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
202 struct poll_table_entry *entry = poll_get_entry(pwq);
179 if (!entry) 203 if (!entry)
180 return; 204 return;
181 get_file(filp); 205 get_file(filp);
182 entry->filp = filp; 206 entry->filp = filp;
183 entry->wait_address = wait_address; 207 entry->wait_address = wait_address;
184 init_waitqueue_entry(&entry->wait, current); 208 init_waitqueue_func_entry(&entry->wait, pollwake);
209 entry->wait.private = pwq;
185 add_wait_queue(wait_address, &entry->wait); 210 add_wait_queue(wait_address, &entry->wait);
186} 211}
187 212
213int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
214 ktime_t *expires, unsigned long slack)
215{
216 int rc = -EINTR;
217
218 set_current_state(state);
219 if (!pwq->triggered)
220 rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
221 __set_current_state(TASK_RUNNING);
222
223 /*
224 * Prepare for the next iteration.
225 *
226 * The following set_mb() serves two purposes. First, it's
227 * the counterpart rmb of the wmb in pollwake() such that data
228 * written before wake up is always visible after wake up.
229 * Second, the full barrier guarantees that triggered clearing
230 * doesn't pass event check of the next iteration. Note that
231 * this problem doesn't exist for the first iteration as
232 * add_wait_queue() has full barrier semantics.
233 */
234 set_mb(pwq->triggered, 0);
235
236 return rc;
237}
238EXPORT_SYMBOL(poll_schedule_timeout);
239
188/** 240/**
189 * poll_select_set_timeout - helper function to setup the timeout value 241 * poll_select_set_timeout - helper function to setup the timeout value
190 * @to: pointer to timespec variable for the final timeout 242 * @to: pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
340 for (;;) { 392 for (;;) {
341 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 393 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
342 394
343 set_current_state(TASK_INTERRUPTIBLE);
344
345 inp = fds->in; outp = fds->out; exp = fds->ex; 395 inp = fds->in; outp = fds->out; exp = fds->ex;
346 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; 396 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
347 397
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
411 to = &expire; 461 to = &expire;
412 } 462 }
413 463
414 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 464 if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
465 to, slack))
415 timed_out = 1; 466 timed_out = 1;
416 } 467 }
417 __set_current_state(TASK_RUNNING);
418 468
419 poll_freewait(&table); 469 poll_freewait(&table);
420 470
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
666 for (;;) { 716 for (;;) {
667 struct poll_list *walk; 717 struct poll_list *walk;
668 718
669 set_current_state(TASK_INTERRUPTIBLE);
670 for (walk = list; walk != NULL; walk = walk->next) { 719 for (walk = list; walk != NULL; walk = walk->next) {
671 struct pollfd * pfd, * pfd_end; 720 struct pollfd * pfd, * pfd_end;
672 721
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
709 to = &expire; 758 to = &expire;
710 } 759 }
711 760
712 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 761 if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
713 timed_out = 1; 762 timed_out = 1;
714 } 763 }
715 __set_current_state(TASK_RUNNING);
716 return count; 764 return count;
717} 765}
718 766
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..a54b3e3f10a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/splice.h> 23#include <linux/splice.h>
24#include <linux/memcontrol.h>
24#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b76..7e12a6f82795 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
305 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
306 306
307 error = -EINVAL; 307 error = -EINVAL;
308 if (inode->i_op && inode->i_op->readlink) { 308 if (inode->i_op->readlink) {
309 error = security_inode_readlink(path.dentry); 309 error = security_inode_readlink(path.dentry);
310 if (!error) { 310 if (!error) {
311 touch_atime(path.mnt, path.dentry); 311 touch_atime(path.mnt, path.dentry);
diff --git a/fs/super.c b/fs/super.c
index d5fd4498548a..ed080c417167 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
38#include <linux/kobject.h> 38#include <linux/kobject.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/file.h> 40#include <linux/file.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include "internal.h" 43#include "internal.h"
43 44
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_HLIST_HEAD(&s->s_anon); 72 INIT_HLIST_HEAD(&s->s_anon);
72 INIT_LIST_HEAD(&s->s_inodes); 73 INIT_LIST_HEAD(&s->s_inodes);
73 INIT_LIST_HEAD(&s->s_dentry_lru); 74 INIT_LIST_HEAD(&s->s_dentry_lru);
75 INIT_LIST_HEAD(&s->s_async_list);
74 init_rwsem(&s->s_umount); 76 init_rwsem(&s->s_umount);
75 mutex_init(&s->s_lock); 77 mutex_init(&s->s_lock);
76 lockdep_set_class(&s->s_umount, &type->s_umount_key); 78 lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
289{ 291{
290 const struct super_operations *sop = sb->s_op; 292 const struct super_operations *sop = sb->s_op;
291 293
294
292 if (sb->s_root) { 295 if (sb->s_root) {
293 shrink_dcache_for_umount(sb); 296 shrink_dcache_for_umount(sb);
294 fsync_super(sb); 297 fsync_super(sb);
295 lock_super(sb); 298 lock_super(sb);
296 sb->s_flags &= ~MS_ACTIVE; 299 sb->s_flags &= ~MS_ACTIVE;
300
301 /*
302 * wait for asynchronous fs operations to finish before going further
303 */
304 async_synchronize_full_special(&sb->s_async_list);
305
297 /* bad name - it should be evict_inodes() */ 306 /* bad name - it should be evict_inodes() */
298 invalidate_inodes(sb); 307 invalidate_inodes(sb);
299 lock_kernel(); 308 lock_kernel();
@@ -461,6 +470,7 @@ restart:
461 sb->s_count++; 470 sb->s_count++;
462 spin_unlock(&sb_lock); 471 spin_unlock(&sb_lock);
463 down_read(&sb->s_umount); 472 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list);
464 if (sb->s_root && (wait || sb->s_dirt)) 474 if (sb->s_root && (wait || sb->s_dirt))
465 sb->s_op->sync_fs(sb, wait); 475 sb->s_op->sync_fs(sb, wait);
466 up_read(&sb->s_umount); 476 up_read(&sb->s_umount);
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416f..ac02b56548bc 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
75 return ret; 75 return ret;
76} 76}
77 77
78long do_fsync(struct file *file, int datasync) 78/**
79 * vfs_fsync - perform a fsync or fdatasync on a file
80 * @file: file to sync
81 * @dentry: dentry of @file
82 * @data: only perform a fdatasync operation
83 *
84 * Write back data and metadata for @file to disk. If @datasync is
85 * set only metadata needed to access modified file data is written.
86 *
87 * In case this function is called from nfsd @file may be %NULL and
88 * only @dentry is set. This can only happen when the filesystem
89 * implements the export_operations API.
90 */
91int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
79{ 92{
80 int ret; 93 const struct file_operations *fop;
81 int err; 94 struct address_space *mapping;
82 struct address_space *mapping = file->f_mapping; 95 int err, ret;
96
97 /*
98 * Get mapping and operations from the file in case we have
99 * as file, or get the default values for them in case we
100 * don't have a struct file available. Damn nfsd..
101 */
102 if (file) {
103 mapping = file->f_mapping;
104 fop = file->f_op;
105 } else {
106 mapping = dentry->d_inode->i_mapping;
107 fop = dentry->d_inode->i_fop;
108 }
83 109
84 if (!file->f_op || !file->f_op->fsync) { 110 if (!fop || !fop->fsync) {
85 /* Why? We can still call filemap_fdatawrite */
86 ret = -EINVAL; 111 ret = -EINVAL;
87 goto out; 112 goto out;
88 } 113 }
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
94 * livelocks in fsync_buffers_list(). 119 * livelocks in fsync_buffers_list().
95 */ 120 */
96 mutex_lock(&mapping->host->i_mutex); 121 mutex_lock(&mapping->host->i_mutex);
97 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 122 err = fop->fsync(file, dentry, datasync);
98 if (!ret) 123 if (!ret)
99 ret = err; 124 ret = err;
100 mutex_unlock(&mapping->host->i_mutex); 125 mutex_unlock(&mapping->host->i_mutex);
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync)
104out: 129out:
105 return ret; 130 return ret;
106} 131}
132EXPORT_SYMBOL(vfs_fsync);
107 133
108static long __do_fsync(unsigned int fd, int datasync) 134static int do_fsync(unsigned int fd, int datasync)
109{ 135{
110 struct file *file; 136 struct file *file;
111 int ret = -EBADF; 137 int ret = -EBADF;
112 138
113 file = fget(fd); 139 file = fget(fd);
114 if (file) { 140 if (file) {
115 ret = do_fsync(file, datasync); 141 ret = vfs_fsync(file, file->f_path.dentry, datasync);
116 fput(file); 142 fput(file);
117 } 143 }
118 return ret; 144 return ret;
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync)
120 146
121asmlinkage long sys_fsync(unsigned int fd) 147asmlinkage long sys_fsync(unsigned int fd)
122{ 148{
123 return __do_fsync(fd, 0); 149 return do_fsync(fd, 0);
124} 150}
125 151
126asmlinkage long sys_fdatasync(unsigned int fd) 152asmlinkage long sys_fdatasync(unsigned int fd)
127{ 153{
128 return __do_fsync(fd, 1); 154 return do_fsync(fd, 1);
129} 155}
130 156
131/* 157/*
@@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
269 295
270 if (flags & SYNC_FILE_RANGE_WRITE) { 296 if (flags & SYNC_FILE_RANGE_WRITE) {
271 ret = __filemap_fdatawrite_range(mapping, offset, endbyte, 297 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
272 WB_SYNC_NONE); 298 WB_SYNC_ALL);
273 if (ret < 0) 299 if (ret < 0)
274 goto out; 300 goto out;
275 } 301 }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f856..dfa3d94cfc74 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
107static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 107static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
108{ 108{
109 inode->i_mode = mode; 109 inode->i_mode = mode;
110 inode->i_uid = 0;
111 inode->i_gid = 0;
112 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 110 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
113} 111}
114 112
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
149{ 147{
150 struct bin_attribute *bin_attr; 148 struct bin_attribute *bin_attr;
151 149
152 inode->i_blocks = 0;
153 inode->i_mapping->a_ops = &sysfs_aops; 150 inode->i_mapping->a_ops = &sysfs_aops;
154 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 151 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
155 inode->i_op = &sysfs_inode_operations; 152 inode->i_op = &sysfs_inode_operations;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5bf..e35b54d5059d 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
40 depends on UBIFS_FS 40 depends on UBIFS_FS
41 default y 41 default y
42 help 42 help
43 Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. 43 Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 0e5e54d82924..175f9c590b77 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c)
142 * 142 *
143 * This function is called when an operation cannot be budgeted because there 143 * This function is called when an operation cannot be budgeted because there
144 * is supposedly no free space. But in most cases there is some free space: 144 * is supposedly no free space. But in most cases there is some free space:
145 * o budgeting is pessimistic, so it always budgets more then it is actually 145 * o budgeting is pessimistic, so it always budgets more than it is actually
146 * needed, so shrinking the liability is one way to make free space - the 146 * needed, so shrinking the liability is one way to make free space - the
147 * cached data will take less space then it was budgeted for; 147 * cached data will take less space then it was budgeted for;
148 * o GC may turn some dark space into free space (budgeting treats dark space 148 * o GC may turn some dark space into free space (budgeting treats dark space
@@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
606 * @c: UBIFS file-system description object 606 * @c: UBIFS file-system description object
607 * 607 *
608 * This function converts budget which was allocated for a new page of data to 608 * This function converts budget which was allocated for a new page of data to
609 * the budget of changing an existing page of data. The latter is smaller then 609 * the budget of changing an existing page of data. The latter is smaller than
610 * the former, so this function only does simple re-calculation and does not 610 * the former, so this function only does simple re-calculation and does not
611 * involve any write-back. 611 * involve any write-back.
612 */ 612 */
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58a..9832f9abe28e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -45,7 +45,7 @@
45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ 45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
46 46
47/* 47/*
48 * GC may need to move more then one LEB to make progress. The below constants 48 * GC may need to move more than one LEB to make progress. The below constants
49 * define "soft" and "hard" limits on the number of LEBs the garbage collector 49 * define "soft" and "hard" limits on the number of LEBs the garbage collector
50 * may move. 50 * may move.
51 */ 51 */
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 10ae25b7d1db..9b7c54e0cd2a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
191 if (wbuf->lnum != -1 && avail >= len) { 191 if (wbuf->lnum != -1 && avail >= len) {
192 /* 192 /*
193 * Someone else has switched the journal head and we have 193 * Someone else has switched the journal head and we have
194 * enough space now. This happens when more then one process is 194 * enough space now. This happens when more than one process is
195 * trying to write to the same journal head at the same time. 195 * trying to write to the same journal head at the same time.
196 */ 196 */
197 dbg_jnl("return LEB %d back, already have LEB %d:%d", 197 dbg_jnl("return LEB %d back, already have LEB %d:%d",
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a2..e7bab52a1410 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
151 * @contention: if any contention, this is set to %1 151 * @contention: if any contention, this is set to %1
152 * 152 *
153 * This function walks the list of mounted UBIFS file-systems and frees clean 153 * This function walks the list of mounted UBIFS file-systems and frees clean
154 * znodes which are older then @age, until at least @nr znodes are freed. 154 * znodes which are older than @age, until at least @nr znodes are freed.
155 * Returns the number of freed znodes. 155 * Returns the number of freed znodes.
156 */ 156 */
157static int shrink_tnc_trees(int nr, int age, int *contention) 157static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0d7564b95f8e..89556ee72518 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
432 int i, err; 432 int i, err;
433 struct ubifs_info *c = sb->s_fs_info; 433 struct ubifs_info *c = sb->s_fs_info;
434 struct writeback_control wbc = { 434 struct writeback_control wbc = {
435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, 435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
436 .range_start = 0, 436 .range_start = 0,
437 .range_end = LLONG_MAX, 437 .range_end = LLONG_MAX,
438 .nr_to_write = LONG_MAX, 438 .nr_to_write = LONG_MAX,
439 }; 439 };
440 440
441 /*
442 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
443 * advisory thing to help the file system shove lots of data into the
444 * queues. If some gets missed then it'll be picked up on the second
445 * '->sync_fs()' call, with non-zero @wait.
446 */
447
441 if (sb->s_flags & MS_RDONLY) 448 if (sb->s_flags & MS_RDONLY)
442 return 0; 449 return 0;
443 450
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e66531..237804cd6b56 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
175 if (error) 175 if (error)
176 return error; 176 return error;
177 error = -EOPNOTSUPP; 177 error = -EOPNOTSUPP;
178 if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { 178 if (d->d_inode->i_op->listxattr) {
179 error = d->d_inode->i_op->listxattr(d, list, size); 179 error = d->d_inode->i_op->listxattr(d, list, size);
180 } else { 180 } else {
181 error = security_inode_listsecurity(d->d_inode, list, size); 181 error = security_inode_listsecurity(d->d_inode, list, size);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 36f6cc703ef2..be846d606ae8 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1348,7 +1348,7 @@ xfs_finish_flags(
1348{ 1348{
1349 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); 1349 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
1350 1350
1351 /* Fail a mount where the logbuf is smaller then the log stripe */ 1351 /* Fail a mount where the logbuf is smaller than the log stripe */
1352 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1352 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1353 if (mp->m_logbsize <= 0 && 1353 if (mp->m_logbsize <= 0 &&
1354 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { 1354 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {